From 284e2d13e34b55f3e62085d04c072a746bf9fc9f Mon Sep 17 00:00:00 2001
From: Rob Parolin <rparolin@nvidia.com>
Date: Wed, 13 May 2026 13:51:47 -0700
Subject: [PATCH 01/17] Add Array and ArrayFormat to cuda.core (refs #467)

Introduce a Pythonic wrapper around CUarray as a prerequisite for
TextureObject / SurfaceObject support. This initial slice covers plain
1D/2D/3D allocations via cuArrayCreate / cuArray3DCreate, with an opt-in
surface_load_store flag for binding as a SurfaceObject. Layered, cubemap,
sparse, and texture-gather variants are intentionally deferred.

_from_handle is provided for graphics-interop borrowing and queries shape,
format, and channel count from the driver.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 cuda_core/cuda/core/__init__.py |   1 +
 cuda_core/cuda/core/_array.pxd  |  20 +++
 cuda_core/cuda/core/_array.pyx  | 238 ++++++++++++++++++++++++++++++++
 3 files changed, 259 insertions(+)
 create mode 100644 cuda_core/cuda/core/_array.pxd
 create mode 100644 cuda_core/cuda/core/_array.pyx

diff --git a/cuda_core/cuda/core/__init__.py b/cuda_core/cuda/core/__init__.py
index f2d7c85b62e..17395a5fb83 100644
--- a/cuda_core/cuda/core/__init__.py
+++ b/cuda_core/cuda/core/__init__.py
@@ -78,6 +78,7 @@ class _PatchedProperty(metaclass=_PatchedPropMeta):
     WorkqueueResource,
     WorkqueueResourceOptions,
 )
+from cuda.core._array import Array, ArrayFormat
 from cuda.core._event import Event, EventOptions
 from cuda.core._graphics import GraphicsResource
 from cuda.core._launch_config import LaunchConfig
diff --git a/cuda_core/cuda/core/_array.pxd b/cuda_core/cuda/core/_array.pxd
new file mode 100644
index 00000000000..9b6e1dad5bd
--- /dev/null
+++ b/cuda_core/cuda/core/_array.pxd
@@ -0,0 +1,20 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from libc.stdint cimport intptr_t
+from cuda.bindings cimport cydriver
+
+
+cdef class Array:
+
+    cdef:
+        cydriver.CUarray _handle
+        tuple _shape                 # (w,), (w, h), or (w, h, d)
+        int _format                  # CUarray_format value
+        unsigned int _num_channels   # 1, 2, or 4
+        int _device_id
+        intptr_t _context
+        bint _owning
+
+    cpdef close(self)
diff --git a/cuda_core/cuda/core/_array.pyx b/cuda_core/cuda/core/_array.pyx
new file mode 100644
index 00000000000..86d599f48b5
--- /dev/null
+++ b/cuda_core/cuda/core/_array.pyx
@@ -0,0 +1,238 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from __future__ import annotations
+
+from libc.stdint cimport intptr_t
+from libc.string cimport memset
+
+from cuda.bindings cimport cydriver
+from cuda.core._utils.cuda_utils cimport HANDLE_RETURN
+
+import enum
+
+
+class ArrayFormat(enum.IntEnum):
+    """Element format for a :class:`Array` allocation.
+
+    Mirrors ``CUarray_format`` from the CUDA driver API.
+    """
+    UINT8   = cydriver.CU_AD_FORMAT_UNSIGNED_INT8
+    UINT16  = cydriver.CU_AD_FORMAT_UNSIGNED_INT16
+    UINT32  = cydriver.CU_AD_FORMAT_UNSIGNED_INT32
+    INT8    = cydriver.CU_AD_FORMAT_SIGNED_INT8
+    INT16   = cydriver.CU_AD_FORMAT_SIGNED_INT16
+    INT32   = cydriver.CU_AD_FORMAT_SIGNED_INT32
+    FLOAT16 = cydriver.CU_AD_FORMAT_HALF
+    FLOAT32 = cydriver.CU_AD_FORMAT_FLOAT
+
+
+# Bytes per element (single channel) for each format.
+_FORMAT_ELEM_SIZE = {
+    int(ArrayFormat.UINT8):   1,
+    int(ArrayFormat.INT8):    1,
+    int(ArrayFormat.UINT16):  2,
+    int(ArrayFormat.INT16):   2,
+    int(ArrayFormat.FLOAT16): 2,
+    int(ArrayFormat.UINT32):  4,
+    int(ArrayFormat.INT32):   4,
+    int(ArrayFormat.FLOAT32): 4,
+}
+
+
+cdef inline intptr_t _get_current_context_ptr() except? 0:
+    cdef cydriver.CUcontext ctx
+    with nogil:
+        HANDLE_RETURN(cydriver.cuCtxGetCurrent(&ctx))
+    if ctx == NULL:
+        raise RuntimeError("Array allocation requires an active CUDA context")
+    return <intptr_t>ctx
+
+
+cdef inline int _get_current_device_id() except -1:
+    cdef cydriver.CUdevice dev
+    with nogil:
+        HANDLE_RETURN(cydriver.cuCtxGetDevice(&dev))
+    return <int>dev
+
+
+cdef class Array:
+    """An opaque, hardware-laid-out GPU allocation for texture/surface access.
+
+    Distinct from :class:`Buffer`: a ``CUarray`` has no exposed device pointer
+    and can only be accessed from kernels through a :class:`TextureObject` or
+    :class:`SurfaceObject`. Its memory layout is chosen by the driver for 2D/3D
+    spatial locality.
+
+    Construct via :meth:`from_descriptor`. Only plain 1D/2D/3D allocations are
+    supported in this initial version; layered/cubemap/sparse variants will
+    follow once their shape semantics are settled.
+    """
+
+    def __init__(self, *args, **kwargs):
+        raise RuntimeError(
+            "Array cannot be instantiated directly. Use Array.from_descriptor()."
+        )
+
+    @classmethod
+    def from_descriptor(cls, *, shape, format, num_channels, surface_load_store=False):
+        """Allocate a new CUDA array.
+
+        Parameters
+        ----------
+        shape : tuple of int
+            ``(width,)``, ``(width, height)``, or ``(width, height, depth)``
+            in elements.
+        format : ArrayFormat
+            Element format.
+        num_channels : int
+            Channels per element. Must be 1, 2, or 4.
+        surface_load_store : bool
+            If True, allocate with ``CUDA_ARRAY3D_SURFACE_LDST`` so the array
+            can be bound as a :class:`SurfaceObject` for kernel-side writes.
+            Default False.
+
+        Returns
+        -------
+        Array
+        """
+        if not isinstance(format, ArrayFormat):
+            raise TypeError(f"format must be an ArrayFormat, got {type(format)}")
+        if num_channels not in (1, 2, 4):
+            raise ValueError(f"num_channels must be 1, 2, or 4, got {num_channels}")
+
+        try:
+            shape_t = tuple(int(s) for s in shape)
+        except TypeError as e:
+            raise TypeError(f"shape must be a tuple of ints, got {type(shape)}") from e
+        if not 1 <= len(shape_t) <= 3:
+            raise ValueError(f"shape rank must be 1, 2, or 3, got {len(shape_t)}")
+        for i, dim in enumerate(shape_t):
+            if dim < 1:
+                raise ValueError(f"shape[{i}] must be >= 1, got {dim}")
+
+        cdef Array self = cls.__new__(cls)
+        self._owning = True
+        self._shape = shape_t
+        self._format = int(format)
+        self._num_channels = num_channels
+        self._context = _get_current_context_ptr()
+        self._device_id = _get_current_device_id()
+
+        cdef cydriver.CUarray_format c_format = <cydriver.CUarray_format><int>format
+        cdef cydriver.CUDA_ARRAY3D_DESCRIPTOR desc3d
+        cdef cydriver.CUDA_ARRAY_DESCRIPTOR desc2d
+        cdef int rank = len(shape_t)
+        cdef unsigned int flags = (
+            cydriver.CUDA_ARRAY3D_SURFACE_LDST if surface_load_store else 0
+        )
+
+        # cuArrayCreate (2D path) does not accept flags; use the 3D descriptor
+        # whenever any flag is set or shape is 3D.
+        if rank == 3 or flags != 0:
+            memset(&desc3d, 0, sizeof(desc3d))
+            desc3d.Width = <size_t>shape_t[0]
+            desc3d.Height = <size_t>(shape_t[1] if rank >= 2 else 0)
+            desc3d.Depth = <size_t>(shape_t[2] if rank >= 3 else 0)
+            desc3d.Format = c_format
+            desc3d.NumChannels = <unsigned int>num_channels
+            desc3d.Flags = flags
+            with nogil:
+                HANDLE_RETURN(cydriver.cuArray3DCreate(&self._handle, &desc3d))
+        else:
+            memset(&desc2d, 0, sizeof(desc2d))
+            desc2d.Width = <size_t>shape_t[0]
+            desc2d.Height = <size_t>(shape_t[1] if rank == 2 else 0)
+            desc2d.Format = c_format
+            desc2d.NumChannels = <unsigned int>num_channels
+            with nogil:
+                HANDLE_RETURN(cydriver.cuArrayCreate(&self._handle, &desc2d))
+
+        return self
+
+    @classmethod
+    def _from_handle(cls, intptr_t handle, bint owning, *, device_id=None):
+        """Wrap an externally-allocated ``CUarray``.
+
+        Intended for graphics interop (``cuGraphicsSubResourceGetMappedArray``)
+        where the array is owned by the graphics API. With ``owning=False``,
+        :meth:`close` and ``__dealloc__`` will not free the handle. Shape,
+        format, and channel count are queried from the driver.
+        """
+        cdef Array self = cls.__new__(cls)
+        self._handle = <cydriver.CUarray><void*>handle
+        self._owning = owning
+        self._context = _get_current_context_ptr()
+        self._device_id = _get_current_device_id() if device_id is None else int(device_id)
+
+        cdef cydriver.CUDA_ARRAY3D_DESCRIPTOR desc
+        with nogil:
+            HANDLE_RETURN(cydriver.cuArray3DGetDescriptor(&desc, self._handle))
+
+        if desc.Depth > 0:
+            self._shape = (int(desc.Width), int(desc.Height), int(desc.Depth))
+        elif desc.Height > 0:
+            self._shape = (int(desc.Width), int(desc.Height))
+        else:
+            self._shape = (int(desc.Width),)
+        self._format = <int>desc.Format
+        self._num_channels = desc.NumChannels
+        return self
+
+    @property
+    def handle(self):
+        """The underlying ``CUarray`` as an integer."""
+        return <intptr_t>self._handle
+
+    @property
+    def shape(self):
+        """Allocation shape, in elements."""
+        return self._shape
+
+    @property
+    def format(self):
+        """The element :class:`ArrayFormat`."""
+        return ArrayFormat(self._format)
+
+    @property
+    def num_channels(self):
+        """Channels per element (1, 2, or 4)."""
+        return self._num_channels
+
+    @property
+    def element_size(self):
+        """Bytes per element (format size * channels)."""
+        return _FORMAT_ELEM_SIZE[self._format] * self._num_channels
+
+    @property
+    def device(self):
+        """The :class:`Device` this array was allocated on."""
+        from cuda.core._device import Device
+        return Device(self._device_id)
+
+    cpdef close(self):
+        """Destroy the underlying ``CUarray`` if owned by this object."""
+        if self._handle != NULL and self._owning:
+            HANDLE_RETURN(cydriver.cuArrayDestroy(self._handle))
+        self._handle = NULL
+
+    def __dealloc__(self):
+        # Cython destructors cannot raise; any cuArrayDestroy error here is
+        # silently dropped. Callers needing visibility should use close().
+        if self._handle != NULL and self._owning:
+            cydriver.cuArrayDestroy(self._handle)
+            self._handle = NULL
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc, tb):
+        self.close()
+
+    def __repr__(self):
+        try:
+            fmt = ArrayFormat(self._format).name
+        except ValueError:  # driver format from _from_handle not in ArrayFormat
+            fmt = self._format
+        return f"Array(shape={self._shape}, format={fmt}, num_channels={self._num_channels})"

From a6798856c5720cd70114bcbc9714ed6c45852bee Mon Sep 17 00:00:00 2001
From: Rob Parolin <rparolin@nvidia.com>
Date: Wed, 13 May 2026 14:08:22 -0700
Subject: [PATCH 02/17] Add copy_from / copy_to to cuda.core.Array (refs #467)

Full-array async copies between an Array and either a Buffer or any
buffer-protocol host object (numpy, bytes, bytearray, array.array).
Implemented as a single cuMemcpy3DAsync path so 1D/2D/3D arrays share
one code path.

Also exposes a size_bytes property used to size matching host or device
buffers.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 cuda_core/cuda/core/_array.pyx | 170 +++++++++++++++++++++++++++++++++
 1 file changed, 170 insertions(+)

diff --git a/cuda_core/cuda/core/_array.pyx b/cuda_core/cuda/core/_array.pyx
index 86d599f48b5..3d1373b3023 100644
--- a/cuda_core/cuda/core/_array.pyx
+++ b/cuda_core/cuda/core/_array.pyx
@@ -4,10 +4,13 @@
 
 from __future__ import annotations
 
+cimport cpython
 from libc.stdint cimport intptr_t
 from libc.string cimport memset
 
 from cuda.bindings cimport cydriver
+from cuda.core._memory._buffer cimport Buffer
+from cuda.core._stream cimport Stream
 from cuda.core._utils.cuda_utils cimport HANDLE_RETURN
 
 import enum
@@ -57,6 +60,128 @@ cdef inline int _get_current_device_id() except -1:
     return <int>dev
 
 
+cdef void _fill_array_endpoint(
+    cydriver.CUDA_MEMCPY3D* p, Array arr, bint is_src
+) noexcept:
+    """Populate the src or dst array fields of a CUDA_MEMCPY3D struct."""
+    if is_src:
+        p.srcMemoryType = cydriver.CU_MEMORYTYPE_ARRAY
+        p.srcArray = arr._handle
+        p.srcXInBytes = 0
+        p.srcY = 0
+        p.srcZ = 0
+    else:
+        p.dstMemoryType = cydriver.CU_MEMORYTYPE_ARRAY
+        p.dstArray = arr._handle
+        p.dstXInBytes = 0
+        p.dstY = 0
+        p.dstZ = 0
+
+
+cdef int _fill_linear_endpoint(
+    cydriver.CUDA_MEMCPY3D* p,
+    object obj,
+    bint is_src,
+    size_t width_bytes,
+    size_t height,
+    cpython.Py_buffer* pybuf_out,
+) except -1:
+    """Populate the src or dst linear fields from a Buffer or host buffer.
+    Returns 1 if pybuf_out was filled (caller must release it), else 0;
+    raises TypeError if ``obj`` cannot export a suitable simple buffer."""
+    cdef intptr_t ptr
+    if isinstance(obj, Buffer):
+        ptr = int((<Buffer>obj).handle)
+        if is_src:
+            p.srcMemoryType = cydriver.CU_MEMORYTYPE_DEVICE
+            p.srcDevice = <cydriver.CUdeviceptr>ptr
+            p.srcPitch = width_bytes
+            p.srcHeight = height
+            p.srcXInBytes = 0
+            p.srcY = 0
+            p.srcZ = 0
+        else:
+            p.dstMemoryType = cydriver.CU_MEMORYTYPE_DEVICE
+            p.dstDevice = <cydriver.CUdeviceptr>ptr
+            p.dstPitch = width_bytes
+            p.dstHeight = height
+            p.dstXInBytes = 0
+            p.dstY = 0
+            p.dstZ = 0
+        return 0
+
+    # Anything else is a host buffer; a destination must also be writable.
+    cdef int flags = cpython.PyBUF_SIMPLE | (0 if is_src else cpython.PyBUF_WRITABLE)
+    try:
+        # Cython declares this ``except -1``: failure raises, not returns.
+        cpython.PyObject_GetBuffer(obj, pybuf_out, flags)
+    except (TypeError, ValueError, BufferError) as e:
+        raise TypeError(
+            f"Source/destination must be a Buffer or a contiguous "
+            f"buffer-protocol object, got {type(obj).__name__}"
+        ) from e
+    if is_src:
+        p.srcMemoryType = cydriver.CU_MEMORYTYPE_HOST
+        p.srcHost = pybuf_out.buf
+        p.srcPitch = width_bytes
+        p.srcHeight = height
+        p.srcXInBytes = 0
+        p.srcY = 0
+        p.srcZ = 0
+    else:
+        p.dstMemoryType = cydriver.CU_MEMORYTYPE_HOST
+        p.dstHost = pybuf_out.buf
+        p.dstPitch = width_bytes
+        p.dstHeight = height
+        p.dstXInBytes = 0
+        p.dstY = 0
+        p.dstZ = 0
+    return 1
+
+
+cdef _copy3d(Array arr, object other, object stream, bint to_array):
+    """Issue a full-array async 3D memcpy between ``arr`` and ``other``.
+
+    Direction is determined by ``to_array``: True copies *into* arr, False
+    copies *out of* arr. Raises ValueError if a host buffer is too small.
+    """
+    cdef cydriver.CUDA_MEMCPY3D params
+    cdef cpython.Py_buffer pybuf
+    cdef int got_buffer = 0
+    cdef intptr_t stream_handle
+    cdef cydriver.CUstream c_stream
+
+    if not isinstance(stream, Stream):
+        raise TypeError(f"stream must be a Stream, got {type(stream).__name__}")
+
+    memset(&params, 0, sizeof(params))
+    width_bytes, height, depth = arr._extent_bytes()
+    params.WidthInBytes = <size_t>width_bytes
+    params.Height = <size_t>height
+    params.Depth = <size_t>depth
+
+    try:
+        got_buffer = _fill_linear_endpoint(
+            &params, other, to_array, width_bytes, height, &pybuf
+        )
+        _fill_array_endpoint(&params, arr, not to_array)
+        # The driver trusts these extents; bound-check host memory ourselves.
+        required = width_bytes * height * depth
+        if got_buffer and pybuf.len < required:
+            raise ValueError(
+                f"host buffer has {pybuf.len} bytes, copy needs {required}"
+            )
+
+        stream_handle = int((<Stream>stream).handle)
+        c_stream = <cydriver.CUstream><void*>stream_handle
+        with nogil:
+            HANDLE_RETURN(cydriver.cuMemcpy3DAsync(&params, c_stream))
+    finally:
+        if got_buffer:
+            cpython.PyBuffer_Release(&pybuf)
+
+
 cdef class Array:
     """An opaque, hardware-laid-out GPU allocation for texture/surface access.
 
@@ -211,6 +336,51 @@ cdef class Array:
         from cuda.core._device import Device
         return Device(self._device_id)
 
+    def _extent_bytes(self):
+        """Return (width_bytes, height, depth) for cuMemcpy3D, with height/depth
+        normalized to >=1 for lower-rank arrays."""
+        cdef int rank = len(self._shape)
+        cdef size_t w = <size_t>self._shape[0] * <size_t>(
+            _FORMAT_ELEM_SIZE[self._format] * self._num_channels
+        )
+        cdef size_t h = <size_t>(self._shape[1] if rank >= 2 else 1)
+        cdef size_t d = <size_t>(self._shape[2] if rank >= 3 else 1)
+        return w, h, d
+
+    def copy_from(self, src, *, stream):
+        """Copy a full-array's worth of data into this array.
+
+        Parameters
+        ----------
+        src : Buffer or buffer-protocol object
+            Source data. Must contain at least ``self.size_bytes`` bytes
+            of contiguous data.
+        stream : Stream
+            Stream to issue the copy on.
+        """
+        _copy3d(self, src, stream, to_array=True)
+
+    def copy_to(self, dst, *, stream):
+        """Copy a full-array's worth of data out of this array.
+
+        Parameters
+        ----------
+        dst : Buffer or writable buffer-protocol object
+            Destination. Must have at least ``self.size_bytes`` bytes of
+            writable, contiguous space.
+        stream : Stream
+            Stream to issue the copy on.
+        """
+        _copy3d(self, dst, stream, to_array=False)
+
+    @property
+    def size_bytes(self):
+        """Bytes of tightly packed element data (``prod(shape) * element_size``)."""
+        cdef size_t n = 1
+        for s in self._shape:
+            n *= <size_t>s
+        return n * <size_t>(_FORMAT_ELEM_SIZE[self._format] * self._num_channels)
+
     cpdef close(self):
         """Destroy the underlying ``CUarray`` if owned by this object."""
         if self._handle != NULL and self._owning:

From a4a5d5e95e2f6c6d3c3346ae8196f6d4a46593ee Mon Sep 17 00:00:00 2001
From: Rob Parolin <rparolin@nvidia.com>
Date: Wed, 13 May 2026 14:29:51 -0700
Subject: [PATCH 03/17] Add TextureObject and
 ResourceDescriptor/TextureDescriptor (refs #467)

Wraps cuTexObjectCreate with a Pythonic descriptor pair:

- ResourceDescriptor.from_array(array) is the only resource kind supported
  in this initial slice; from_linear and from_pitch2d will follow once
  Buffer carries format/channel metadata.
- TextureDescriptor mirrors CUDA_TEXTURE_DESC: per-axis AddressMode,
  FilterMode, ReadMode, normalized coords, sRGB, border color, mipmap
  params, anisotropy.
- TextureObject holds a strong ref to the ResourceDescriptor (and
  transitively the backing Array) for the lifetime of the handle to
  prevent dangling-pointer kernel launches.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 cuda_core/cuda/core/__init__.py  |   8 +
 cuda_core/cuda/core/_texture.pxd |  18 ++
 cuda_core/cuda/core/_texture.pyx | 338 +++++++++++++++++++++++++++++++
 3 files changed, 364 insertions(+)
 create mode 100644 cuda_core/cuda/core/_texture.pxd
 create mode 100644 cuda_core/cuda/core/_texture.pyx

diff --git a/cuda_core/cuda/core/__init__.py b/cuda_core/cuda/core/__init__.py
index 17395a5fb83..0f84d79de53 100644
--- a/cuda_core/cuda/core/__init__.py
+++ b/cuda_core/cuda/core/__init__.py
@@ -79,6 +79,14 @@ class _PatchedProperty(metaclass=_PatchedPropMeta):
     WorkqueueResourceOptions,
 )
 from cuda.core._array import Array, ArrayFormat
+from cuda.core._texture import (
+    AddressMode,
+    FilterMode,
+    ReadMode,
+    ResourceDescriptor,
+    TextureDescriptor,
+    TextureObject,
+)
 from cuda.core._event import Event, EventOptions
 from cuda.core._graphics import GraphicsResource
 from cuda.core._launch_config import LaunchConfig
diff --git a/cuda_core/cuda/core/_texture.pxd b/cuda_core/cuda/core/_texture.pxd
new file mode 100644
index 00000000000..4d2d5004069
--- /dev/null
+++ b/cuda_core/cuda/core/_texture.pxd
@@ -0,0 +1,18 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from libc.stdint cimport intptr_t
+from cuda.bindings cimport cydriver
+
+
+cdef class TextureObject:
+
+    cdef:
+        cydriver.CUtexObject _handle
+        object _source_ref      # keep backing Array (or other resource) alive
+        object _texture_desc    # original TextureDescriptor for introspection
+        int _device_id
+        intptr_t _context
+
+    cpdef close(self)
diff --git a/cuda_core/cuda/core/_texture.pyx b/cuda_core/cuda/core/_texture.pyx
new file mode 100644
index 00000000000..7f869c47782
--- /dev/null
+++ b/cuda_core/cuda/core/_texture.pyx
@@ -0,0 +1,338 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from __future__ import annotations
+
+from libc.stdint cimport intptr_t
+from libc.string cimport memset
+
+from cuda.bindings cimport cydriver
+from cuda.core._array cimport Array
+from cuda.core._utils.cuda_utils cimport HANDLE_RETURN
+
+import enum
+from dataclasses import dataclass, field
+
+
+# Driver texture-descriptor flag bits (CU_TRSF_*).
+_TRSF_READ_AS_INTEGER = 0x01
+_TRSF_NORMALIZED_COORDINATES = 0x02
+_TRSF_SRGB = 0x10
+_TRSF_DISABLE_TRILINEAR_OPTIMIZATION = 0x20
+_TRSF_SEAMLESS_CUBEMAP = 0x40
+
+
+class AddressMode(enum.IntEnum):
+    """Boundary behavior for out-of-range texture coordinates."""
+    WRAP   = cydriver.CU_TR_ADDRESS_MODE_WRAP
+    CLAMP  = cydriver.CU_TR_ADDRESS_MODE_CLAMP
+    MIRROR = cydriver.CU_TR_ADDRESS_MODE_MIRROR
+    BORDER = cydriver.CU_TR_ADDRESS_MODE_BORDER
+
+
+class FilterMode(enum.IntEnum):
+    """Texel sampling mode."""
+    POINT  = cydriver.CU_TR_FILTER_MODE_POINT
+    LINEAR = cydriver.CU_TR_FILTER_MODE_LINEAR
+
+
+class ReadMode(enum.IntEnum):
+    """How sampled values are returned to the kernel.
+
+    - ``ELEMENT_TYPE``: return the raw element value (integer formats stay
+      integer, float stays float).
+    - ``NORMALIZED_FLOAT``: integer formats are promoted to a normalized
+      ``float`` in ``[0, 1]`` (unsigned) or ``[-1, 1]`` (signed).
+      Float formats are unaffected.
+    """
+    ELEMENT_TYPE     = 0
+    NORMALIZED_FLOAT = 1
+
+
+class ResourceDescriptor:
+    """Describes the memory backing a :class:`TextureObject`.
+
+    Construct via the ``from_*`` classmethods. Only the ``from_array`` path is
+    implemented in this initial version; ``from_linear`` and ``from_pitch2d``
+    will follow once their metadata story (format/channel count on
+    :class:`Buffer`) is settled.
+    """
+
+    __slots__ = ("_kind", "_source")
+
+    def __init__(self):
+        raise RuntimeError(
+            "ResourceDescriptor cannot be instantiated directly. "
+            "Use ResourceDescriptor.from_* factories."
+        )
+
+    @classmethod
+    def from_array(cls, array):
+        """Build a resource descriptor backed by a :class:`Array`."""
+        if not isinstance(array, Array):
+            raise TypeError(f"array must be an Array, got {type(array).__name__}")
+        self = cls.__new__(cls)
+        self._kind = "array"
+        self._source = array
+        return self
+
+    @property
+    def kind(self):
+        return self._kind
+
+    @property
+    def source(self):
+        return self._source
+
+    def __repr__(self):
+        return f"ResourceDescriptor(kind={self._kind!r})"
+
+
+@dataclass
+class TextureDescriptor:
+    """Sampling state for a :class:`TextureObject` (mirrors ``CUDA_TEXTURE_DESC``).
+
+    Attributes
+    ----------
+    address_mode : AddressMode or tuple of AddressMode
+        Boundary behavior per axis. A single :class:`AddressMode` applies to all
+        axes; a tuple of 1-3 entries is padded to 3 by repeating its last entry.
+    filter_mode : FilterMode
+        Texel sampling mode. Default ``POINT``.
+    read_mode : ReadMode
+        How sampled integer values are returned. Default ``ELEMENT_TYPE``.
+    normalized_coords : bool
+        If True, coordinates are in ``[0, 1]`` instead of pixel indices.
+    srgb : bool
+        If True, perform sRGB → linear conversion on read (8-bit formats only).
+    disable_trilinear_optimization : bool
+        If True, request exact trilinear filtering.
+    seamless_cubemap : bool
+        If True, enable seamless cubemap edge filtering.
+    max_anisotropy : int
+        Maximum anisotropy; 0 disables anisotropic filtering.
+    mipmap_filter_mode : FilterMode
+        Filtering between mipmap levels. Default ``POINT``.
+    mipmap_level_bias : float
+    min_mipmap_level_clamp : float
+    max_mipmap_level_clamp : float
+    border_color : tuple of float or None
+        4-tuple used when ``address_mode`` includes ``BORDER``; ``None`` means
+        zero.
+    """
+
+    address_mode: object = AddressMode.CLAMP
+    filter_mode: FilterMode = FilterMode.POINT
+    read_mode: ReadMode = ReadMode.ELEMENT_TYPE
+    normalized_coords: bool = False
+    srgb: bool = False
+    disable_trilinear_optimization: bool = False
+    seamless_cubemap: bool = False
+    max_anisotropy: int = 0
+    mipmap_filter_mode: FilterMode = FilterMode.POINT
+    mipmap_level_bias: float = 0.0
+    min_mipmap_level_clamp: float = 0.0
+    max_mipmap_level_clamp: float = 0.0
+    border_color: tuple | None = None
+
+
+cdef inline intptr_t _get_current_context_ptr() except? 0:
+    cdef cydriver.CUcontext ctx
+    with nogil:
+        HANDLE_RETURN(cydriver.cuCtxGetCurrent(&ctx))
+    if ctx == NULL:
+        raise RuntimeError("TextureObject requires an active CUDA context")
+    return <intptr_t>ctx
+
+
+cdef inline int _get_current_device_id() except -1:
+    cdef cydriver.CUdevice dev
+    with nogil:
+        HANDLE_RETURN(cydriver.cuCtxGetDevice(&dev))
+    return <int>dev
+
+
+cdef _normalize_address_modes(address_mode):
+    """Return a 3-tuple of AddressMode values from a scalar or 1-3 tuple."""
+    if isinstance(address_mode, AddressMode):
+        return (address_mode, address_mode, address_mode)
+    try:
+        modes = tuple(address_mode)
+    except TypeError as e:
+        raise TypeError(
+            "address_mode must be an AddressMode or a tuple of AddressMode"
+        ) from e
+    if not 1 <= len(modes) <= 3:
+        raise ValueError(
+            f"address_mode tuple must have 1-3 entries, got {len(modes)}"
+        )
+    for i, m in enumerate(modes):
+        if not isinstance(m, AddressMode):
+            raise TypeError(
+                f"address_mode[{i}] must be an AddressMode, got {type(m).__name__}"
+            )
+    # Pad to 3 entries by repeating the last one.
+    padded = list(modes) + [modes[-1]] * (3 - len(modes))
+    return tuple(padded)
+
+
+cdef class TextureObject:
+    """A bindless texture handle for kernel-side sampled reads.
+
+    Wraps ``cuTexObjectCreate``. The underlying memory resource (e.g. the
+    :class:`Array` referenced by the descriptor) is kept alive for the
+    lifetime of this object to prevent dangling handles.
+
+    Construct via :meth:`from_descriptor`. Passes to kernels as a 64-bit
+    handle (via the ``handle`` property).
+    """
+
+    def __init__(self, *args, **kwargs):
+        raise RuntimeError(
+            "TextureObject cannot be instantiated directly. "
+            "Use TextureObject.from_descriptor()."
+        )
+
+    @classmethod
+    def from_descriptor(cls, resource_desc, texture_desc):
+        """Create a texture object from a resource + sampling descriptor.
+
+        Parameters
+        ----------
+        resource_desc : ResourceDescriptor
+        texture_desc : TextureDescriptor
+        """
+        if not isinstance(resource_desc, ResourceDescriptor):
+            raise TypeError(
+                f"resource_desc must be a ResourceDescriptor, got "
+                f"{type(resource_desc).__name__}"
+            )
+        if not isinstance(texture_desc, TextureDescriptor):
+            raise TypeError(
+                f"texture_desc must be a TextureDescriptor, got "
+                f"{type(texture_desc).__name__}"
+            )
+
+        cdef cydriver.CUDA_RESOURCE_DESC res_desc
+        cdef cydriver.CUDA_TEXTURE_DESC tex_desc
+        memset(&res_desc, 0, sizeof(res_desc))
+        memset(&tex_desc, 0, sizeof(tex_desc))
+
+        # --- Resource descriptor ---
+        cdef Array arr
+        if resource_desc.kind == "array":
+            arr = <Array>resource_desc.source
+            res_desc.resType = cydriver.CU_RESOURCE_TYPE_ARRAY
+            res_desc.res.array.hArray = arr._handle
+        else:
+            raise NotImplementedError(
+                f"ResourceDescriptor kind {resource_desc.kind!r} is not yet supported"
+            )
+
+        # --- Texture descriptor ---
+        modes = _normalize_address_modes(texture_desc.address_mode)
+        tex_desc.addressMode[0] = <cydriver.CUaddress_mode><int>modes[0]
+        tex_desc.addressMode[1] = <cydriver.CUaddress_mode><int>modes[1]
+        tex_desc.addressMode[2] = <cydriver.CUaddress_mode><int>modes[2]
+
+        if not isinstance(texture_desc.filter_mode, FilterMode):
+            raise TypeError("filter_mode must be a FilterMode")
+        tex_desc.filterMode = <cydriver.CUfilter_mode><int>texture_desc.filter_mode
+
+        if not isinstance(texture_desc.read_mode, ReadMode):
+            raise TypeError("read_mode must be a ReadMode")
+
+        cdef unsigned int flags = 0
+        # CU_TRSF_READ_AS_INTEGER suppresses normalization, so it maps to
+        # ReadMode.ELEMENT_TYPE.
+        if texture_desc.read_mode == ReadMode.ELEMENT_TYPE:
+            flags |= _TRSF_READ_AS_INTEGER
+        if texture_desc.normalized_coords:
+            flags |= _TRSF_NORMALIZED_COORDINATES
+        if texture_desc.srgb:
+            flags |= _TRSF_SRGB
+        if texture_desc.disable_trilinear_optimization:
+            flags |= _TRSF_DISABLE_TRILINEAR_OPTIMIZATION
+        if texture_desc.seamless_cubemap:
+            flags |= _TRSF_SEAMLESS_CUBEMAP
+        tex_desc.flags = flags
+
+        if texture_desc.max_anisotropy < 0:
+            raise ValueError("max_anisotropy must be >= 0")
+        tex_desc.maxAnisotropy = <unsigned int>texture_desc.max_anisotropy
+
+        if not isinstance(texture_desc.mipmap_filter_mode, FilterMode):
+            raise TypeError("mipmap_filter_mode must be a FilterMode")
+        tex_desc.mipmapFilterMode = <cydriver.CUfilter_mode><int>texture_desc.mipmap_filter_mode
+        tex_desc.mipmapLevelBias = <float>texture_desc.mipmap_level_bias
+        tex_desc.minMipmapLevelClamp = <float>texture_desc.min_mipmap_level_clamp
+        tex_desc.maxMipmapLevelClamp = <float>texture_desc.max_mipmap_level_clamp
+
+        cdef int i
+        if texture_desc.border_color is None:
+            for i in range(4):
+                tex_desc.borderColor[i] = 0.0
+        else:
+            bc = tuple(texture_desc.border_color)
+            if len(bc) != 4:
+                raise ValueError(
+                    f"border_color must have 4 elements, got {len(bc)}"
+                )
+            for i in range(4):
+                tex_desc.borderColor[i] = <float>bc[i]
+
+        cdef TextureObject self = cls.__new__(cls)
+        self._source_ref = resource_desc
+        self._texture_desc = texture_desc
+        self._context = _get_current_context_ptr()
+        self._device_id = _get_current_device_id()
+
+        with nogil:
+            HANDLE_RETURN(
+                cydriver.cuTexObjectCreate(&self._handle, &res_desc, &tex_desc, NULL)
+            )
+        return self
+
+    @property
+    def handle(self):
+        """The underlying ``CUtexObject`` as an integer (64-bit kernel arg)."""
+        return <intptr_t>self._handle
+
+    @property
+    def resource(self):
+        """The :class:`ResourceDescriptor` this texture was built from."""
+        return self._source_ref
+
+    @property
+    def texture_descriptor(self):
+        """The :class:`TextureDescriptor` this texture was built from."""
+        return self._texture_desc
+
+    @property
+    def device(self):
+        from cuda.core._device import Device
+        return Device(self._device_id)
+
+    cpdef close(self):
+        """Destroy the underlying ``CUtexObject``."""
+        if self._handle != 0:
+            HANDLE_RETURN(cydriver.cuTexObjectDestroy(self._handle))
+        self._handle = 0
+        self._source_ref = None
+
+    def __dealloc__(self):
+        # Cython destructors cannot raise; any cuTexObjectDestroy error is
+        # silently dropped. Callers needing visibility should use close().
+        if self._handle != 0:
+            cydriver.cuTexObjectDestroy(self._handle)
+            self._handle = 0
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc, tb):
+        self.close()
+
+    def __repr__(self):
+        return f"TextureObject(handle=0x{<intptr_t>self._handle:x})"

From 0a0948a789bf2f4133b08ed069925640b539f732 Mon Sep 17 00:00:00 2001
From: Rob Parolin <rparolin@nvidia.com>
Date: Wed, 13 May 2026 17:14:35 -0700
Subject: [PATCH 04/17] Add SurfaceObject for kernel-side typed load/store
 (refs #467)

Completes the second half of #467 alongside the existing TextureObject:

- SurfaceObject wraps cuSurfObjectCreate / cuSurfObjectDestroy. Unlike a
  texture it has no sampling state (no filter mode, no addressing, no
  normalization); kernels read and write through it with integer pixel
  coordinates.
- Track CUDA_ARRAY3D_SURFACE_LDST on Array as a new surface_load_store
  property, populated in both Array.from_descriptor and
  Array._from_handle. SurfaceObject.from_array validates this upfront
  rather than letting the driver surface CUDA_ERROR_INVALID_VALUE late.
- Add a convenience SurfaceObject.from_array shortcut next to
  from_descriptor so the common case skips building a ResourceDescriptor
  by hand.

Covered by tests/test_texture_surface.py (14 tests: array shape/format/
flag plumbing, texture + surface creation, surface_load_store validation,
unsupported-resource-kind guard).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 cuda_core/cuda/core/__init__.py         |   1 +
 cuda_core/cuda/core/_array.pxd          |   1 +
 cuda_core/cuda/core/_array.pyx          |   8 +
 cuda_core/cuda/core/_surface.pxd        |  17 +++
 cuda_core/cuda/core/_surface.pyx        | 144 ++++++++++++++++++
 cuda_core/tests/test_texture_surface.py | 187 ++++++++++++++++++++++++
 6 files changed, 358 insertions(+)
 create mode 100644 cuda_core/cuda/core/_surface.pxd
 create mode 100644 cuda_core/cuda/core/_surface.pyx
 create mode 100644 cuda_core/tests/test_texture_surface.py

diff --git a/cuda_core/cuda/core/__init__.py b/cuda_core/cuda/core/__init__.py
index 0f84d79de53..245128352c0 100644
--- a/cuda_core/cuda/core/__init__.py
+++ b/cuda_core/cuda/core/__init__.py
@@ -87,6 +87,7 @@ class _PatchedProperty(metaclass=_PatchedPropMeta):
     TextureDescriptor,
     TextureObject,
 )
+from cuda.core._surface import SurfaceObject
 from cuda.core._event import Event, EventOptions
 from cuda.core._graphics import GraphicsResource
 from cuda.core._launch_config import LaunchConfig
diff --git a/cuda_core/cuda/core/_array.pxd b/cuda_core/cuda/core/_array.pxd
index 9b6e1dad5bd..d5b08b45dc4 100644
--- a/cuda_core/cuda/core/_array.pxd
+++ b/cuda_core/cuda/core/_array.pxd
@@ -16,5 +16,6 @@ cdef class Array:
         int _device_id
         intptr_t _context
         bint _owning
+        bint _surface_load_store
 
     cpdef close(self)
diff --git a/cuda_core/cuda/core/_array.pyx b/cuda_core/cuda/core/_array.pyx
index 3d1373b3023..50a3610a552 100644
--- a/cuda_core/cuda/core/_array.pyx
+++ b/cuda_core/cuda/core/_array.pyx
@@ -242,6 +242,7 @@ cdef class Array:
         self._shape = shape_t
         self._format = int(format)
         self._num_channels = num_channels
+        self._surface_load_store = bool(surface_load_store)
         self._context = _get_current_context_ptr()
         self._device_id = _get_current_device_id()
 
@@ -303,6 +304,7 @@ cdef class Array:
             self._shape = (int(desc.Width),)
         self._format = <int>desc.Format
         self._num_channels = desc.NumChannels
+        self._surface_load_store = bool(desc.Flags & cydriver.CUDA_ARRAY3D_SURFACE_LDST)
         return self
 
     @property
@@ -336,6 +338,12 @@ cdef class Array:
         from cuda.core._device import Device
         return Device(self._device_id)
 
+    @property
+    def surface_load_store(self):
+        """True if this array was created with ``CUDA_ARRAY3D_SURFACE_LDST``
+        and can be bound as a :class:`SurfaceObject`."""
+        return self._surface_load_store
+
     def _extent_bytes(self):
         """Return (width_bytes, height, depth) for cuMemcpy3D, with height/depth
         normalized to >=1 for lower-rank arrays."""
diff --git a/cuda_core/cuda/core/_surface.pxd b/cuda_core/cuda/core/_surface.pxd
new file mode 100644
index 00000000000..ba7791d5172
--- /dev/null
+++ b/cuda_core/cuda/core/_surface.pxd
@@ -0,0 +1,17 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from libc.stdint cimport intptr_t
+from cuda.bindings cimport cydriver
+
+
+cdef class SurfaceObject:
+
+    cdef:
+        cydriver.CUsurfObject _handle
+        object _source_ref      # keep backing Array alive
+        int _device_id
+        intptr_t _context
+
+    cpdef close(self)
diff --git a/cuda_core/cuda/core/_surface.pyx b/cuda_core/cuda/core/_surface.pyx
new file mode 100644
index 00000000000..46213eee17d
--- /dev/null
+++ b/cuda_core/cuda/core/_surface.pyx
@@ -0,0 +1,144 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from __future__ import annotations
+
+from libc.stdint cimport intptr_t
+from libc.string cimport memset
+
+from cuda.bindings cimport cydriver
+from cuda.core._array cimport Array
+from cuda.core._texture import ResourceDescriptor
+from cuda.core._utils.cuda_utils cimport HANDLE_RETURN
+
+
+cdef inline intptr_t _get_current_context_ptr() except? 0:
+    cdef cydriver.CUcontext ctx
+    with nogil:
+        HANDLE_RETURN(cydriver.cuCtxGetCurrent(&ctx))
+    if ctx == NULL:
+        raise RuntimeError("SurfaceObject requires an active CUDA context")
+    return <intptr_t>ctx
+
+
+cdef inline int _get_current_device_id() except -1:
+    cdef cydriver.CUdevice dev
+    with nogil:
+        HANDLE_RETURN(cydriver.cuCtxGetDevice(&dev))
+    return <int>dev
+
+
+cdef class SurfaceObject:
+    """A bindless surface handle for kernel-side typed load/store.
+
+    Wraps ``cuSurfObjectCreate``. Unlike a :class:`TextureObject`, a surface
+    has no sampling state (no filtering, no addressing modes, no normalization);
+    kernels read and write through it using integer pixel coordinates.
+
+    The backing :class:`Array` must have been created with
+    ``surface_load_store=True`` and is kept alive for the lifetime of this
+    object to prevent dangling handles.
+
+    Construct via :meth:`from_array` or :meth:`from_descriptor`. Passes to
+    kernels as a 64-bit handle (via the ``handle`` property).
+    """
+
+    def __init__(self, *args, **kwargs):
+        raise RuntimeError(
+            "SurfaceObject cannot be instantiated directly. "
+            "Use SurfaceObject.from_array() or SurfaceObject.from_descriptor()."
+        )
+
+    @classmethod
+    def from_array(cls, array):
+        """Create a surface object directly from an :class:`Array`.
+
+        The array must have been created with ``surface_load_store=True``.
+        """
+        if not isinstance(array, Array):
+            raise TypeError(f"array must be an Array, got {type(array).__name__}")
+        return cls.from_descriptor(ResourceDescriptor.from_array(array))
+
+    @classmethod
+    def from_descriptor(cls, resource_desc):
+        """Create a surface object from a :class:`ResourceDescriptor`.
+
+        Parameters
+        ----------
+        resource_desc : ResourceDescriptor
+            Must wrap an :class:`Array` allocated with
+            ``surface_load_store=True``. Linear/pitch2d resources are not
+            valid surface backings.
+        """
+        if not isinstance(resource_desc, ResourceDescriptor):
+            raise TypeError(
+                f"resource_desc must be a ResourceDescriptor, got "
+                f"{type(resource_desc).__name__}"
+            )
+        if resource_desc.kind != "array":
+            raise ValueError(
+                f"SurfaceObject requires an array-backed ResourceDescriptor, "
+                f"got kind={resource_desc.kind!r}"
+            )
+
+        cdef Array arr = <Array>resource_desc.source
+        if not arr.surface_load_store:
+            raise ValueError(
+                "Array must be created with surface_load_store=True to be "
+                "bound as a SurfaceObject"
+            )
+
+        cdef cydriver.CUDA_RESOURCE_DESC res_desc
+        memset(&res_desc, 0, sizeof(res_desc))
+        res_desc.resType = cydriver.CU_RESOURCE_TYPE_ARRAY
+        res_desc.res.array.hArray = arr._handle
+
+        cdef SurfaceObject self = cls.__new__(cls)
+        self._source_ref = resource_desc
+        self._context = _get_current_context_ptr()
+        self._device_id = _get_current_device_id()
+
+        with nogil:
+            HANDLE_RETURN(
+                cydriver.cuSurfObjectCreate(&self._handle, &res_desc)
+            )
+        return self
+
+    @property
+    def handle(self):
+        """The underlying ``CUsurfObject`` as an integer (64-bit kernel arg)."""
+        return <intptr_t>self._handle
+
+    @property
+    def resource(self):
+        """The :class:`ResourceDescriptor` this surface was built from."""
+        return self._source_ref
+
+    @property
+    def device(self):
+        from cuda.core._device import Device
+        return Device(self._device_id)
+
+    cpdef close(self):
+        """Destroy the underlying ``CUsurfObject``."""
+        if self._handle != 0:
+            HANDLE_RETURN(cydriver.cuSurfObjectDestroy(self._handle))
+        self._handle = 0
+        self._source_ref = None
+
+    def __dealloc__(self):
+        # Cython destructors cannot raise; any cuSurfObjectDestroy error is
+        # silently dropped. Callers needing visibility should use close().
+        if self._handle != 0:
+            cydriver.cuSurfObjectDestroy(self._handle)
+            self._handle = 0
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc, tb):
+        self.close()
+
+    def __repr__(self):
+        return f"SurfaceObject(handle=0x{<intptr_t>self._handle:x})"
diff --git a/cuda_core/tests/test_texture_surface.py b/cuda_core/tests/test_texture_surface.py
new file mode 100644
index 00000000000..d24ecbec796
--- /dev/null
+++ b/cuda_core/tests/test_texture_surface.py
@@ -0,0 +1,187 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+import pytest
+
+import cuda.core
+from cuda.core import (
+    AddressMode,
+    Array,
+    ArrayFormat,
+    Device,
+    FilterMode,
+    ReadMode,
+    ResourceDescriptor,
+    SurfaceObject,
+    TextureDescriptor,
+    TextureObject,
+)
+
+
+def test_array_init_disabled():
+    with pytest.raises(RuntimeError, match=r"^Array cannot be instantiated directly"):
+        cuda.core._array.Array()
+
+
+def test_texture_object_init_disabled():
+    with pytest.raises(RuntimeError, match=r"^TextureObject cannot be instantiated directly"):
+        cuda.core._texture.TextureObject()
+
+
+def test_surface_object_init_disabled():
+    with pytest.raises(RuntimeError, match=r"^SurfaceObject cannot be instantiated directly"):
+        cuda.core._surface.SurfaceObject()
+
+
+def test_resource_descriptor_init_disabled():
+    with pytest.raises(RuntimeError, match=r"^ResourceDescriptor cannot be instantiated"):
+        ResourceDescriptor()
+
+
+def test_array_2d_create_and_properties(init_cuda):
+    arr = Array.from_descriptor(
+        shape=(32, 16), format=ArrayFormat.FLOAT32, num_channels=1
+    )
+    try:
+        assert arr.shape == (32, 16)
+        assert arr.format == ArrayFormat.FLOAT32
+        assert arr.num_channels == 1
+        assert arr.element_size == 4
+        assert arr.size_bytes == 32 * 16 * 4
+        assert arr.surface_load_store is False
+        assert arr.handle != 0
+        assert isinstance(arr.device, Device)
+    finally:
+        arr.close()
+
+
+def test_array_3d_with_surface_flag(init_cuda):
+    arr = Array.from_descriptor(
+        shape=(8, 8, 4),
+        format=ArrayFormat.UINT8,
+        num_channels=4,
+        surface_load_store=True,
+    )
+    try:
+        assert arr.shape == (8, 8, 4)
+        assert arr.surface_load_store is True
+        assert arr.element_size == 4
+    finally:
+        arr.close()
+
+
+def test_array_rejects_bad_channels(init_cuda):
+    with pytest.raises(ValueError, match="num_channels"):
+        Array.from_descriptor(shape=(8,), format=ArrayFormat.UINT8, num_channels=3)
+
+
+def test_array_rejects_bad_rank(init_cuda):
+    with pytest.raises(ValueError, match="shape rank"):
+        Array.from_descriptor(
+            shape=(2, 2, 2, 2), format=ArrayFormat.UINT8, num_channels=1
+        )
+
+
+def test_array_roundtrip_copy(init_cuda):
+    import array as _array
+
+    device = Device()
+    stream = device.create_stream()
+    arr = Array.from_descriptor(
+        shape=(16,), format=ArrayFormat.UINT32, num_channels=1
+    )
+    try:
+        src = _array.array("I", list(range(16)))
+        dst = _array.array("I", [0] * 16)
+        arr.copy_from(src, stream=stream)
+        arr.copy_to(dst, stream=stream)
+        stream.sync()
+        assert list(dst) == list(range(16))
+    finally:
+        arr.close()
+        stream.close()
+
+
+def test_texture_object_create(init_cuda):
+    arr = Array.from_descriptor(
+        shape=(32, 16), format=ArrayFormat.FLOAT32, num_channels=1
+    )
+    try:
+        res = ResourceDescriptor.from_array(arr)
+        tex_desc = TextureDescriptor(
+            address_mode=AddressMode.CLAMP,
+            filter_mode=FilterMode.LINEAR,
+            read_mode=ReadMode.ELEMENT_TYPE,
+            normalized_coords=True,
+        )
+        tex = TextureObject.from_descriptor(res, tex_desc)
+        try:
+            assert tex.handle != 0
+            assert tex.resource is res
+            assert tex.texture_descriptor is tex_desc
+        finally:
+            tex.close()
+    finally:
+        arr.close()
+
+
+def test_surface_object_create(init_cuda):
+    arr = Array.from_descriptor(
+        shape=(8, 8),
+        format=ArrayFormat.UINT8,
+        num_channels=4,
+        surface_load_store=True,
+    )
+    try:
+        surf = SurfaceObject.from_array(arr)
+        try:
+            assert surf.handle != 0
+            assert isinstance(surf.resource, ResourceDescriptor)
+        finally:
+            surf.close()
+    finally:
+        arr.close()
+
+
+def test_surface_requires_ldst_flag(init_cuda):
+    arr = Array.from_descriptor(
+        shape=(8, 8), format=ArrayFormat.UINT8, num_channels=4
+    )
+    try:
+        with pytest.raises(ValueError, match="surface_load_store=True"):
+            SurfaceObject.from_array(arr)
+    finally:
+        arr.close()
+
+
+def test_surface_rejects_non_array_resource(init_cuda):
+    # ResourceDescriptor only exposes from_array today, so use a fake kind.
+    arr = Array.from_descriptor(
+        shape=(8, 8),
+        format=ArrayFormat.UINT8,
+        num_channels=4,
+        surface_load_store=True,
+    )
+    try:
+        res = ResourceDescriptor.from_array(arr)
+        res._kind = "linear"  # simulate a future, unsupported resource kind
+        with pytest.raises(ValueError, match="array-backed"):
+            SurfaceObject.from_descriptor(res)
+    finally:
+        arr.close()
+
+
+def test_address_mode_normalization(init_cuda):
+    arr = Array.from_descriptor(
+        shape=(8, 8, 4), format=ArrayFormat.FLOAT32, num_channels=1
+    )
+    try:
+        res = ResourceDescriptor.from_array(arr)
+        # Per-axis tuple shorter than 3 should be accepted and padded.
+        tex_desc = TextureDescriptor(
+            address_mode=(AddressMode.WRAP, AddressMode.CLAMP)
+        )
+        tex = TextureObject.from_descriptor(res, tex_desc)
+        tex.close()
+    finally:
+        arr.close()

From 27b455435dbf4edd8c9592aa0f18af02d62b810c Mon Sep 17 00:00:00 2001
From: Rob Parolin <rparolin@nvidia.com>
Date: Wed, 13 May 2026 17:19:42 -0700
Subject: [PATCH 05/17] Add ResourceDescriptor.from_linear and .from_pitch2d
 (refs #467)

Widens the texture-resource surface to cover the two Buffer-backed
variants from CUDA_RESOURCE_DESC:

- ResourceDescriptor.from_linear(buffer, *, format, num_channels,
  size_bytes=None) wraps a Buffer as a typed 1D fetch. size_bytes
  defaults to buffer.size; negative values and values larger than
  buffer.size are rejected.
- ResourceDescriptor.from_pitch2d(buffer, *, format, num_channels,
  width, height, pitch_bytes) wraps a Buffer as a row-pitched 2D
  image. Validates pitch_bytes >= width * element_size and
  pitch_bytes * height <= buffer.size; the driver enforces its own
  CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT on top.
- TextureObject.from_descriptor handles the three resType branches
  (ARRAY, LINEAR, PITCH2D); SurfaceObject continues to require an
  array-backed resource.
- ResourceDescriptor gains format/num_channels read-only properties
  (None for array-backed) and a kind-aware __repr__.

Tests: 10 added (linear/pitch2D creation, validation paths, surface
rejection of non-array resources) and the fake-kind surface test they
supersede removed, for a net +9 on top of the existing 14. Full
test-core suite green (3287 passed).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 cuda_core/cuda/core/_texture.pyx        | 186 +++++++++++++++++++++-
 cuda_core/tests/test_texture_surface.py | 197 ++++++++++++++++++++++--
 2 files changed, 361 insertions(+), 22 deletions(-)

diff --git a/cuda_core/cuda/core/_texture.pyx b/cuda_core/cuda/core/_texture.pyx
index 7f869c47782..6d06acb8826 100644
--- a/cuda_core/cuda/core/_texture.pyx
+++ b/cuda_core/cuda/core/_texture.pyx
@@ -9,6 +9,8 @@ from libc.string cimport memset
 
 from cuda.bindings cimport cydriver
 from cuda.core._array cimport Array
+from cuda.core._array import ArrayFormat, _FORMAT_ELEM_SIZE
+from cuda.core._memory._buffer cimport Buffer
 from cuda.core._utils.cuda_utils cimport HANDLE_RETURN
 
 import enum
@@ -53,13 +55,26 @@ class ReadMode(enum.IntEnum):
 class ResourceDescriptor:
     """Describes the memory backing a :class:`TextureObject`.
 
-    Construct via the ``from_*`` classmethods. Only the ``from_array`` path is
-    implemented in this initial version; ``from_linear`` and ``from_pitch2d``
-    will follow once their metadata story (format/channel count on
-    :class:`Buffer`) is settled.
+    Construct via the ``from_*`` classmethods:
+
+    - :meth:`from_array` wraps a :class:`Array` (works for both
+      :class:`TextureObject` and :class:`SurfaceObject`).
+    - :meth:`from_linear` wraps a :class:`Buffer` as a typed 1D fetch. Texture
+      objects built from a linear resource do not support filtering,
+      normalized coordinates, or addressing modes.
+    - :meth:`from_pitch2d` wraps a :class:`Buffer` as a row-pitched 2D image.
+      Supports filtering and 2D addressing, but only 2D access.
+
+    Linear and pitch2D resources cannot back a :class:`SurfaceObject` — those
+    require an :class:`Array` allocated with ``surface_load_store=True``.
     """
 
-    __slots__ = ("_kind", "_source")
+    __slots__ = (
+        "_kind", "_source",
+        "_format", "_num_channels",
+        "_size_bytes",
+        "_width", "_height", "_pitch_bytes",
+    )
 
     def __init__(self):
         raise RuntimeError(
@@ -75,6 +90,125 @@ class ResourceDescriptor:
         self = cls.__new__(cls)
         self._kind = "array"
         self._source = array
+        self._format = None
+        self._num_channels = None
+        self._size_bytes = None
+        self._width = None
+        self._height = None
+        self._pitch_bytes = None
+        return self
+
+    @classmethod
+    def from_linear(cls, buffer, *, format, num_channels, size_bytes=None):
+        """Build a resource descriptor for a linear (typed 1D) texture fetch.
+
+        Parameters
+        ----------
+        buffer : Buffer
+            Device-memory backing. Must remain alive for the lifetime of any
+            :class:`TextureObject` built from this descriptor.
+        format : ArrayFormat
+            Element format.
+        num_channels : int
+            Channels per element. Must be 1, 2, or 4.
+        size_bytes : int, optional
+            Bytes of ``buffer`` to bind. Defaults to ``buffer.size``. Must not
+            exceed it.
+
+        Notes
+        -----
+        Texture objects built from a linear resource ignore the
+        :class:`TextureDescriptor` addressing/filtering fields — kernels read
+        through a typed 1D fetch with bounds checking only.
+        """
+        if not isinstance(buffer, Buffer):
+            raise TypeError(f"buffer must be a Buffer, got {type(buffer).__name__}")
+        if not isinstance(format, ArrayFormat):
+            raise TypeError(f"format must be an ArrayFormat, got {type(format).__name__}")
+        if num_channels not in (1, 2, 4):
+            raise ValueError(f"num_channels must be 1, 2, or 4, got {num_channels}")
+
+        buf_size = int(buffer.size)
+        if size_bytes is None:
+            size = buf_size
+        else:
+            size = int(size_bytes)
+            if size < 0:
+                raise ValueError(f"size_bytes must be >= 0, got {size}")
+            if size > buf_size:
+                raise ValueError(
+                    f"size_bytes ({size}) exceeds buffer.size ({buf_size})"
+                )
+
+        self = cls.__new__(cls)
+        self._kind = "linear"
+        self._source = buffer
+        self._format = int(format)
+        self._num_channels = int(num_channels)
+        self._size_bytes = size
+        self._width = None
+        self._height = None
+        self._pitch_bytes = None
+        return self
+
+    @classmethod
+    def from_pitch2d(
+        cls, buffer, *, format, num_channels, width, height, pitch_bytes
+    ):
+        """Build a resource descriptor for a row-pitched 2D image.
+
+        Parameters
+        ----------
+        buffer : Buffer
+            Device-memory backing. Must remain alive for the lifetime of any
+            :class:`TextureObject` built from this descriptor.
+        format : ArrayFormat
+            Element format.
+        num_channels : int
+            Channels per element. Must be 1, 2, or 4.
+        width : int
+            Image width, in elements.
+        height : int
+            Image height, in rows.
+        pitch_bytes : int
+            Distance between consecutive rows, in bytes. Must be at least
+            ``width * format_size * num_channels`` and meet the driver's
+            ``CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT``.
+        """
+        if not isinstance(buffer, Buffer):
+            raise TypeError(f"buffer must be a Buffer, got {type(buffer).__name__}")
+        if not isinstance(format, ArrayFormat):
+            raise TypeError(f"format must be an ArrayFormat, got {type(format).__name__}")
+        if num_channels not in (1, 2, 4):
+            raise ValueError(f"num_channels must be 1, 2, or 4, got {num_channels}")
+
+        w = int(width)
+        h = int(height)
+        p = int(pitch_bytes)
+        if w < 1:
+            raise ValueError(f"width must be >= 1, got {w}")
+        if h < 1:
+            raise ValueError(f"height must be >= 1, got {h}")
+        elem = _FORMAT_ELEM_SIZE[int(format)] * int(num_channels)
+        min_pitch = w * elem
+        if p < min_pitch:
+            raise ValueError(
+                f"pitch_bytes ({p}) must be >= width * element_size ({min_pitch})"
+            )
+        if p * h > int(buffer.size):
+            raise ValueError(
+                f"pitch_bytes * height ({p * h}) exceeds buffer.size ({int(buffer.size)})"
+            )
+
+        self = cls.__new__(cls)
+        self._kind = "pitch2d"
+        self._source = buffer
+        self._format = int(format)
+        self._num_channels = int(num_channels)
+        self._size_bytes = None
+        self._width = w
+        self._height = h
+        self._pitch_bytes = p
         return self
 
     @property
@@ -85,7 +219,29 @@ class ResourceDescriptor:
     def source(self):
         return self._source
 
+    @property
+    def format(self):
+        """The element :class:`ArrayFormat` (``None`` for array-backed)."""
+        return None if self._format is None else ArrayFormat(self._format)
+
+    @property
+    def num_channels(self):
+        """Channels per element (``None`` for array-backed)."""
+        return self._num_channels
+
     def __repr__(self):
+        if self._kind == "linear":
+            return (
+                f"ResourceDescriptor(kind='linear', format={self.format.name}, "
+                f"num_channels={self._num_channels}, size_bytes={self._size_bytes})"
+            )
+        if self._kind == "pitch2d":
+            return (
+                f"ResourceDescriptor(kind='pitch2d', format={self.format.name}, "
+                f"num_channels={self._num_channels}, "
+                f"width={self._width}, height={self._height}, "
+                f"pitch_bytes={self._pitch_bytes})"
+            )
         return f"ResourceDescriptor(kind={self._kind!r})"
 
 
@@ -221,10 +377,30 @@ cdef class TextureObject:
 
         # --- Resource descriptor ---
         cdef Array arr
+        cdef Buffer buf
+        cdef intptr_t devptr
         if resource_desc.kind == "array":
             arr = <Array>resource_desc.source
             res_desc.resType = cydriver.CU_RESOURCE_TYPE_ARRAY
             res_desc.res.array.hArray = arr._handle
+        elif resource_desc.kind == "linear":
+            buf = <Buffer>resource_desc.source
+            devptr = int(buf.handle)
+            res_desc.resType = cydriver.CU_RESOURCE_TYPE_LINEAR
+            res_desc.res.linear.devPtr = <cydriver.CUdeviceptr>devptr
+            res_desc.res.linear.format = <cydriver.CUarray_format><int>resource_desc._format
+            res_desc.res.linear.numChannels = <unsigned int>resource_desc._num_channels
+            res_desc.res.linear.sizeInBytes = <size_t>resource_desc._size_bytes
+        elif resource_desc.kind == "pitch2d":
+            buf = <Buffer>resource_desc.source
+            devptr = int(buf.handle)
+            res_desc.resType = cydriver.CU_RESOURCE_TYPE_PITCH2D
+            res_desc.res.pitch2D.devPtr = <cydriver.CUdeviceptr>devptr
+            res_desc.res.pitch2D.format = <cydriver.CUarray_format><int>resource_desc._format
+            res_desc.res.pitch2D.numChannels = <unsigned int>resource_desc._num_channels
+            res_desc.res.pitch2D.width = <size_t>resource_desc._width
+            res_desc.res.pitch2D.height = <size_t>resource_desc._height
+            res_desc.res.pitch2D.pitchInBytes = <size_t>resource_desc._pitch_bytes
         else:
             raise NotImplementedError(
                 f"ResourceDescriptor kind {resource_desc.kind!r} is not yet supported"
diff --git a/cuda_core/tests/test_texture_surface.py b/cuda_core/tests/test_texture_surface.py
index d24ecbec796..89172b58f67 100644
--- a/cuda_core/tests/test_texture_surface.py
+++ b/cuda_core/tests/test_texture_surface.py
@@ -154,23 +154,6 @@ def test_surface_requires_ldst_flag(init_cuda):
         arr.close()
 
 
-def test_surface_rejects_non_array_resource(init_cuda):
-    # ResourceDescriptor only exposes from_array today, so use a fake kind.
-    arr = Array.from_descriptor(
-        shape=(8, 8),
-        format=ArrayFormat.UINT8,
-        num_channels=4,
-        surface_load_store=True,
-    )
-    try:
-        res = ResourceDescriptor.from_array(arr)
-        res._kind = "linear"  # simulate a future, unsupported resource kind
-        with pytest.raises(ValueError, match="array-backed"):
-            SurfaceObject.from_descriptor(res)
-    finally:
-        arr.close()
-
-
 def test_address_mode_normalization(init_cuda):
     arr = Array.from_descriptor(
         shape=(8, 8, 4), format=ArrayFormat.FLOAT32, num_channels=1
@@ -185,3 +168,183 @@ def test_address_mode_normalization(init_cuda):
         tex.close()
     finally:
         arr.close()
+
+
+# --- Linear / pitch2D resource descriptors -----------------------------------
+
+def _alloc_device_buffer(device, nbytes):
+    """Allocate a device Buffer using the device's default memory resource."""
+    return device.memory_resource.allocate(nbytes, stream=device.default_stream)
+
+
+def test_resource_descriptor_from_linear_defaults_size(init_cuda):
+    device = Device()
+    buf = _alloc_device_buffer(device, 4096)
+    try:
+        res = ResourceDescriptor.from_linear(
+            buf, format=ArrayFormat.FLOAT32, num_channels=1
+        )
+        assert res.kind == "linear"
+        assert res.format == ArrayFormat.FLOAT32
+        assert res.num_channels == 1
+        assert res.source is buf
+        # repr should include the kind/format hint
+        assert "linear" in repr(res)
+    finally:
+        buf.close()
+
+
+def test_resource_descriptor_from_linear_size_override(init_cuda):
+    device = Device()
+    buf = _alloc_device_buffer(device, 4096)
+    try:
+        res = ResourceDescriptor.from_linear(
+            buf, format=ArrayFormat.UINT32, num_channels=1, size_bytes=2048
+        )
+        assert res._size_bytes == 2048
+    finally:
+        buf.close()
+
+
+def test_resource_descriptor_from_linear_rejects_oversize(init_cuda):
+    device = Device()
+    buf = _alloc_device_buffer(device, 1024)
+    try:
+        with pytest.raises(ValueError, match="exceeds buffer.size"):
+            ResourceDescriptor.from_linear(
+                buf, format=ArrayFormat.UINT8, num_channels=1, size_bytes=2048
+            )
+    finally:
+        buf.close()
+
+
+def test_resource_descriptor_from_linear_rejects_bad_channels(init_cuda):
+    device = Device()
+    buf = _alloc_device_buffer(device, 1024)
+    try:
+        with pytest.raises(ValueError, match="num_channels"):
+            ResourceDescriptor.from_linear(
+                buf, format=ArrayFormat.UINT8, num_channels=3
+            )
+    finally:
+        buf.close()
+
+
+def test_resource_descriptor_from_linear_rejects_non_buffer():
+    with pytest.raises(TypeError, match="Buffer"):
+        ResourceDescriptor.from_linear(
+            object(), format=ArrayFormat.UINT8, num_channels=1
+        )
+
+
+def test_texture_object_from_linear(init_cuda):
+    """A linear-backed texture should bind even though sampling fields are
+    effectively ignored by the driver."""
+    device = Device()
+    # 1024 float elements
+    buf = _alloc_device_buffer(device, 1024 * 4)
+    try:
+        res = ResourceDescriptor.from_linear(
+            buf, format=ArrayFormat.FLOAT32, num_channels=1
+        )
+        tex = TextureObject.from_descriptor(res, TextureDescriptor())
+        try:
+            assert tex.handle != 0
+            assert tex.resource is res
+        finally:
+            tex.close()
+    finally:
+        buf.close()
+
+
+def test_resource_descriptor_from_pitch2d_validates_pitch(init_cuda):
+    device = Device()
+    buf = _alloc_device_buffer(device, 64 * 1024)
+    try:
+        # element_size = 4 (UINT32 * 1 channel); width=16 -> min_pitch=64
+        with pytest.raises(ValueError, match="pitch_bytes"):
+            ResourceDescriptor.from_pitch2d(
+                buf,
+                format=ArrayFormat.UINT32,
+                num_channels=1,
+                width=16,
+                height=8,
+                pitch_bytes=32,  # < 64 = width*element_size
+            )
+    finally:
+        buf.close()
+
+
+def test_resource_descriptor_from_pitch2d_validates_buffer_size(init_cuda):
+    device = Device()
+    buf = _alloc_device_buffer(device, 4096)
+    try:
+        with pytest.raises(ValueError, match="exceeds buffer.size"):
+            ResourceDescriptor.from_pitch2d(
+                buf,
+                format=ArrayFormat.UINT8,
+                num_channels=4,
+                width=64,
+                height=128,
+                pitch_bytes=512,  # 512 * 128 = 65536 > 4096
+            )
+    finally:
+        buf.close()
+
+
+def test_texture_object_from_pitch2d(init_cuda):
+    """A pitch2D-backed texture should bind given driver-aligned pitch."""
+    from cuda.bindings import driver
+
+    device = Device()
+    # Query the device's required texture pitch alignment (typically 32-512).
+    err, align = driver.cuDeviceGetAttribute(
+        driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT,
+        device.device_id,
+    )
+    assert int(err) == 0
+    pitch = max(int(align), 256)
+    height = 16
+    buf = _alloc_device_buffer(device, pitch * height)
+    try:
+        res = ResourceDescriptor.from_pitch2d(
+            buf,
+            format=ArrayFormat.UINT8,
+            num_channels=4,
+            width=32,
+            height=height,
+            pitch_bytes=pitch,
+        )
+        assert res.kind == "pitch2d"
+        assert "pitch2d" in repr(res)
+        tex = TextureObject.from_descriptor(res, TextureDescriptor())
+        try:
+            assert tex.handle != 0
+        finally:
+            tex.close()
+    finally:
+        buf.close()
+
+
+def test_surface_rejects_linear_and_pitch2d(init_cuda):
+    device = Device()
+    buf = _alloc_device_buffer(device, 4096)
+    try:
+        res_lin = ResourceDescriptor.from_linear(
+            buf, format=ArrayFormat.UINT32, num_channels=1
+        )
+        with pytest.raises(ValueError, match="array-backed"):
+            SurfaceObject.from_descriptor(res_lin)
+
+        res_p2 = ResourceDescriptor.from_pitch2d(
+            buf,
+            format=ArrayFormat.UINT8,
+            num_channels=4,
+            width=8,
+            height=8,
+            pitch_bytes=64,
+        )
+        with pytest.raises(ValueError, match="array-backed"):
+            SurfaceObject.from_descriptor(res_p2)
+    finally:
+        buf.close()

From 168a248407986676ef8c371aa845d10a415a9b86 Mon Sep 17 00:00:00 2001
From: Rob Parolin <rparolin@nvidia.com>
Date: Wed, 13 May 2026 17:27:29 -0700
Subject: [PATCH 06/17] Add docs for texture/surface APIs (refs #467)

Wire the newly public Array, ArrayFormat, TextureObject, SurfaceObject,
ResourceDescriptor, TextureDescriptor, AddressMode, FilterMode, and
ReadMode symbols into the cuda.core Sphinx reference under a new
"Textures and surfaces" section in api.rst. No source docstring changes;
documentation is rendered via the existing autosummary templates and the
enum_documenter extension.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 cuda_core/docs/source/api.rst | 32 ++++++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)

diff --git a/cuda_core/docs/source/api.rst b/cuda_core/docs/source/api.rst
index 0a88a5bd4b6..d3c9b761510 100644
--- a/cuda_core/docs/source/api.rst
+++ b/cuda_core/docs/source/api.rst
@@ -159,6 +159,38 @@ Tensor Memory Accelerator (TMA)
    TensorMapDescriptorOptions
 
 
+Textures and surfaces
+---------------------
+
+CUDA arrays back bindless texture and surface objects for kernel-side sampled
+reads and typed load/store. :class:`Array` is allocated through
+:meth:`Array.from_descriptor` and bound through a :class:`ResourceDescriptor`
+factory; linear (1D) and row-pitched 2D :class:`Buffer` views are also
+supported as texture backings.
+
+.. autosummary::
+   :toctree: generated/
+
+   :template: autosummary/cyclass.rst
+
+   Array
+   ResourceDescriptor
+   TextureObject
+   SurfaceObject
+
+   :template: dataclass.rst
+
+   TextureDescriptor
+
+.. autosummary::
+   :toctree: generated/
+
+   ArrayFormat
+   AddressMode
+   FilterMode
+   ReadMode
+
+
 CUDA compilation toolchain
 --------------------------
 

From 53f993e2ab018f5323627dec2e5662dd42b27ae3 Mon Sep 17 00:00:00 2001
From: Rob Parolin <rparolin@nvidia.com>
Date: Wed, 13 May 2026 17:37:52 -0700
Subject: [PATCH 07/17] Add MipmappedArray and
 ResourceDescriptor.from_mipmapped_array (refs #467)

Introduces a MipmappedArray cdef class wrapping CUmipmappedArray with the
same lifetime model as Array (close/__dealloc__/context-manager). Levels
are obtained via get_level(L), which returns a non-owning Array that
holds a strong ref back to the parent MipmappedArray via a new
Array._parent_ref slot, ensuring level views cannot outlive the
underlying storage. Surfaces continue to require a single-Array backing;
the existing kind != "array" check in SurfaceObject.from_descriptor
naturally rejects mipmapped resources (covered by a new test).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 cuda_core/cuda/core/__init__.py          |   1 +
 cuda_core/cuda/core/_array.pxd           |   4 +
 cuda_core/cuda/core/_array.pyx           |   5 +
 cuda_core/cuda/core/_mipmapped_array.pxd |  22 +++
 cuda_core/cuda/core/_mipmapped_array.pyx | 239 +++++++++++++++++++++++
 cuda_core/cuda/core/_texture.pyx         |  32 +++
 cuda_core/tests/test_texture_surface.py  | 205 +++++++++++++++++++
 7 files changed, 508 insertions(+)
 create mode 100644 cuda_core/cuda/core/_mipmapped_array.pxd
 create mode 100644 cuda_core/cuda/core/_mipmapped_array.pyx

diff --git a/cuda_core/cuda/core/__init__.py b/cuda_core/cuda/core/__init__.py
index 245128352c0..9769a39977f 100644
--- a/cuda_core/cuda/core/__init__.py
+++ b/cuda_core/cuda/core/__init__.py
@@ -79,6 +79,7 @@ class _PatchedProperty(metaclass=_PatchedPropMeta):
     WorkqueueResourceOptions,
 )
 from cuda.core._array import Array, ArrayFormat
+from cuda.core._mipmapped_array import MipmappedArray
 from cuda.core._texture import (
     AddressMode,
     FilterMode,
diff --git a/cuda_core/cuda/core/_array.pxd b/cuda_core/cuda/core/_array.pxd
index d5b08b45dc4..49ae1075d8b 100644
--- a/cuda_core/cuda/core/_array.pxd
+++ b/cuda_core/cuda/core/_array.pxd
@@ -17,5 +17,9 @@ cdef class Array:
         intptr_t _context
         bint _owning
         bint _surface_load_store
+        # Optional strong reference to a parent owner (e.g. a MipmappedArray
+        # whose level this Array views). When set, the parent must outlive
+        # this Array because the underlying CUarray belongs to the parent.
+        object _parent_ref
 
     cpdef close(self)
diff --git a/cuda_core/cuda/core/_array.pyx b/cuda_core/cuda/core/_array.pyx
index 50a3610a552..37c5439ddb3 100644
--- a/cuda_core/cuda/core/_array.pyx
+++ b/cuda_core/cuda/core/_array.pyx
@@ -245,6 +245,7 @@ cdef class Array:
         self._surface_load_store = bool(surface_load_store)
         self._context = _get_current_context_ptr()
         self._device_id = _get_current_device_id()
+        self._parent_ref = None
 
         cdef cydriver.CUarray_format c_format = <cydriver.CUarray_format><int>format
         cdef cydriver.CUDA_ARRAY3D_DESCRIPTOR desc3d
@@ -291,6 +292,7 @@ cdef class Array:
         self._owning = owning
         self._context = _get_current_context_ptr()
         self._device_id = _get_current_device_id() if device_id is None else int(device_id)
+        self._parent_ref = None
 
         cdef cydriver.CUDA_ARRAY3D_DESCRIPTOR desc
         with nogil:
@@ -394,6 +396,9 @@ cdef class Array:
         if self._handle != NULL and self._owning:
             HANDLE_RETURN(cydriver.cuArrayDestroy(self._handle))
         self._handle = NULL
+        # Drop the parent reference (if any) so a non-owning level Array
+        # stops pinning its MipmappedArray after close().
+        self._parent_ref = None
 
     def __dealloc__(self):
         # Cython destructors cannot raise; any cuArrayDestroy error here is
diff --git a/cuda_core/cuda/core/_mipmapped_array.pxd b/cuda_core/cuda/core/_mipmapped_array.pxd
new file mode 100644
index 00000000000..52aa0dc863e
--- /dev/null
+++ b/cuda_core/cuda/core/_mipmapped_array.pxd
@@ -0,0 +1,22 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from libc.stdint cimport intptr_t
+from cuda.bindings cimport cydriver
+
+
+cdef class MipmappedArray:
+
+    cdef:
+        cydriver.CUmipmappedArray _handle
+        tuple _shape                 # (w,), (w, h), or (w, h, d)
+        int _format                  # CUarray_format value
+        unsigned int _num_channels   # 1, 2, or 4
+        unsigned int _num_levels
+        int _device_id
+        intptr_t _context
+        bint _owning
+        bint _surface_load_store
+
+    cpdef close(self)
diff --git a/cuda_core/cuda/core/_mipmapped_array.pyx b/cuda_core/cuda/core/_mipmapped_array.pyx
new file mode 100644
index 00000000000..a4cfd40bb80
--- /dev/null
+++ b/cuda_core/cuda/core/_mipmapped_array.pyx
@@ -0,0 +1,239 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from __future__ import annotations
+
+from libc.stdint cimport intptr_t
+from libc.string cimport memset
+
+from cuda.bindings cimport cydriver
+from cuda.core._array cimport Array
+from cuda.core._array import ArrayFormat
+from cuda.core._utils.cuda_utils cimport HANDLE_RETURN
+
+
+cdef inline intptr_t _get_current_context_ptr() except? 0:
+    cdef cydriver.CUcontext ctx
+    with nogil:
+        HANDLE_RETURN(cydriver.cuCtxGetCurrent(&ctx))
+    if ctx == NULL:
+        raise RuntimeError("MipmappedArray allocation requires an active CUDA context")
+    return <intptr_t>ctx
+
+
+cdef inline int _get_current_device_id() except -1:
+    cdef cydriver.CUdevice dev
+    with nogil:
+        HANDLE_RETURN(cydriver.cuCtxGetDevice(&dev))
+    return <int>dev
+
+
+cdef class MipmappedArray:
+    """A mipmapped CUDA array for texture/surface access across levels.
+
+    Wraps ``CUmipmappedArray``. Each mip level is a distinct, hardware-laid-out
+    allocation accessible only via a :class:`TextureObject` (or by retrieving
+    the level's :class:`Array` and binding it as a :class:`SurfaceObject`).
+    Destroying the :class:`MipmappedArray` destroys all level arrays
+    implicitly, so the :class:`Array` instances returned by :meth:`get_level`
+    are non-owning and hold a strong reference back to their parent.
+
+    Construct via :meth:`from_descriptor`.
+    """
+
+    def __init__(self, *args, **kwargs):
+        raise RuntimeError(
+            "MipmappedArray cannot be instantiated directly. "
+            "Use MipmappedArray.from_descriptor()."
+        )
+
+    @classmethod
+    def from_descriptor(
+        cls, *, shape, format, num_channels, num_levels, surface_load_store=False
+    ):
+        """Allocate a new mipmapped CUDA array.
+
+        Parameters
+        ----------
+        shape : tuple of int
+            ``(width,)``, ``(width, height)``, or ``(width, height, depth)``
+            in elements, for the base (level 0) mip.
+        format : ArrayFormat
+            Element format.
+        num_channels : int
+            Channels per element. Must be 1, 2, or 4.
+        num_levels : int
+            Number of mip levels to allocate; must be >= 1. The CUDA driver
+            documents clamping this to ``1 + floor(log2(max(shape)))``, so a
+            larger request may yield fewer levels (TODO confirm vs. an error).
+        surface_load_store : bool
+            If True, allocate with ``CUDA_ARRAY3D_SURFACE_LDST`` so individual
+            levels (obtained via :meth:`get_level`) can be bound as
+            :class:`SurfaceObject` for kernel-side writes. Default False.
+
+        Returns
+        -------
+        MipmappedArray
+        """
+        if not isinstance(format, ArrayFormat):
+            raise TypeError(f"format must be an ArrayFormat, got {type(format)}")
+        if num_channels not in (1, 2, 4):
+            raise ValueError(f"num_channels must be 1, 2, or 4, got {num_channels}")
+
+        try:
+            shape_t = tuple(int(s) for s in shape)
+        except TypeError as e:
+            raise TypeError(f"shape must be a tuple of ints, got {type(shape)}") from e
+        if not 1 <= len(shape_t) <= 3:
+            raise ValueError(f"shape rank must be 1, 2, or 3, got {len(shape_t)}")
+        for i, dim in enumerate(shape_t):
+            if dim < 1:
+                raise ValueError(f"shape[{i}] must be >= 1, got {dim}")
+
+        levels = int(num_levels)
+        if levels < 1:
+            raise ValueError(f"num_levels must be >= 1, got {levels}")
+
+        cdef MipmappedArray self = cls.__new__(cls)
+        self._owning = True
+        self._shape = shape_t
+        self._format = int(format)
+        self._num_channels = num_channels
+        self._num_levels = <unsigned int>levels
+        self._surface_load_store = bool(surface_load_store)
+        self._context = _get_current_context_ptr()
+        self._device_id = _get_current_device_id()
+
+        cdef cydriver.CUarray_format c_format = <cydriver.CUarray_format><int>format
+        cdef cydriver.CUDA_ARRAY3D_DESCRIPTOR desc3d
+        cdef int rank = len(shape_t)
+        cdef unsigned int flags = (
+            cydriver.CUDA_ARRAY3D_SURFACE_LDST if surface_load_store else 0
+        )
+        cdef unsigned int c_levels = <unsigned int>levels
+
+        # Mipmap creation uses the 3D descriptor regardless of rank; lower-rank
+        # shapes use Height=0/Depth=0 sentinels, matching cuArray3DCreate.
+        memset(&desc3d, 0, sizeof(desc3d))
+        desc3d.Width = <size_t>shape_t[0]
+        desc3d.Height = <size_t>(shape_t[1] if rank >= 2 else 0)
+        desc3d.Depth = <size_t>(shape_t[2] if rank >= 3 else 0)
+        desc3d.Format = c_format
+        desc3d.NumChannels = <unsigned int>num_channels
+        desc3d.Flags = flags
+        with nogil:
+            HANDLE_RETURN(
+                cydriver.cuMipmappedArrayCreate(&self._handle, &desc3d, c_levels)
+            )
+
+        return self
+
+    def get_level(self, level):
+        """Return a non-owning :class:`Array` view of the given mip level.
+
+        Parameters
+        ----------
+        level : int
+            Mip level index in ``[0, num_levels)``.
+
+        Returns
+        -------
+        Array
+            A non-owning :class:`Array` wrapping the level's ``CUarray``.
+            The :class:`MipmappedArray` is kept alive for the lifetime of the
+            returned :class:`Array`; the underlying storage is released only
+            when this :class:`MipmappedArray` is destroyed.
+        """
+        lvl = int(level)
+        if lvl < 0:
+            raise ValueError(f"level must be >= 0, got {lvl}")
+        if lvl >= <int>self._num_levels:
+            raise ValueError(
+                f"level ({lvl}) must be < num_levels ({self._num_levels})"
+            )
+
+        cdef cydriver.CUarray level_handle
+        cdef unsigned int c_level = <unsigned int>lvl
+        with nogil:
+            HANDLE_RETURN(
+                cydriver.cuMipmappedArrayGetLevel(&level_handle, self._handle, c_level)
+            )
+
+        # Wrap as a non-owning Array; the level's underlying CUarray belongs
+        # to this MipmappedArray and must not be destroyed independently.
+        arr = Array._from_handle(
+            <intptr_t>level_handle, False, device_id=self._device_id
+        )
+        # Strong ref back to the parent so the mipmap outlives the level view.
+        (<Array>arr)._parent_ref = self
+        return arr
+
+    @property
+    def handle(self):
+        """The underlying ``CUmipmappedArray`` as an integer."""
+        return <intptr_t>self._handle
+
+    @property
+    def shape(self):
+        """Base-level (level 0) allocation shape, in elements."""
+        return self._shape
+
+    @property
+    def format(self):
+        """The element :class:`ArrayFormat`."""
+        return ArrayFormat(self._format)
+
+    @property
+    def num_channels(self):
+        """Channels per element (1, 2, or 4)."""
+        return self._num_channels
+
+    @property
+    def num_levels(self):
+        """Number of mip levels."""
+        return int(self._num_levels)
+
+    @property
+    def surface_load_store(self):
+        """True if this mipmap was created with ``CUDA_ARRAY3D_SURFACE_LDST``,
+        so its levels (via :meth:`get_level`) can back a :class:`SurfaceObject`."""
+        return self._surface_load_store
+
+    @property
+    def device(self):
+        """The :class:`Device` this mipmap was allocated on."""
+        from cuda.core._device import Device
+        return Device(self._device_id)
+
+    cpdef close(self):
+        """Destroy the underlying ``CUmipmappedArray`` if owned.
+
+        After ``close()`` any level :class:`Array` returned by :meth:`get_level`
+        becomes invalid; callers must not access them.
+        """
+        if self._handle != NULL and self._owning:
+            HANDLE_RETURN(cydriver.cuMipmappedArrayDestroy(self._handle))
+        self._handle = NULL
+
+    def __dealloc__(self):
+        # Cython destructors cannot raise; any cuMipmappedArrayDestroy error
+        # here is silently dropped. Callers needing visibility should use
+        # close().
+        if self._handle != NULL and self._owning:
+            cydriver.cuMipmappedArrayDestroy(self._handle)
+            self._handle = NULL
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc, tb):
+        self.close()
+
+    def __repr__(self):
+        return (
+            f"MipmappedArray(shape={self._shape}, "
+            f"format={ArrayFormat(self._format).name}, "
+            f"num_channels={self._num_channels}, "
+            f"num_levels={self._num_levels})"
+        )
diff --git a/cuda_core/cuda/core/_texture.pyx b/cuda_core/cuda/core/_texture.pyx
index 6d06acb8826..8fcc5586e8d 100644
--- a/cuda_core/cuda/core/_texture.pyx
+++ b/cuda_core/cuda/core/_texture.pyx
@@ -11,6 +11,8 @@ from cuda.bindings cimport cydriver
 from cuda.core._array cimport Array
 from cuda.core._array import ArrayFormat, _FORMAT_ELEM_SIZE
 from cuda.core._memory._buffer cimport Buffer
+from cuda.core._mipmapped_array cimport MipmappedArray
+from cuda.core._mipmapped_array import MipmappedArray as _PyMipmappedArray
 from cuda.core._utils.cuda_utils cimport HANDLE_RETURN
 
 import enum
@@ -98,6 +100,31 @@ class ResourceDescriptor:
         self._pitch_bytes = None
         return self
 
+    @classmethod
+    def from_mipmapped_array(cls, mipmapped_array):
+        """Build a resource descriptor backed by a :class:`MipmappedArray`.
+
+        Suitable for binding to a :class:`TextureObject` for mipmapped
+        sampling. Not valid as a :class:`SurfaceObject` backing: surfaces
+        require a single :class:`Array` level (obtain via
+        :meth:`MipmappedArray.get_level`).
+        """
+        if not isinstance(mipmapped_array, _PyMipmappedArray):
+            raise TypeError(
+                f"mipmapped_array must be a MipmappedArray, got "
+                f"{type(mipmapped_array).__name__}"
+            )
+        self = cls.__new__(cls)
+        self._kind = "mipmapped_array"
+        self._source = mipmapped_array
+        self._format = None
+        self._num_channels = None
+        self._size_bytes = None
+        self._width = None
+        self._height = None
+        self._pitch_bytes = None
+        return self
+
     @classmethod
     def from_linear(cls, buffer, *, format, num_channels, size_bytes=None):
         """Build a resource descriptor for a linear (typed 1D) texture fetch.
@@ -377,12 +404,17 @@ cdef class TextureObject:
 
         # --- Resource descriptor ---
         cdef Array arr
+        cdef MipmappedArray mip
         cdef Buffer buf
         cdef intptr_t devptr
         if resource_desc.kind == "array":
             arr = <Array>resource_desc.source
             res_desc.resType = cydriver.CU_RESOURCE_TYPE_ARRAY
             res_desc.res.array.hArray = arr._handle
+        elif resource_desc.kind == "mipmapped_array":
+            mip = <MipmappedArray>resource_desc.source
+            res_desc.resType = cydriver.CU_RESOURCE_TYPE_MIPMAPPED_ARRAY
+            res_desc.res.mipmap.hMipmappedArray = mip._handle
         elif resource_desc.kind == "linear":
             buf = <Buffer>resource_desc.source
             devptr = int(buf.handle)
diff --git a/cuda_core/tests/test_texture_surface.py b/cuda_core/tests/test_texture_surface.py
index 89172b58f67..e9a3d3d6bb6 100644
--- a/cuda_core/tests/test_texture_surface.py
+++ b/cuda_core/tests/test_texture_surface.py
@@ -1,6 +1,8 @@
 # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
+import gc
+
 import pytest
 
 import cuda.core
@@ -10,6 +12,7 @@
     ArrayFormat,
     Device,
     FilterMode,
+    MipmappedArray,
     ReadMode,
     ResourceDescriptor,
     SurfaceObject,
@@ -348,3 +351,205 @@ def test_surface_rejects_linear_and_pitch2d(init_cuda):
             SurfaceObject.from_descriptor(res_p2)
     finally:
         buf.close()
+
+
+# --- MipmappedArray ----------------------------------------------------------
+
+def test_mipmapped_array_init_disabled():
+    with pytest.raises(
+        RuntimeError, match=r"^MipmappedArray cannot be instantiated directly"
+    ):
+        cuda.core._mipmapped_array.MipmappedArray()
+
+
+def test_mipmapped_array_from_descriptor_2d(init_cuda):
+    mip = MipmappedArray.from_descriptor(
+        shape=(64, 32),
+        format=ArrayFormat.FLOAT32,
+        num_channels=1,
+        num_levels=4,
+    )
+    try:
+        assert mip.shape == (64, 32)
+        assert mip.format == ArrayFormat.FLOAT32
+        assert mip.num_channels == 1
+        assert mip.num_levels == 4
+        assert mip.surface_load_store is False
+        assert mip.handle != 0
+        assert isinstance(mip.device, Device)
+    finally:
+        mip.close()
+
+
+def test_mipmapped_array_get_level_zero_matches_shape(init_cuda):
+    shape = (64, 32)
+    mip = MipmappedArray.from_descriptor(
+        shape=shape,
+        format=ArrayFormat.UINT8,
+        num_channels=4,
+        num_levels=4,
+    )
+    try:
+        lvl0 = mip.get_level(0)
+        try:
+            assert isinstance(lvl0, Array)
+            # Level 0 must match the base shape and rank.
+            assert lvl0.shape == shape
+            assert lvl0.format == ArrayFormat.UINT8
+            assert lvl0.num_channels == 4
+            assert lvl0.handle != 0
+        finally:
+            lvl0.close()
+    finally:
+        mip.close()
+
+
+def test_mipmapped_array_get_level_halves_dims(init_cuda):
+    shape = (64, 32)
+    num_levels = 4
+    mip = MipmappedArray.from_descriptor(
+        shape=shape,
+        format=ArrayFormat.UINT8,
+        num_channels=1,
+        num_levels=num_levels,
+    )
+    try:
+        for level in range(num_levels):
+            lvl = mip.get_level(level)
+            try:
+                # Each dim halves per level, with a floor of 1; rank is preserved.
+                expected = tuple(max(1, dim >> level) for dim in shape)
+                assert lvl.shape == expected, (
+                    f"level={level}: expected {expected}, got {lvl.shape}"
+                )
+            finally:
+                lvl.close()
+    finally:
+        mip.close()
+
+
+def test_mipmapped_array_get_level_out_of_range(init_cuda):
+    mip = MipmappedArray.from_descriptor(
+        shape=(16, 16),
+        format=ArrayFormat.UINT8,
+        num_channels=1,
+        num_levels=2,
+    )
+    try:
+        with pytest.raises(ValueError, match="num_levels"):
+            mip.get_level(mip.num_levels)
+        with pytest.raises(ValueError, match=">= 0"):
+            mip.get_level(-1)
+    finally:
+        mip.close()
+
+
+def test_mipmapped_array_rejects_zero_levels(init_cuda):
+    with pytest.raises(ValueError, match="num_levels"):
+        MipmappedArray.from_descriptor(
+            shape=(8, 8),
+            format=ArrayFormat.UINT8,
+            num_channels=1,
+            num_levels=0,
+        )
+
+
+def test_resource_descriptor_from_mipmapped_array(init_cuda):
+    mip = MipmappedArray.from_descriptor(
+        shape=(32, 16),
+        format=ArrayFormat.FLOAT32,
+        num_channels=1,
+        num_levels=3,
+    )
+    try:
+        res = ResourceDescriptor.from_mipmapped_array(mip)
+        assert res.kind == "mipmapped_array"
+        assert res.source is mip
+    finally:
+        mip.close()
+
+
+def test_resource_descriptor_from_mipmapped_array_rejects_non_mipmap():
+    with pytest.raises(TypeError, match="MipmappedArray"):
+        ResourceDescriptor.from_mipmapped_array(object())
+
+
+def test_texture_object_from_mipmapped_array(init_cuda):
+    mip = MipmappedArray.from_descriptor(
+        shape=(32, 32),
+        format=ArrayFormat.FLOAT32,
+        num_channels=1,
+        num_levels=3,
+    )
+    try:
+        res = ResourceDescriptor.from_mipmapped_array(mip)
+        # Use non-default mipmap params so the driver exercises that path.
+        tex_desc = TextureDescriptor(
+            address_mode=AddressMode.CLAMP,
+            filter_mode=FilterMode.LINEAR,
+            normalized_coords=True,
+            mipmap_filter_mode=FilterMode.LINEAR,
+            mipmap_level_bias=0.0,
+            min_mipmap_level_clamp=0.0,
+            max_mipmap_level_clamp=float(mip.num_levels - 1),
+        )
+        tex = TextureObject.from_descriptor(res, tex_desc)
+        try:
+            assert tex.handle != 0
+            assert tex.resource is res
+        finally:
+            tex.close()
+    finally:
+        mip.close()
+
+
+def test_surface_rejects_mipmapped_array(init_cuda):
+    mip = MipmappedArray.from_descriptor(
+        shape=(16, 16),
+        format=ArrayFormat.UINT8,
+        num_channels=4,
+        num_levels=2,
+        surface_load_store=True,
+    )
+    try:
+        res = ResourceDescriptor.from_mipmapped_array(mip)
+        with pytest.raises(ValueError, match="array-backed"):
+            SurfaceObject.from_descriptor(res)
+    finally:
+        mip.close()
+
+
+def test_mipmapped_array_level_keeps_parent_alive(init_cuda):
+    """Dropping the local parent reference must not invalidate the level Array;
+    the level holds an internal strong ref back to the MipmappedArray.
+
+    cdef classes don't natively support weakref, so we verify the parent
+    reference by inspecting the level Array's gc referents.
+    """
+    mip = MipmappedArray.from_descriptor(
+        shape=(16, 16),
+        format=ArrayFormat.UINT8,
+        num_channels=1,
+        num_levels=3,
+    )
+    parent_id = id(mip)
+    lvl = mip.get_level(1)
+    # Drop our local reference and force GC; the parent must survive because
+    # the level Array holds a strong ref via the internal _parent_ref slot.
+    del mip
+    gc.collect()
+
+    # The handle is still valid storage; the level still tracks the parent.
+    assert lvl.handle != 0
+    referents = gc.get_referents(lvl)
+    parents = [r for r in referents if isinstance(r, MipmappedArray)]
+    assert len(parents) == 1, (
+        f"level Array should reference exactly one MipmappedArray parent, got "
+        f"{parents!r}"
+    )
+    assert id(parents[0]) == parent_id, (
+        "level Array's parent ref is not the original MipmappedArray"
+    )
+    # Closing the level drops its parent ref; once `referents`/`parents` are also
+    # released, nothing pins the MipmappedArray and cuMipmappedArrayDestroy may run.
+    lvl.close()

From 229465e566484a08c961a8c5035a3da8bd494b4b Mon Sep 17 00:00:00 2001
From: Rob Parolin <rparolin@nvidia.com>
Date: Wed, 13 May 2026 17:38:38 -0700
Subject: [PATCH 08/17] Add texture sampling example (refs #467)

End-to-end example that builds a 2D Array with a known pattern, binds it as
a bindless TextureObject with LINEAR/CLAMP/non-normalized sampling, and
launches a kernel that samples both texel-center coordinates (i + 0.5) and
coordinates halfway between texel centers. Verifies exact texel values at
the centers and analytical bilinear blends at the in-between positions.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 cuda_core/examples/texture_sample.py | 214 +++++++++++++++++++++++++++
 1 file changed, 214 insertions(+)
 create mode 100644 cuda_core/examples/texture_sample.py

diff --git a/cuda_core/examples/texture_sample.py b/cuda_core/examples/texture_sample.py
new file mode 100644
index 00000000000..68e4a964f1c
--- /dev/null
+++ b/cuda_core/examples/texture_sample.py
@@ -0,0 +1,214 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# ################################################################################
+#
+# This example demonstrates building a 2D CUDA Array, binding it as a
+# bindless TextureObject, and sampling it from a kernel with both POINT-exact
+# and LINEAR-interpolated coordinates.
+#
+# Texture coordinate convention (non-normalized): each texel (i, j) is centered
+# at (i + 0.5, j + 0.5). So tex2D(tex, 0.5, 0.5) returns texel (0, 0) exactly,
+# while tex2D(tex, 1.0, 0.5) returns the linear blend of texels (0, 0) and (1, 0).
+# All test coordinates below are chosen with that half-pixel offset in mind.
+#
+# ################################################################################
+
+# /// script
+# dependencies = ["cuda_bindings", "cuda_core", "nvidia-cuda-nvrtc"]
+# ///
+
+import numpy as np
+
+from cuda.core import (
+    AddressMode,
+    Array,
+    ArrayFormat,
+    Device,
+    FilterMode,
+    LaunchConfig,
+    LegacyPinnedMemoryResource,
+    Program,
+    ProgramOptions,
+    ReadMode,
+    ResourceDescriptor,
+    TextureDescriptor,
+    TextureObject,
+    launch,
+)
+
+# Kernel reads N (x, y) coordinates from `coords` (interleaved float pairs) and
+# writes tex2D<float>(tex, x, y) to out[i]. Compiled as C++ so the templated
+# tex2D<float> overload resolves.
+code = r"""
+extern "C" __global__
+void sample_texture(cudaTextureObject_t tex,
+                    float *out,
+                    const float *coords,
+                    int n) {
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i >= n) return;
+    float x = coords[2 * i + 0];
+    float y = coords[2 * i + 1];
+    out[i] = tex2D<float>(tex, x, y);
+}
+"""
+
+
+def main():
+    dev = Device()
+    dev.set_current()
+    stream = dev.create_stream()
+
+    arr = None
+    tex = None
+    coords_buf = None
+    out_buf = None
+    pinned_mr = LegacyPinnedMemoryResource()
+    try:
+        # Allocate a 2D Array: shape=(W, H), single-channel float32.
+        # Note: Array.from_descriptor takes shape=(width, height), so the host
+        # buffer fed into copy_from must be laid out as H rows of W elements
+        # (row-major), i.e. host_pattern.shape == (H, W).
+        width, height = 16, 16
+        arr = Array.from_descriptor(
+            shape=(width, height),
+            format=ArrayFormat.FLOAT32,
+            num_channels=1,
+        )
+
+        # Plant a known pattern: pattern[y, x] = x + 100*y.
+        # Cast to float32 so the byte count matches the array's storage.
+        ys, xs = np.meshgrid(
+            np.arange(height, dtype=np.float32),
+            np.arange(width, dtype=np.float32),
+            indexing="ij",
+        )
+        pattern = (xs + 100.0 * ys).astype(np.float32)
+        assert pattern.shape == (height, width)
+        arr.copy_from(pattern, stream=stream)
+
+        # Build a linear-filtering, clamped, non-normalized texture.
+        res_desc = ResourceDescriptor.from_array(arr)
+        tex_desc = TextureDescriptor(
+            address_mode=AddressMode.CLAMP,
+            filter_mode=FilterMode.LINEAR,
+            read_mode=ReadMode.ELEMENT_TYPE,
+            normalized_coords=False,
+        )
+        tex = TextureObject.from_descriptor(res_desc, tex_desc)
+
+        # Build the test coordinate list:
+        # - Texel-center samples (i + 0.5, j + 0.5) should return the exact
+        #   planted value.
+        # - "Half" samples sit halfway between texel centers and exercise
+        #   LINEAR filtering -- they should equal the blend of their neighbors.
+        center_samples = [
+            (0.5, 0.5),  # -> pattern[0, 0] = 0
+            (3.5, 0.5),  # -> pattern[0, 3] = 3
+            (0.5, 4.5),  # -> pattern[4, 0] = 400
+            (7.5, 9.5),  # -> pattern[9, 7] = 907
+            (15.5, 15.5),  # -> pattern[15, 15] = 1515
+        ]
+        half_samples = [
+            # (1.0, 0.5): blend of texels (0, 0) and (1, 0) -> 0.5
+            (1.0, 0.5),
+            # (0.5, 1.0): blend of texels (0, 0) and (0, 1) -> 50.0
+            (0.5, 1.0),
+            # (1.0, 1.0): blend of the 2x2 block at (0..1, 0..1) -> 50.5
+            (1.0, 1.0),
+            # (4.0, 5.0): blend of the 2x2 block at (3..4, 4..5) -> 453.5
+            (4.0, 5.0),
+        ]
+        coords = np.array(center_samples + half_samples, dtype=np.float32)
+        n = coords.shape[0]
+        coords_flat = coords.reshape(-1)
+        coords_nbytes = int(coords_flat.nbytes)
+        out_nbytes = n * np.dtype(np.float32).itemsize
+
+        # Use pinned host memory for inputs and outputs. Pinned allocations are
+        # GPU-accessible (zero-copy), so the kernel can read coords directly
+        # and we can read results without a separate device->host copy.
+        coords_buf = pinned_mr.allocate(coords_nbytes)
+        out_buf = pinned_mr.allocate(out_nbytes)
+        coords_view = np.from_dlpack(coords_buf).view(dtype=np.float32)
+        out_view = np.from_dlpack(out_buf).view(dtype=np.float32)
+        coords_view[:] = coords_flat
+        out_view[:] = 0.0
+
+        # Compile the kernel as C++ (templated tex2D<float> requires this).
+        program_options = ProgramOptions(std="c++17", arch=f"sm_{dev.arch}")
+        prog = Program(code, code_type="c++", options=program_options)
+        mod = prog.compile("cubin", name_expressions=("sample_texture",))
+        kernel = mod.get_kernel("sample_texture")
+
+        block = 64
+        grid = (n + block - 1) // block
+        config = LaunchConfig(grid=grid, block=block)
+        # cudaTextureObject_t is a 64-bit handle; pass it as uint64 to be
+        # unambiguous (a bare Python int would also work since intptr_t is
+        # 8 bytes on 64-bit platforms).
+        launch(
+            stream,
+            config,
+            kernel,
+            np.uint64(tex.handle),
+            out_buf,
+            coords_buf,
+            np.int32(n),
+        )
+        stream.sync()
+        results = np.asarray(out_view)
+
+        # Verify texel-center samples (POINT-exact regardless of filter mode).
+        n_center = len(center_samples)
+        for i, (x, y) in enumerate(center_samples):
+            expected = (x - 0.5) + 100.0 * (y - 0.5)
+            got = float(results[i])
+            assert np.isclose(got, expected, atol=1e-4), (
+                f"center sample {i} at ({x}, {y}): expected {expected}, got {got}"
+            )
+
+        # Verify half-integer samples against the analytic mean of the 4
+        # surrounding texels. Allow a small tolerance for the 1/256 fixed-point
+        # weight quantization that hardware filtering performs.
+        for j, (x, y) in enumerate(half_samples):
+            idx = n_center + j
+            # Surrounding integer texel coordinates: (xi, yi), (xi+1, yi),
+            # (xi, yi+1), (xi+1, yi+1). With x = xi + 1, y = yi + 1 (e.g.
+            # (1.0, 1.0)) the four neighbors are (0,0)..(1,1).
+            xi = int(np.floor(x - 0.5))
+            yi = int(np.floor(y - 0.5))
+            tx = (x - 0.5) - xi
+            ty = (y - 0.5) - yi
+            corners = []
+            for dy in (0, 1):
+                for dx in (0, 1):
+                    xv = min(max(xi + dx, 0), width - 1)
+                    yv = min(max(yi + dy, 0), height - 1)
+                    corners.append(pattern[yv, xv])
+            v00, v10, v01, v11 = corners
+            expected = (1 - tx) * (1 - ty) * v00 + tx * (1 - ty) * v10 + (1 - tx) * ty * v01 + tx * ty * v11
+            got = float(results[idx])
+            assert np.isclose(got, expected, atol=1e-2), (
+                f"half sample {j} at ({x}, {y}): expected {expected}, got {got}"
+            )
+
+        print("Texture sampling example completed successfully.")
+        print(f"  texel-center samples verified: {n_center}")
+        print(f"  half-integer samples verified: {len(half_samples)}")
+    finally:
+        if tex is not None:
+            tex.close()
+        if arr is not None:
+            arr.close()
+        if coords_buf is not None:
+            coords_buf.close()
+        if out_buf is not None:
+            out_buf.close()
+        stream.close()
+
+
+if __name__ == "__main__":
+    main()

From aec7fed2ed4856ca956c2ba2659f6cd89c4d2df2 Mon Sep 17 00:00:00 2001
From: Rob Parolin <rparolin@nvidia.com>
Date: Fri, 15 May 2026 11:59:25 -0700
Subject: [PATCH 09/17] Address code review feedback on texture/surface stack
 (refs #467)

Safety and correctness:
- Validate buffer sizes against array extent in Array.copy_from/copy_to;
  an undersized host buffer or device Buffer previously caused a silent
  out-of-bounds access in cuMemcpy3DAsync. Both branches now raise
  ValueError before issuing the copy.
- Zero the underlying handle BEFORE calling cuXxxDestroy in close() for
  Array, MipmappedArray, TextureObject, SurfaceObject. Prevents a
  double-destroy via __dealloc__ if the driver call raises.
- ResourceDescriptor.from_linear: require size_bytes >= element_size and
  size_bytes % element_size == 0; previously accepted zero and arbitrary
  non-multiples.
- Reject bool in num_channels across Array, MipmappedArray, and the two
  Buffer-backed ResourceDescriptor factories (True was silently treated
  as 1 channel).

API polish:
- Rename TextureObject.from_descriptor params resource_desc/texture_desc
  to resource/texture_descriptor so they match the .resource and
  .texture_descriptor properties; same rename in SurfaceObject. Both
  factories are now keyword-only, consistent with Array.from_descriptor
  and MipmappedArray.from_descriptor.
- Add four ResourceDescriptor properties (size_bytes, width, height,
  pitch_bytes) so values shown in __repr__ are reachable programmatically.
- Add MipmappedArray to docs/source/api.rst (was exported but unlinked).
- Align error message style across new files: type(x).__name__ instead of
  type(x); include got <type> in three previously-bare TypeErrors in
  TextureObject.from_descriptor.

Refactor:
- Extract _get_current_context_ptr and _get_current_device_id to
  cuda_utils.{pxd,pyx} and share across all four new files (was
  duplicated four times). Generic error message keeps the helper
  reusable for the 9+ remaining duplicate sites in cuda.core.
- Hoist the buffer-protocol path in _fill_linear_endpoint into a new
  _fill_host_endpoint helper. Original function becomes a thin
  Buffer-vs-host router.
- Type Array._format and MipmappedArray._format as cydriver.CUarray_format
  instead of int (the enum type was previously noted only in a comment; it
  is now checked at the C level).
- Drop unused `field` import from _texture.pyx.

Tests (+28, total 62 in test_texture_surface.py):
- Undersized host/device buffer rejection in Array.copy_from/copy_to.
- ResourceDescriptor.from_linear rejects size_bytes=0 and non-multiples.
- _normalize_address_modes unit tests now make explicit assertions
  instead of only smoke-testing TextureObject creation.
- Negative-path coverage for Array.from_descriptor (bad format, non-
  iterable shape, zero dim), MipmappedArray.from_descriptor, all
  TextureObject.from_descriptor validation branches (filter_mode,
  read_mode, mipmap_filter_mode, max_anisotropy, border_color length),
  address-mode normalization (scalar non-AddressMode, empty/4-entry
  tuples, mixed-type entries), ResourceDescriptor.from_pitch2d, and
  copy_from/copy_to non-Stream rejection.
- TextureObject and SurfaceObject keepalive lifetime tests verifying
  the _source_ref chain holds after gc.collect() (mirrors the existing
  MipmappedArray level keepalive test).
- copy_from must not mutate the source buffer (round-trip test now
  also asserts list(src) is unchanged).

Example:
- texture_sample.py uses `with` blocks for Array and TextureObject so
  the user-facing demo shows the idiomatic context-manager pattern
  rather than manual try/finally.

Full cuda_core suite: 3326 passed, 199 skipped, 2 xfailed.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 cuda_core/cuda/core/_array.pxd            |   2 +-
 cuda_core/cuda/core/_array.pyx            | 126 ++++---
 cuda_core/cuda/core/_mipmapped_array.pxd  |   2 +-
 cuda_core/cuda/core/_mipmapped_array.pyx  |  38 +-
 cuda_core/cuda/core/_surface.pyx          |  47 +--
 cuda_core/cuda/core/_texture.pyx          | 178 +++++----
 cuda_core/cuda/core/_utils/cuda_utils.pxd |   8 +-
 cuda_core/cuda/core/_utils/cuda_utils.pyx |  21 ++
 cuda_core/docs/source/api.rst             |   6 +-
 cuda_core/examples/texture_sample.py      |  64 ++--
 cuda_core/tests/test_texture_surface.py   | 433 +++++++++++++++++++++-
 11 files changed, 698 insertions(+), 227 deletions(-)

diff --git a/cuda_core/cuda/core/_array.pxd b/cuda_core/cuda/core/_array.pxd
index 49ae1075d8b..73529cac48e 100644
--- a/cuda_core/cuda/core/_array.pxd
+++ b/cuda_core/cuda/core/_array.pxd
@@ -11,7 +11,7 @@ cdef class Array:
     cdef:
         cydriver.CUarray _handle
         tuple _shape                 # (w,), (w, h), or (w, h, d)
-        int _format                  # CUarray_format value
+        cydriver.CUarray_format _format
         unsigned int _num_channels   # 1, 2, or 4
         int _device_id
         intptr_t _context
diff --git a/cuda_core/cuda/core/_array.pyx b/cuda_core/cuda/core/_array.pyx
index 37c5439ddb3..7d02dcd5d21 100644
--- a/cuda_core/cuda/core/_array.pyx
+++ b/cuda_core/cuda/core/_array.pyx
@@ -11,7 +11,11 @@ from libc.string cimport memset
 from cuda.bindings cimport cydriver
 from cuda.core._memory._buffer cimport Buffer
 from cuda.core._stream cimport Stream
-from cuda.core._utils.cuda_utils cimport HANDLE_RETURN
+from cuda.core._utils.cuda_utils cimport (
+    HANDLE_RETURN,
+    _get_current_context_ptr,
+    _get_current_device_id,
+)
 
 import enum
 
@@ -44,22 +48,6 @@ _FORMAT_ELEM_SIZE = {
 }
 
 
-cdef inline intptr_t _get_current_context_ptr() except? 0:
-    cdef cydriver.CUcontext ctx
-    with nogil:
-        HANDLE_RETURN(cydriver.cuCtxGetCurrent(&ctx))
-    if ctx == NULL:
-        raise RuntimeError("Array allocation requires an active CUDA context")
-    return <intptr_t>ctx
-
-
-cdef inline int _get_current_device_id() except -1:
-    cdef cydriver.CUdevice dev
-    with nogil:
-        HANDLE_RETURN(cydriver.cuCtxGetDevice(&dev))
-    return <int>dev
-
-
 cdef void _fill_array_endpoint(
     cydriver.CUDA_MEMCPY3D* p, Array arr, bint is_src
 ) noexcept:
@@ -78,40 +66,20 @@ cdef void _fill_array_endpoint(
         p.dstZ = 0
 
 
-cdef int _fill_linear_endpoint(
+cdef int _fill_host_endpoint(
     cydriver.CUDA_MEMCPY3D* p,
     object obj,
     bint is_src,
     size_t width_bytes,
     size_t height,
+    size_t required,
     cpython.Py_buffer* pybuf_out,
 ) except -1:
-    """Populate the src or dst linear fields. Returns 1 if pybuf_out was
-    filled (caller must release it), 0 otherwise.
-    """
-    cdef intptr_t ptr
-    cdef int got_buffer = 0
-    if isinstance(obj, Buffer):
-        ptr = int((<Buffer>obj).handle)
-        if is_src:
-            p.srcMemoryType = cydriver.CU_MEMORYTYPE_DEVICE
-            p.srcDevice = <cydriver.CUdeviceptr>ptr
-            p.srcPitch = width_bytes
-            p.srcHeight = height
-            p.srcXInBytes = 0
-            p.srcY = 0
-            p.srcZ = 0
-        else:
-            p.dstMemoryType = cydriver.CU_MEMORYTYPE_DEVICE
-            p.dstDevice = <cydriver.CUdeviceptr>ptr
-            p.dstPitch = width_bytes
-            p.dstHeight = height
-            p.dstXInBytes = 0
-            p.dstY = 0
-            p.dstZ = 0
-        return 0
+    """Populate src/dst host fields from a buffer-protocol ``obj``.
 
-    # Treat anything else as a host buffer via the Python buffer protocol.
+    Acquires a Py_buffer view; the caller is responsible for releasing it
+    (this function always returns with the view held when it returns 1).
+    """
     cdef int flags = cpython.PyBUF_SIMPLE
     if not is_src:
         flags |= cpython.PyBUF_WRITABLE
@@ -120,7 +88,12 @@ cdef int _fill_linear_endpoint(
             f"Source/destination must be a Buffer or a contiguous "
             f"buffer-protocol object, got {type(obj).__name__}"
         )
-    got_buffer = 1
+    if <size_t>pybuf_out.len < required:
+        # Build the message first: the view must not be read once released.
+        msg = (f"Host buffer has {pybuf_out.len} bytes, smaller than the array "
+               f"extent ({required} bytes)")
+        cpython.PyBuffer_Release(pybuf_out)
+        raise ValueError(msg)
     if is_src:
         p.srcMemoryType = cydriver.CU_MEMORYTYPE_HOST
         p.srcHost = pybuf_out.buf
@@ -140,6 +113,49 @@ cdef int _fill_linear_endpoint(
     return 1
 
 
+cdef int _fill_linear_endpoint(
+    cydriver.CUDA_MEMCPY3D* p,
+    object obj,
+    bint is_src,
+    size_t width_bytes,
+    size_t height,
+    size_t depth,
+    cpython.Py_buffer* pybuf_out,
+) except -1:
+    """Populate the src or dst linear fields; ValueError if obj is undersized.
+    Returns 1 if pybuf_out was filled (caller must release it), 0 otherwise.
+    """
+    cdef intptr_t ptr
+    cdef size_t required = width_bytes * height * depth  # minimum bytes obj must provide
+    if isinstance(obj, Buffer):
+        if <size_t>(<Buffer>obj).size < required:
+            raise ValueError(
+                f"Buffer size ({(<Buffer>obj).size} bytes) is smaller than "
+                f"the array extent ({required} bytes)"
+            )
+        ptr = int((<Buffer>obj).handle)
+        if is_src:
+            p.srcMemoryType = cydriver.CU_MEMORYTYPE_DEVICE
+            p.srcDevice = <cydriver.CUdeviceptr>ptr
+            p.srcPitch = width_bytes
+            p.srcHeight = height
+            p.srcXInBytes = 0
+            p.srcY = 0
+            p.srcZ = 0
+        else:
+            p.dstMemoryType = cydriver.CU_MEMORYTYPE_DEVICE
+            p.dstDevice = <cydriver.CUdeviceptr>ptr
+            p.dstPitch = width_bytes
+            p.dstHeight = height
+            p.dstXInBytes = 0
+            p.dstY = 0
+            p.dstZ = 0
+        return 0  # Buffer path never acquires a Py_buffer view
+    return _fill_host_endpoint(
+        p, obj, is_src, width_bytes, height, required, pybuf_out
+    )
+
+
 cdef _copy3d(Array arr, object other, object stream, bint to_array):
     """Issue a full-array async 3D memcpy between ``arr`` and ``other``.
 
@@ -164,13 +180,13 @@ cdef _copy3d(Array arr, object other, object stream, bint to_array):
     try:
         if to_array:
             got_buffer = _fill_linear_endpoint(
-                &params, other, True, width_bytes, height, &pybuf
+                &params, other, True, width_bytes, height, depth, &pybuf
             )
             _fill_array_endpoint(&params, arr, False)
         else:
             _fill_array_endpoint(&params, arr, True)
             got_buffer = _fill_linear_endpoint(
-                &params, other, False, width_bytes, height, &pybuf
+                &params, other, False, width_bytes, height, depth, &pybuf
             )
 
         stream_handle = int((<Stream>stream).handle)
@@ -223,14 +239,14 @@ cdef class Array:
         Array
         """
         if not isinstance(format, ArrayFormat):
-            raise TypeError(f"format must be an ArrayFormat, got {type(format)}")
-        if num_channels not in (1, 2, 4):
-            raise ValueError(f"num_channels must be 1, 2, or 4, got {num_channels}")
+            raise TypeError(f"format must be an ArrayFormat, got {type(format).__name__}")
+        if isinstance(num_channels, bool) or num_channels not in (1, 2, 4):
+            raise ValueError(f"num_channels must be 1, 2, or 4, got {num_channels!r}")
 
         try:
             shape_t = tuple(int(s) for s in shape)
         except TypeError as e:
-            raise TypeError(f"shape must be a tuple of ints, got {type(shape)}") from e
+            raise TypeError(f"shape must be a tuple of ints, got {type(shape).__name__}") from e
         if not 1 <= len(shape_t) <= 3:
             raise ValueError(f"shape rank must be 1, 2, or 3, got {len(shape_t)}")
         for i, dim in enumerate(shape_t):
@@ -240,7 +256,7 @@ cdef class Array:
         cdef Array self = cls.__new__(cls)
         self._owning = True
         self._shape = shape_t
-        self._format = int(format)
+        self._format = <cydriver.CUarray_format><int>format
         self._num_channels = num_channels
         self._surface_load_store = bool(surface_load_store)
         self._context = _get_current_context_ptr()
@@ -304,7 +320,7 @@ cdef class Array:
             self._shape = (int(desc.Width), int(desc.Height))
         else:
             self._shape = (int(desc.Width),)
-        self._format = <int>desc.Format
+        self._format = desc.Format
         self._num_channels = desc.NumChannels
         self._surface_load_store = bool(desc.Flags & cydriver.CUDA_ARRAY3D_SURFACE_LDST)
         return self
@@ -393,12 +409,14 @@ cdef class Array:
 
     cpdef close(self):
         """Destroy the underlying ``CUarray`` if owned by this object."""
-        if self._handle != NULL and self._owning:
-            HANDLE_RETURN(cydriver.cuArrayDestroy(self._handle))
+        cdef cydriver.CUarray h = self._handle
+        cdef bint owning = self._owning
         self._handle = NULL
         # Drop the parent reference (if any) so a non-owning level Array
         # stops pinning its MipmappedArray after close().
         self._parent_ref = None
+        if h != NULL and owning:
+            HANDLE_RETURN(cydriver.cuArrayDestroy(h))
 
     def __dealloc__(self):
         # Cython destructors cannot raise; any cuArrayDestroy error here is
diff --git a/cuda_core/cuda/core/_mipmapped_array.pxd b/cuda_core/cuda/core/_mipmapped_array.pxd
index 52aa0dc863e..52afc1968cc 100644
--- a/cuda_core/cuda/core/_mipmapped_array.pxd
+++ b/cuda_core/cuda/core/_mipmapped_array.pxd
@@ -11,7 +11,7 @@ cdef class MipmappedArray:
     cdef:
         cydriver.CUmipmappedArray _handle
         tuple _shape                 # (w,), (w, h), or (w, h, d)
-        int _format                  # CUarray_format value
+        cydriver.CUarray_format _format
         unsigned int _num_channels   # 1, 2, or 4
         unsigned int _num_levels
         int _device_id
diff --git a/cuda_core/cuda/core/_mipmapped_array.pyx b/cuda_core/cuda/core/_mipmapped_array.pyx
index a4cfd40bb80..c149d907f62 100644
--- a/cuda_core/cuda/core/_mipmapped_array.pyx
+++ b/cuda_core/cuda/core/_mipmapped_array.pyx
@@ -10,23 +10,11 @@ from libc.string cimport memset
 from cuda.bindings cimport cydriver
 from cuda.core._array cimport Array
 from cuda.core._array import ArrayFormat
-from cuda.core._utils.cuda_utils cimport HANDLE_RETURN
-
-
-cdef inline intptr_t _get_current_context_ptr() except? 0:
-    cdef cydriver.CUcontext ctx
-    with nogil:
-        HANDLE_RETURN(cydriver.cuCtxGetCurrent(&ctx))
-    if ctx == NULL:
-        raise RuntimeError("MipmappedArray allocation requires an active CUDA context")
-    return <intptr_t>ctx
-
-
-cdef inline int _get_current_device_id() except -1:
-    cdef cydriver.CUdevice dev
-    with nogil:
-        HANDLE_RETURN(cydriver.cuCtxGetDevice(&dev))
-    return <int>dev
+from cuda.core._utils.cuda_utils cimport (
+    HANDLE_RETURN,
+    _get_current_context_ptr,
+    _get_current_device_id,
+)
 
 
 cdef class MipmappedArray:
@@ -77,14 +65,14 @@ cdef class MipmappedArray:
         MipmappedArray
         """
         if not isinstance(format, ArrayFormat):
-            raise TypeError(f"format must be an ArrayFormat, got {type(format)}")
-        if num_channels not in (1, 2, 4):
-            raise ValueError(f"num_channels must be 1, 2, or 4, got {num_channels}")
+            raise TypeError(f"format must be an ArrayFormat, got {type(format).__name__}")
+        if isinstance(num_channels, bool) or num_channels not in (1, 2, 4):
+            raise ValueError(f"num_channels must be 1, 2, or 4, got {num_channels!r}")
 
         try:
             shape_t = tuple(int(s) for s in shape)
         except TypeError as e:
-            raise TypeError(f"shape must be a tuple of ints, got {type(shape)}") from e
+            raise TypeError(f"shape must be a tuple of ints, got {type(shape).__name__}") from e
         if not 1 <= len(shape_t) <= 3:
             raise ValueError(f"shape rank must be 1, 2, or 3, got {len(shape_t)}")
         for i, dim in enumerate(shape_t):
@@ -98,7 +86,7 @@ cdef class MipmappedArray:
         cdef MipmappedArray self = cls.__new__(cls)
         self._owning = True
         self._shape = shape_t
-        self._format = int(format)
+        self._format = <cydriver.CUarray_format><int>format
         self._num_channels = num_channels
         self._num_levels = <unsigned int>levels
         self._surface_load_store = bool(surface_load_store)
@@ -212,9 +200,11 @@ cdef class MipmappedArray:
         After ``close()`` any level :class:`Array` returned by :meth:`get_level`
         becomes invalid; callers must not access them.
         """
-        if self._handle != NULL and self._owning:
-            HANDLE_RETURN(cydriver.cuMipmappedArrayDestroy(self._handle))
+        cdef cydriver.CUmipmappedArray h = self._handle
+        cdef bint owning = self._owning
         self._handle = NULL
+        if h != NULL and owning:
+            HANDLE_RETURN(cydriver.cuMipmappedArrayDestroy(h))
 
     def __dealloc__(self):
         # Cython destructors cannot raise; any cuMipmappedArrayDestroy error
diff --git a/cuda_core/cuda/core/_surface.pyx b/cuda_core/cuda/core/_surface.pyx
index 46213eee17d..62cdecc9a01 100644
--- a/cuda_core/cuda/core/_surface.pyx
+++ b/cuda_core/cuda/core/_surface.pyx
@@ -10,23 +10,11 @@ from libc.string cimport memset
 from cuda.bindings cimport cydriver
 from cuda.core._array cimport Array
 from cuda.core._texture import ResourceDescriptor
-from cuda.core._utils.cuda_utils cimport HANDLE_RETURN
-
-
-cdef inline intptr_t _get_current_context_ptr() except? 0:
-    cdef cydriver.CUcontext ctx
-    with nogil:
-        HANDLE_RETURN(cydriver.cuCtxGetCurrent(&ctx))
-    if ctx == NULL:
-        raise RuntimeError("SurfaceObject requires an active CUDA context")
-    return <intptr_t>ctx
-
-
-cdef inline int _get_current_device_id() except -1:
-    cdef cydriver.CUdevice dev
-    with nogil:
-        HANDLE_RETURN(cydriver.cuCtxGetDevice(&dev))
-    return <int>dev
+from cuda.core._utils.cuda_utils cimport (
+    HANDLE_RETURN,
+    _get_current_context_ptr,
+    _get_current_device_id,
+)
 
 
 cdef class SurfaceObject:
@@ -58,31 +46,31 @@ cdef class SurfaceObject:
         """
         if not isinstance(array, Array):
             raise TypeError(f"array must be an Array, got {type(array).__name__}")
-        return cls.from_descriptor(ResourceDescriptor.from_array(array))
+        return cls.from_descriptor(resource=ResourceDescriptor.from_array(array))
 
     @classmethod
-    def from_descriptor(cls, resource_desc):
+    def from_descriptor(cls, *, resource):
         """Create a surface object from a :class:`ResourceDescriptor`.
 
         Parameters
         ----------
-        resource_desc : ResourceDescriptor
+        resource : ResourceDescriptor
             Must wrap an :class:`Array` allocated with
             ``surface_load_store=True``. Linear/pitch2d resources are not
             valid surface backings.
         """
-        if not isinstance(resource_desc, ResourceDescriptor):
+        if not isinstance(resource, ResourceDescriptor):
             raise TypeError(
-                f"resource_desc must be a ResourceDescriptor, got "
-                f"{type(resource_desc).__name__}"
+                f"resource must be a ResourceDescriptor, got "
+                f"{type(resource).__name__}"
             )
-        if resource_desc.kind != "array":
+        if resource.kind != "array":
             raise ValueError(
                 f"SurfaceObject requires an array-backed ResourceDescriptor, "
-                f"got kind={resource_desc.kind!r}"
+                f"got kind={resource.kind!r}"
             )
 
-        cdef Array arr = <Array>resource_desc.source
+        cdef Array arr = <Array>resource.source
         if not arr.surface_load_store:
             raise ValueError(
                 "Array must be created with surface_load_store=True to be "
@@ -95,7 +83,7 @@ cdef class SurfaceObject:
         res_desc.res.array.hArray = arr._handle
 
         cdef SurfaceObject self = cls.__new__(cls)
-        self._source_ref = resource_desc
+        self._source_ref = resource
         self._context = _get_current_context_ptr()
         self._device_id = _get_current_device_id()
 
@@ -122,10 +110,11 @@ cdef class SurfaceObject:
 
     cpdef close(self):
         """Destroy the underlying ``CUsurfObject``."""
-        if self._handle != 0:
-            HANDLE_RETURN(cydriver.cuSurfObjectDestroy(self._handle))
+        cdef cydriver.CUsurfObject h = self._handle
         self._handle = 0
         self._source_ref = None
+        if h != 0:
+            HANDLE_RETURN(cydriver.cuSurfObjectDestroy(h))
 
     def __dealloc__(self):
         # Cython destructors cannot raise; any cuSurfObjectDestroy error is
diff --git a/cuda_core/cuda/core/_texture.pyx b/cuda_core/cuda/core/_texture.pyx
index 8fcc5586e8d..6ccffcadbb1 100644
--- a/cuda_core/cuda/core/_texture.pyx
+++ b/cuda_core/cuda/core/_texture.pyx
@@ -13,10 +13,14 @@ from cuda.core._array import ArrayFormat, _FORMAT_ELEM_SIZE
 from cuda.core._memory._buffer cimport Buffer
 from cuda.core._mipmapped_array cimport MipmappedArray
 from cuda.core._mipmapped_array import MipmappedArray as _PyMipmappedArray
-from cuda.core._utils.cuda_utils cimport HANDLE_RETURN
+from cuda.core._utils.cuda_utils cimport (
+    HANDLE_RETURN,
+    _get_current_context_ptr,
+    _get_current_device_id,
+)
 
 import enum
-from dataclasses import dataclass, field
+from dataclasses import dataclass
 
 
 # Driver texture-descriptor flag bits (CU_TRSF_*).
@@ -152,20 +156,28 @@ class ResourceDescriptor:
             raise TypeError(f"buffer must be a Buffer, got {type(buffer).__name__}")
         if not isinstance(format, ArrayFormat):
             raise TypeError(f"format must be an ArrayFormat, got {type(format).__name__}")
-        if num_channels not in (1, 2, 4):
-            raise ValueError(f"num_channels must be 1, 2, or 4, got {num_channels}")
+        if isinstance(num_channels, bool) or num_channels not in (1, 2, 4):
+            raise ValueError(f"num_channels must be 1, 2, or 4, got {num_channels!r}")
 
         buf_size = int(buffer.size)
+        elem = _FORMAT_ELEM_SIZE[int(format)] * int(num_channels)
         if size_bytes is None:
             size = buf_size
         else:
             size = int(size_bytes)
-            if size < 0:
-                raise ValueError(f"size_bytes must be >= 0, got {size}")
             if size > buf_size:
                 raise ValueError(
                     f"size_bytes ({size}) exceeds buffer.size ({buf_size})"
                 )
+        if size < elem:
+            raise ValueError(
+                f"size_bytes ({size}) must be at least one element ({elem} bytes)"
+            )
+        if size % elem != 0:
+            raise ValueError(
+                f"size_bytes ({size}) must be a multiple of element size "
+                f"({elem} bytes for {format.name} x {num_channels})"
+            )
 
         self = cls.__new__(cls)
         self._kind = "linear"
@@ -206,8 +218,8 @@ class ResourceDescriptor:
             raise TypeError(f"buffer must be a Buffer, got {type(buffer).__name__}")
         if not isinstance(format, ArrayFormat):
             raise TypeError(f"format must be an ArrayFormat, got {type(format).__name__}")
-        if num_channels not in (1, 2, 4):
-            raise ValueError(f"num_channels must be 1, 2, or 4, got {num_channels}")
+        if isinstance(num_channels, bool) or num_channels not in (1, 2, 4):
+            raise ValueError(f"num_channels must be 1, 2, or 4, got {num_channels!r}")
 
         w = int(width)
         h = int(height)
@@ -256,6 +268,26 @@ class ResourceDescriptor:
         """Channels per element (``None`` for array-backed)."""
         return self._num_channels
 
+    @property
+    def size_bytes(self):
+        """Bytes bound for a linear resource (``None`` for other kinds)."""
+        return self._size_bytes
+
+    @property
+    def width(self):
+        """Pitch2D image width, in elements (``None`` for other kinds)."""
+        return self._width
+
+    @property
+    def height(self):
+        """Pitch2D image height, in rows (``None`` for other kinds)."""
+        return self._height
+
+    @property
+    def pitch_bytes(self):
+        """Pitch2D row pitch, in bytes (``None`` for other kinds)."""
+        return self._pitch_bytes
+
     def __repr__(self):
         if self._kind == "linear":
             return (
@@ -320,23 +352,7 @@ class TextureDescriptor:
     border_color: tuple | None = None
 
 
-cdef inline intptr_t _get_current_context_ptr() except? 0:
-    cdef cydriver.CUcontext ctx
-    with nogil:
-        HANDLE_RETURN(cydriver.cuCtxGetCurrent(&ctx))
-    if ctx == NULL:
-        raise RuntimeError("TextureObject requires an active CUDA context")
-    return <intptr_t>ctx
-
-
-cdef inline int _get_current_device_id() except -1:
-    cdef cydriver.CUdevice dev
-    with nogil:
-        HANDLE_RETURN(cydriver.cuCtxGetDevice(&dev))
-    return <int>dev
-
-
-cdef _normalize_address_modes(address_mode):
+def _normalize_address_modes(address_mode):
     """Return a 3-tuple of AddressMode values from a scalar or 1-3 tuple."""
     if isinstance(address_mode, AddressMode):
         return (address_mode, address_mode, address_mode)
@@ -378,23 +394,23 @@ cdef class TextureObject:
         )
 
     @classmethod
-    def from_descriptor(cls, resource_desc, texture_desc):
+    def from_descriptor(cls, *, resource, texture_descriptor):
         """Create a texture object from a resource + sampling descriptor.
 
         Parameters
         ----------
-        resource_desc : ResourceDescriptor
-        texture_desc : TextureDescriptor
+        resource : ResourceDescriptor
+        texture_descriptor : TextureDescriptor
         """
-        if not isinstance(resource_desc, ResourceDescriptor):
+        if not isinstance(resource, ResourceDescriptor):
             raise TypeError(
-                f"resource_desc must be a ResourceDescriptor, got "
-                f"{type(resource_desc).__name__}"
+                f"resource must be a ResourceDescriptor, got "
+                f"{type(resource).__name__}"
             )
-        if not isinstance(texture_desc, TextureDescriptor):
+        if not isinstance(texture_descriptor, TextureDescriptor):
             raise TypeError(
-                f"texture_desc must be a TextureDescriptor, got "
-                f"{type(texture_desc).__name__}"
+                f"texture_descriptor must be a TextureDescriptor, got "
+                f"{type(texture_descriptor).__name__}"
             )
 
         cdef cydriver.CUDA_RESOURCE_DESC res_desc
@@ -407,82 +423,91 @@ cdef class TextureObject:
         cdef MipmappedArray mip
         cdef Buffer buf
         cdef intptr_t devptr
-        if resource_desc.kind == "array":
-            arr = <Array>resource_desc.source
+        if resource.kind == "array":
+            arr = <Array>resource.source
             res_desc.resType = cydriver.CU_RESOURCE_TYPE_ARRAY
             res_desc.res.array.hArray = arr._handle
-        elif resource_desc.kind == "mipmapped_array":
-            mip = <MipmappedArray>resource_desc.source
+        elif resource.kind == "mipmapped_array":
+            mip = <MipmappedArray>resource.source
             res_desc.resType = cydriver.CU_RESOURCE_TYPE_MIPMAPPED_ARRAY
             res_desc.res.mipmap.hMipmappedArray = mip._handle
-        elif resource_desc.kind == "linear":
-            buf = <Buffer>resource_desc.source
+        elif resource.kind == "linear":
+            buf = <Buffer>resource.source
             devptr = int(buf.handle)
             res_desc.resType = cydriver.CU_RESOURCE_TYPE_LINEAR
             res_desc.res.linear.devPtr = <cydriver.CUdeviceptr>devptr
-            res_desc.res.linear.format = <cydriver.CUarray_format><int>resource_desc._format
-            res_desc.res.linear.numChannels = <unsigned int>resource_desc._num_channels
-            res_desc.res.linear.sizeInBytes = <size_t>resource_desc._size_bytes
-        elif resource_desc.kind == "pitch2d":
-            buf = <Buffer>resource_desc.source
+            res_desc.res.linear.format = <cydriver.CUarray_format><int>resource._format
+            res_desc.res.linear.numChannels = <unsigned int>resource._num_channels
+            res_desc.res.linear.sizeInBytes = <size_t>resource._size_bytes
+        elif resource.kind == "pitch2d":
+            buf = <Buffer>resource.source
             devptr = int(buf.handle)
             res_desc.resType = cydriver.CU_RESOURCE_TYPE_PITCH2D
             res_desc.res.pitch2D.devPtr = <cydriver.CUdeviceptr>devptr
-            res_desc.res.pitch2D.format = <cydriver.CUarray_format><int>resource_desc._format
-            res_desc.res.pitch2D.numChannels = <unsigned int>resource_desc._num_channels
-            res_desc.res.pitch2D.width = <size_t>resource_desc._width
-            res_desc.res.pitch2D.height = <size_t>resource_desc._height
-            res_desc.res.pitch2D.pitchInBytes = <size_t>resource_desc._pitch_bytes
+            res_desc.res.pitch2D.format = <cydriver.CUarray_format><int>resource._format
+            res_desc.res.pitch2D.numChannels = <unsigned int>resource._num_channels
+            res_desc.res.pitch2D.width = <size_t>resource._width
+            res_desc.res.pitch2D.height = <size_t>resource._height
+            res_desc.res.pitch2D.pitchInBytes = <size_t>resource._pitch_bytes
         else:
             raise NotImplementedError(
-                f"ResourceDescriptor kind {resource_desc.kind!r} is not yet supported"
+                f"ResourceDescriptor kind {resource.kind!r} is not yet supported"
             )
 
         # --- Texture descriptor ---
-        modes = _normalize_address_modes(texture_desc.address_mode)
+        modes = _normalize_address_modes(texture_descriptor.address_mode)
         tex_desc.addressMode[0] = <cydriver.CUaddress_mode><int>modes[0]
         tex_desc.addressMode[1] = <cydriver.CUaddress_mode><int>modes[1]
         tex_desc.addressMode[2] = <cydriver.CUaddress_mode><int>modes[2]
 
-        if not isinstance(texture_desc.filter_mode, FilterMode):
-            raise TypeError("filter_mode must be a FilterMode")
-        tex_desc.filterMode = <cydriver.CUfilter_mode><int>texture_desc.filter_mode
+        if not isinstance(texture_descriptor.filter_mode, FilterMode):
+            raise TypeError(
+                f"filter_mode must be a FilterMode, got "
+                f"{type(texture_descriptor.filter_mode).__name__}"
+            )
+        tex_desc.filterMode = <cydriver.CUfilter_mode><int>texture_descriptor.filter_mode
 
-        if not isinstance(texture_desc.read_mode, ReadMode):
-            raise TypeError("read_mode must be a ReadMode")
+        if not isinstance(texture_descriptor.read_mode, ReadMode):
+            raise TypeError(
+                f"read_mode must be a ReadMode, got "
+                f"{type(texture_descriptor.read_mode).__name__}"
+            )
 
         cdef unsigned int flags = 0
         # CU_TRSF_READ_AS_INTEGER suppresses normalization, so it maps to
         # ReadMode.ELEMENT_TYPE.
-        if texture_desc.read_mode == ReadMode.ELEMENT_TYPE:
+        if texture_descriptor.read_mode == ReadMode.ELEMENT_TYPE:
             flags |= _TRSF_READ_AS_INTEGER
-        if texture_desc.normalized_coords:
+        if texture_descriptor.normalized_coords:
             flags |= _TRSF_NORMALIZED_COORDINATES
-        if texture_desc.srgb:
+        if texture_descriptor.srgb:
             flags |= _TRSF_SRGB
-        if texture_desc.disable_trilinear_optimization:
+        if texture_descriptor.disable_trilinear_optimization:
             flags |= _TRSF_DISABLE_TRILINEAR_OPTIMIZATION
-        if texture_desc.seamless_cubemap:
+        if texture_descriptor.seamless_cubemap:
             flags |= _TRSF_SEAMLESS_CUBEMAP
         tex_desc.flags = flags
 
-        if texture_desc.max_anisotropy < 0:
+        if texture_descriptor.max_anisotropy < 0:
             raise ValueError("max_anisotropy must be >= 0")
-        tex_desc.maxAnisotropy = <unsigned int>texture_desc.max_anisotropy
+        tex_desc.maxAnisotropy = <unsigned int>texture_descriptor.max_anisotropy
 
-        if not isinstance(texture_desc.mipmap_filter_mode, FilterMode):
-            raise TypeError("mipmap_filter_mode must be a FilterMode")
-        tex_desc.mipmapFilterMode = <cydriver.CUfilter_mode><int>texture_desc.mipmap_filter_mode
-        tex_desc.mipmapLevelBias = <float>texture_desc.mipmap_level_bias
-        tex_desc.minMipmapLevelClamp = <float>texture_desc.min_mipmap_level_clamp
-        tex_desc.maxMipmapLevelClamp = <float>texture_desc.max_mipmap_level_clamp
+        if not isinstance(texture_descriptor.mipmap_filter_mode, FilterMode):
+            raise TypeError(
+                f"mipmap_filter_mode must be a FilterMode, got "
+                f"{type(texture_descriptor.mipmap_filter_mode).__name__}"
+            )
+        tex_desc.mipmapFilterMode = <cydriver.CUfilter_mode><int>texture_descriptor.mipmap_filter_mode
+        tex_desc.mipmapLevelBias = <float>texture_descriptor.mipmap_level_bias
+        tex_desc.minMipmapLevelClamp = <float>texture_descriptor.min_mipmap_level_clamp
+        tex_desc.maxMipmapLevelClamp = <float>texture_descriptor.max_mipmap_level_clamp
 
         cdef int i
-        if texture_desc.border_color is None:
+        if texture_descriptor.border_color is None:
             for i in range(4):
                 tex_desc.borderColor[i] = 0.0
         else:
-            bc = tuple(texture_desc.border_color)
+            bc = tuple(texture_descriptor.border_color)
             if len(bc) != 4:
                 raise ValueError(
                     f"border_color must have 4 elements, got {len(bc)}"
@@ -491,8 +516,8 @@ cdef class TextureObject:
                 tex_desc.borderColor[i] = <float>bc[i]
 
         cdef TextureObject self = cls.__new__(cls)
-        self._source_ref = resource_desc
-        self._texture_desc = texture_desc
+        self._source_ref = resource
+        self._texture_desc = texture_descriptor
         self._context = _get_current_context_ptr()
         self._device_id = _get_current_device_id()
 
@@ -524,10 +549,11 @@ cdef class TextureObject:
 
     cpdef close(self):
         """Destroy the underlying ``CUtexObject``."""
-        if self._handle != 0:
-            HANDLE_RETURN(cydriver.cuTexObjectDestroy(self._handle))
+        cdef cydriver.CUtexObject h = self._handle
         self._handle = 0
         self._source_ref = None
+        if h != 0:
+            HANDLE_RETURN(cydriver.cuTexObjectDestroy(h))
 
     def __dealloc__(self):
         # Cython destructors cannot raise; any cuTexObjectDestroy error is
diff --git a/cuda_core/cuda/core/_utils/cuda_utils.pxd b/cuda_core/cuda/core/_utils/cuda_utils.pxd
index 4562cd71355..a8115aaf3f9 100644
--- a/cuda_core/cuda/core/_utils/cuda_utils.pxd
+++ b/cuda_core/cuda/core/_utils/cuda_utils.pxd
@@ -4,7 +4,7 @@
 
 cimport cpython
 from cpython.object cimport PyObject
-from libc.stdint cimport int64_t, int32_t, uint8_t, uint16_t, uint32_t
+from libc.stdint cimport int64_t, int32_t, intptr_t, uint8_t, uint16_t, uint32_t
 
 from cuda.bindings cimport cydriver, cynvrtc, cynvvm, cynvjitlink
 
@@ -25,6 +25,12 @@ cdef int HANDLE_RETURN_NVJITLINK(
     cynvjitlink.nvJitLinkHandle handle, cynvjitlink.nvJitLinkResult err) except?-1 nogil
 
 
+# Helpers for retrieving the current CUDA context and device. Raise if no
+# active context is bound to the calling thread.
+cdef intptr_t _get_current_context_ptr() except? 0
+cdef int _get_current_device_id() except? -1
+
+
 # TODO: stop exposing these within the codebase?
 cpdef int _check_driver_error(cydriver.CUresult error) except?-1 nogil
 cpdef int _check_runtime_error(error) except?-1
diff --git a/cuda_core/cuda/core/_utils/cuda_utils.pyx b/cuda_core/cuda/core/_utils/cuda_utils.pyx
index 1bcfa524884..9ffaf3531ff 100644
--- a/cuda_core/cuda/core/_utils/cuda_utils.pyx
+++ b/cuda_core/cuda/core/_utils/cuda_utils.pyx
@@ -66,6 +66,27 @@ cdef int HANDLE_RETURN(cydriver.CUresult err) except?-1 nogil:
     return 0
 
 
+cdef intptr_t _get_current_context_ptr() except? 0:
+    """Return the current thread's bound CUcontext as an intptr_t.
+
+    Raises ``RuntimeError`` if no context is current.
+    """
+    cdef cydriver.CUcontext ctx
+    with nogil:
+        HANDLE_RETURN(cydriver.cuCtxGetCurrent(&ctx))
+    if ctx == NULL:
+        raise RuntimeError("an active CUDA context is required")
+    return <intptr_t>ctx
+
+
+cdef int _get_current_device_id() except? -1:
+    """Return the device ordinal of the calling thread's current CUDA context."""
+    cdef cydriver.CUdevice dev
+    with nogil:
+        HANDLE_RETURN(cydriver.cuCtxGetDevice(&dev))
+    return <int>dev
+
+
 cdef int HANDLE_RETURN_NVRTC(cynvrtc.nvrtcProgram prog, cynvrtc.nvrtcResult err) except?-1 nogil:
     """Handle NVRTC result codes, raising NVRTCError with program log on failure."""
     if err == cynvrtc.nvrtcResult.NVRTC_SUCCESS:
diff --git a/cuda_core/docs/source/api.rst b/cuda_core/docs/source/api.rst
index d3c9b761510..7c1d33e3393 100644
--- a/cuda_core/docs/source/api.rst
+++ b/cuda_core/docs/source/api.rst
@@ -165,8 +165,9 @@ Textures and surfaces
 CUDA arrays back bindless texture and surface objects for kernel-side sampled
 reads and typed load/store. :class:`Array` is allocated through
 :meth:`Array.from_descriptor` and bound through a :class:`ResourceDescriptor`
-factory; linear (1D) and row-pitched 2D :class:`Buffer` views are also
-supported as texture backings.
+factory; linear (1D) and row-pitched 2D :class:`Buffer` views, as well as
+mipmapped allocations (:class:`MipmappedArray`), are also supported as texture
+backings.
 
 .. autosummary::
    :toctree: generated/
@@ -174,6 +175,7 @@ supported as texture backings.
    :template: autosummary/cyclass.rst
 
    Array
+   MipmappedArray
    ResourceDescriptor
    TextureObject
    SurfaceObject
diff --git a/cuda_core/examples/texture_sample.py b/cuda_core/examples/texture_sample.py
index 68e4a964f1c..fc5b05f086f 100644
--- a/cuda_core/examples/texture_sample.py
+++ b/cuda_core/examples/texture_sample.py
@@ -61,8 +61,6 @@ def main():
     dev.set_current()
     stream = dev.create_stream()
 
-    arr = None
-    tex = None
     coords_buf = None
     out_buf = None
     pinned_mr = LegacyPinnedMemoryResource()
@@ -72,33 +70,46 @@ def main():
         # buffer fed into copy_from must be laid out as H rows of W elements
         # (row-major), i.e. host_pattern.shape == (H, W).
         width, height = 16, 16
-        arr = Array.from_descriptor(
+        with Array.from_descriptor(
             shape=(width, height),
             format=ArrayFormat.FLOAT32,
             num_channels=1,
-        )
+        ) as arr:
+            # Plant a known pattern: pattern[y, x] = x + 100*y.
+            # Cast to float32 so the byte count matches the array's storage.
+            ys, xs = np.meshgrid(
+                np.arange(height, dtype=np.float32),
+                np.arange(width, dtype=np.float32),
+                indexing="ij",
+            )
+            pattern = (xs + 100.0 * ys).astype(np.float32)
+            assert pattern.shape == (height, width)
+            arr.copy_from(pattern, stream=stream)
+
+            # Build a linear-filtering, clamped, non-normalized texture.
+            res_desc = ResourceDescriptor.from_array(arr)
+            tex_desc = TextureDescriptor(
+                address_mode=AddressMode.CLAMP,
+                filter_mode=FilterMode.LINEAR,
+                read_mode=ReadMode.ELEMENT_TYPE,
+                normalized_coords=False,
+            )
+            with TextureObject.from_descriptor(
+                resource=res_desc, texture_descriptor=tex_desc
+            ) as tex:
+                _run_kernel_and_verify(
+                    dev, stream, tex, pattern, width, height, pinned_mr
+                )
+    finally:
+        stream.close()
 
-        # Plant a known pattern: pattern[y, x] = x + 100*y.
-        # Cast to float32 so the byte count matches the array's storage.
-        ys, xs = np.meshgrid(
-            np.arange(height, dtype=np.float32),
-            np.arange(width, dtype=np.float32),
-            indexing="ij",
-        )
-        pattern = (xs + 100.0 * ys).astype(np.float32)
-        assert pattern.shape == (height, width)
-        arr.copy_from(pattern, stream=stream)
-
-        # Build a linear-filtering, clamped, non-normalized texture.
-        res_desc = ResourceDescriptor.from_array(arr)
-        tex_desc = TextureDescriptor(
-            address_mode=AddressMode.CLAMP,
-            filter_mode=FilterMode.LINEAR,
-            read_mode=ReadMode.ELEMENT_TYPE,
-            normalized_coords=False,
-        )
-        tex = TextureObject.from_descriptor(res_desc, tex_desc)
 
+def _run_kernel_and_verify(dev, stream, tex, pattern, width, height, pinned_mr):
+    """Kernel launch + correctness check, isolated so the with-blocks in main()
+    stay readable. Owns its own pinned-buffer cleanup."""
+    coords_buf = None
+    out_buf = None
+    try:
         # Build the test coordinate list:
         # - Texel-center samples should return the exact planted value.
         # - Half-integer samples land between texels and exercise LINEAR
@@ -199,15 +210,10 @@ def main():
         print(f"  texel-center samples verified: {n_center}")
         print(f"  half-integer samples verified: {len(half_samples)}")
     finally:
-        if tex is not None:
-            tex.close()
-        if arr is not None:
-            arr.close()
         if coords_buf is not None:
             coords_buf.close()
         if out_buf is not None:
             out_buf.close()
-        stream.close()
 
 
 if __name__ == "__main__":
diff --git a/cuda_core/tests/test_texture_surface.py b/cuda_core/tests/test_texture_surface.py
index e9a3d3d6bb6..00e67ed2398 100644
--- a/cuda_core/tests/test_texture_surface.py
+++ b/cuda_core/tests/test_texture_surface.py
@@ -99,12 +99,53 @@ def test_array_roundtrip_copy(init_cuda):
         arr.copy_from(src, stream=stream)
         arr.copy_to(dst, stream=stream)
         stream.sync()
+        # Round-trip recovers data; src must not be mutated by copy_from.
         assert list(dst) == list(range(16))
+        assert list(src) == list(range(16))
     finally:
         arr.close()
         stream.close()
 
 
+def test_array_copy_rejects_undersized_host_buffer(init_cuda):
+    import array as _array
+
+    device = Device()
+    stream = device.create_stream()
+    arr = Array.from_descriptor(
+        shape=(16,), format=ArrayFormat.UINT32, num_channels=1
+    )
+    try:
+        # arr is 16 * 4 = 64 bytes; pass an 8-element (32-byte) host buffer.
+        too_small = _array.array("I", [0] * 8)
+        with pytest.raises(ValueError, match="smaller than the array extent"):
+            arr.copy_from(too_small, stream=stream)
+        with pytest.raises(ValueError, match="smaller than the array extent"):
+            arr.copy_to(too_small, stream=stream)
+    finally:
+        arr.close()
+        stream.close()
+
+
+def test_array_copy_rejects_undersized_device_buffer(init_cuda):
+    device = Device()
+    stream = device.create_stream()
+    arr = Array.from_descriptor(
+        shape=(16,), format=ArrayFormat.UINT32, num_channels=1
+    )
+    # arr is 64 bytes; allocate a 32-byte device buffer.
+    small_buf = device.memory_resource.allocate(32, stream=device.default_stream)
+    try:
+        with pytest.raises(ValueError, match="smaller than the array extent"):
+            arr.copy_from(small_buf, stream=stream)
+        with pytest.raises(ValueError, match="smaller than the array extent"):
+            arr.copy_to(small_buf, stream=stream)
+    finally:
+        small_buf.close()
+        arr.close()
+        stream.close()
+
+
 def test_texture_object_create(init_cuda):
     arr = Array.from_descriptor(
         shape=(32, 16), format=ArrayFormat.FLOAT32, num_channels=1
@@ -117,7 +158,7 @@ def test_texture_object_create(init_cuda):
             read_mode=ReadMode.ELEMENT_TYPE,
             normalized_coords=True,
         )
-        tex = TextureObject.from_descriptor(res, tex_desc)
+        tex = TextureObject.from_descriptor(resource=res, texture_descriptor=tex_desc)
         try:
             assert tex.handle != 0
             assert tex.resource is res
@@ -158,17 +199,34 @@ def test_surface_requires_ldst_flag(init_cuda):
 
 
 def test_address_mode_normalization(init_cuda):
+    # Direct unit test of the private normalizer: a scalar should expand to a
+    # 3-tuple; a shorter tuple should be padded by repeating the last entry.
+    from cuda.core._texture import _normalize_address_modes
+
+    assert _normalize_address_modes(AddressMode.WRAP) == (
+        AddressMode.WRAP, AddressMode.WRAP, AddressMode.WRAP,
+    )
+    assert _normalize_address_modes((AddressMode.WRAP, AddressMode.CLAMP)) == (
+        AddressMode.WRAP, AddressMode.CLAMP, AddressMode.CLAMP,
+    )
+    assert _normalize_address_modes(
+        (AddressMode.WRAP, AddressMode.CLAMP, AddressMode.MIRROR)
+    ) == (AddressMode.WRAP, AddressMode.CLAMP, AddressMode.MIRROR)
+
+    # Smoke test: a 2-entry tuple is also accepted end-to-end.
     arr = Array.from_descriptor(
         shape=(8, 8, 4), format=ArrayFormat.FLOAT32, num_channels=1
     )
     try:
         res = ResourceDescriptor.from_array(arr)
-        # Per-axis tuple shorter than 3 should be accepted and padded.
         tex_desc = TextureDescriptor(
             address_mode=(AddressMode.WRAP, AddressMode.CLAMP)
         )
-        tex = TextureObject.from_descriptor(res, tex_desc)
-        tex.close()
+        tex = TextureObject.from_descriptor(resource=res, texture_descriptor=tex_desc)
+        try:
+            assert tex.handle != 0
+        finally:
+            tex.close()
     finally:
         arr.close()
 
@@ -240,6 +298,31 @@ def test_resource_descriptor_from_linear_rejects_non_buffer():
         )
 
 
+def test_resource_descriptor_from_linear_rejects_zero_size(init_cuda):
+    device = Device()
+    buf = _alloc_device_buffer(device, 1024)
+    try:
+        with pytest.raises(ValueError, match="at least one element"):
+            ResourceDescriptor.from_linear(
+                buf, format=ArrayFormat.UINT32, num_channels=1, size_bytes=0
+            )
+    finally:
+        buf.close()
+
+
+def test_resource_descriptor_from_linear_rejects_non_multiple(init_cuda):
+    device = Device()
+    buf = _alloc_device_buffer(device, 1024)
+    try:
+        # UINT32 x 1 channel = 4 bytes/element; 10 bytes is not a multiple.
+        with pytest.raises(ValueError, match="multiple of element size"):
+            ResourceDescriptor.from_linear(
+                buf, format=ArrayFormat.UINT32, num_channels=1, size_bytes=10
+            )
+    finally:
+        buf.close()
+
+
 def test_texture_object_from_linear(init_cuda):
     """A linear-backed texture should bind even though sampling fields are
     effectively ignored by the driver."""
@@ -250,7 +333,7 @@ def test_texture_object_from_linear(init_cuda):
         res = ResourceDescriptor.from_linear(
             buf, format=ArrayFormat.FLOAT32, num_channels=1
         )
-        tex = TextureObject.from_descriptor(res, TextureDescriptor())
+        tex = TextureObject.from_descriptor(resource=res, texture_descriptor=TextureDescriptor())
         try:
             assert tex.handle != 0
             assert tex.resource is res
@@ -320,7 +403,7 @@ def test_texture_object_from_pitch2d(init_cuda):
         )
         assert res.kind == "pitch2d"
         assert "pitch2d" in repr(res)
-        tex = TextureObject.from_descriptor(res, TextureDescriptor())
+        tex = TextureObject.from_descriptor(resource=res, texture_descriptor=TextureDescriptor())
         try:
             assert tex.handle != 0
         finally:
@@ -337,7 +420,7 @@ def test_surface_rejects_linear_and_pitch2d(init_cuda):
             buf, format=ArrayFormat.UINT32, num_channels=1
         )
         with pytest.raises(ValueError, match="array-backed"):
-            SurfaceObject.from_descriptor(res_lin)
+            SurfaceObject.from_descriptor(resource=res_lin)
 
         res_p2 = ResourceDescriptor.from_pitch2d(
             buf,
@@ -348,7 +431,7 @@ def test_surface_rejects_linear_and_pitch2d(init_cuda):
             pitch_bytes=64,
         )
         with pytest.raises(ValueError, match="array-backed"):
-            SurfaceObject.from_descriptor(res_p2)
+            SurfaceObject.from_descriptor(resource=res_p2)
     finally:
         buf.close()
 
@@ -493,7 +576,7 @@ def test_texture_object_from_mipmapped_array(init_cuda):
             min_mipmap_level_clamp=0.0,
             max_mipmap_level_clamp=float(mip.num_levels - 1),
         )
-        tex = TextureObject.from_descriptor(res, tex_desc)
+        tex = TextureObject.from_descriptor(resource=res, texture_descriptor=tex_desc)
         try:
             assert tex.handle != 0
             assert tex.resource is res
@@ -514,7 +597,7 @@ def test_surface_rejects_mipmapped_array(init_cuda):
     try:
         res = ResourceDescriptor.from_mipmapped_array(mip)
         with pytest.raises(ValueError, match="array-backed"):
-            SurfaceObject.from_descriptor(res)
+            SurfaceObject.from_descriptor(resource=res)
     finally:
         mip.close()
 
@@ -553,3 +636,333 @@ def test_mipmapped_array_level_keeps_parent_alive(init_cuda):
     # Closing the level drops its parent ref. Don't access the parent past
     # this point; cuMipmappedArrayDestroy may then run.
     lvl.close()
+
+
+# --- Negative-path validation tests ------------------------------------------
+
+def test_array_from_descriptor_rejects_bad_format(init_cuda):
+    with pytest.raises(TypeError, match="format must be an ArrayFormat"):
+        Array.from_descriptor(shape=(8,), format=0, num_channels=1)
+
+
+def test_array_from_descriptor_rejects_non_iterable_shape(init_cuda):
+    with pytest.raises(TypeError, match="shape must be a tuple"):
+        Array.from_descriptor(shape=8, format=ArrayFormat.UINT8, num_channels=1)
+
+
+def test_array_from_descriptor_rejects_zero_dim(init_cuda):
+    with pytest.raises(ValueError, match=r"shape\[1\] must be >= 1"):
+        Array.from_descriptor(
+            shape=(8, 0), format=ArrayFormat.UINT8, num_channels=1
+        )
+
+
+def test_array_copy_rejects_non_stream(init_cuda):
+    arr = Array.from_descriptor(
+        shape=(8,), format=ArrayFormat.UINT8, num_channels=1
+    )
+    try:
+        import array as _array
+        buf = _array.array("B", [0] * 8)
+        with pytest.raises(TypeError, match="stream must be a Stream"):
+            arr.copy_from(buf, stream="not-a-stream")
+        with pytest.raises(TypeError, match="stream must be a Stream"):
+            arr.copy_to(buf, stream="not-a-stream")
+    finally:
+        arr.close()
+
+
+def test_resource_descriptor_from_pitch2d_rejects_non_buffer():
+    with pytest.raises(TypeError, match="buffer must be a Buffer"):
+        ResourceDescriptor.from_pitch2d(
+            object(),
+            format=ArrayFormat.UINT8,
+            num_channels=1,
+            width=8,
+            height=8,
+            pitch_bytes=64,
+        )
+
+
+def test_resource_descriptor_from_pitch2d_rejects_bad_format(init_cuda):
+    device = Device()
+    buf = _alloc_device_buffer(device, 4096)
+    try:
+        with pytest.raises(TypeError, match="format must be an ArrayFormat"):
+            ResourceDescriptor.from_pitch2d(
+                buf,
+                format=0,
+                num_channels=1,
+                width=8,
+                height=8,
+                pitch_bytes=64,
+            )
+    finally:
+        buf.close()
+
+
+def test_resource_descriptor_from_pitch2d_rejects_bad_channels(init_cuda):
+    device = Device()
+    buf = _alloc_device_buffer(device, 4096)
+    try:
+        with pytest.raises(ValueError, match="num_channels"):
+            ResourceDescriptor.from_pitch2d(
+                buf,
+                format=ArrayFormat.UINT8,
+                num_channels=3,
+                width=8,
+                height=8,
+                pitch_bytes=64,
+            )
+    finally:
+        buf.close()
+
+
+def test_resource_descriptor_from_pitch2d_rejects_zero_dims(init_cuda):
+    device = Device()
+    buf = _alloc_device_buffer(device, 4096)
+    try:
+        with pytest.raises(ValueError, match="width"):
+            ResourceDescriptor.from_pitch2d(
+                buf,
+                format=ArrayFormat.UINT8,
+                num_channels=1,
+                width=0,
+                height=8,
+                pitch_bytes=64,
+            )
+        with pytest.raises(ValueError, match="height"):
+            ResourceDescriptor.from_pitch2d(
+                buf,
+                format=ArrayFormat.UINT8,
+                num_channels=1,
+                width=8,
+                height=0,
+                pitch_bytes=64,
+            )
+    finally:
+        buf.close()
+
+
+def test_mipmapped_array_rejects_bad_format(init_cuda):
+    with pytest.raises(TypeError, match="format must be an ArrayFormat"):
+        MipmappedArray.from_descriptor(
+            shape=(8, 8), format=0, num_channels=1, num_levels=2
+        )
+
+
+def test_mipmapped_array_rejects_bad_channels(init_cuda):
+    with pytest.raises(ValueError, match="num_channels"):
+        MipmappedArray.from_descriptor(
+            shape=(8, 8), format=ArrayFormat.UINT8, num_channels=3, num_levels=2
+        )
+
+
+def test_mipmapped_array_rejects_zero_dim(init_cuda):
+    with pytest.raises(ValueError, match=r"shape\[0\] must be >= 1"):
+        MipmappedArray.from_descriptor(
+            shape=(0, 8), format=ArrayFormat.UINT8, num_channels=1, num_levels=1
+        )
+
+
+def test_texture_object_rejects_non_resource_descriptor(init_cuda):
+    with pytest.raises(TypeError, match="resource must be a ResourceDescriptor"):
+        TextureObject.from_descriptor(
+            resource=object(), texture_descriptor=TextureDescriptor()
+        )
+
+
+def test_texture_object_rejects_non_texture_descriptor(init_cuda):
+    arr = Array.from_descriptor(
+        shape=(8, 8), format=ArrayFormat.FLOAT32, num_channels=1
+    )
+    try:
+        res = ResourceDescriptor.from_array(arr)
+        with pytest.raises(
+            TypeError, match="texture_descriptor must be a TextureDescriptor"
+        ):
+            TextureObject.from_descriptor(resource=res, texture_descriptor="nope")
+    finally:
+        arr.close()
+
+
+def test_texture_object_rejects_bad_filter_mode(init_cuda):
+    arr = Array.from_descriptor(
+        shape=(8, 8), format=ArrayFormat.FLOAT32, num_channels=1
+    )
+    try:
+        res = ResourceDescriptor.from_array(arr)
+        td = TextureDescriptor(filter_mode=0)  # int, not FilterMode
+        with pytest.raises(TypeError, match="filter_mode must be a FilterMode"):
+            TextureObject.from_descriptor(resource=res, texture_descriptor=td)
+    finally:
+        arr.close()
+
+
+def test_texture_object_rejects_bad_read_mode(init_cuda):
+    arr = Array.from_descriptor(
+        shape=(8, 8), format=ArrayFormat.FLOAT32, num_channels=1
+    )
+    try:
+        res = ResourceDescriptor.from_array(arr)
+        td = TextureDescriptor(read_mode=0)  # int, not ReadMode
+        with pytest.raises(TypeError, match="read_mode must be a ReadMode"):
+            TextureObject.from_descriptor(resource=res, texture_descriptor=td)
+    finally:
+        arr.close()
+
+
+def test_texture_object_rejects_bad_mipmap_filter_mode(init_cuda):
+    arr = Array.from_descriptor(
+        shape=(8, 8), format=ArrayFormat.FLOAT32, num_channels=1
+    )
+    try:
+        res = ResourceDescriptor.from_array(arr)
+        td = TextureDescriptor(mipmap_filter_mode=0)  # int, not FilterMode
+        with pytest.raises(
+            TypeError, match="mipmap_filter_mode must be a FilterMode"
+        ):
+            TextureObject.from_descriptor(resource=res, texture_descriptor=td)
+    finally:
+        arr.close()
+
+
+def test_texture_object_rejects_negative_anisotropy(init_cuda):
+    arr = Array.from_descriptor(
+        shape=(8, 8), format=ArrayFormat.FLOAT32, num_channels=1
+    )
+    try:
+        res = ResourceDescriptor.from_array(arr)
+        td = TextureDescriptor(max_anisotropy=-1)
+        with pytest.raises(ValueError, match="max_anisotropy"):
+            TextureObject.from_descriptor(resource=res, texture_descriptor=td)
+    finally:
+        arr.close()
+
+
+def test_texture_object_rejects_bad_border_color_length(init_cuda):
+    arr = Array.from_descriptor(
+        shape=(8, 8), format=ArrayFormat.FLOAT32, num_channels=1
+    )
+    try:
+        res = ResourceDescriptor.from_array(arr)
+        td = TextureDescriptor(border_color=(0.0, 0.0))  # length 2, not 4
+        with pytest.raises(ValueError, match="border_color must have 4"):
+            TextureObject.from_descriptor(resource=res, texture_descriptor=td)
+    finally:
+        arr.close()
+
+
+def test_address_mode_rejects_non_addressmode_scalar(init_cuda):
+    arr = Array.from_descriptor(
+        shape=(8, 8), format=ArrayFormat.FLOAT32, num_channels=1
+    )
+    try:
+        res = ResourceDescriptor.from_array(arr)
+        td = TextureDescriptor(address_mode=42)  # int, not AddressMode / iterable
+        with pytest.raises(TypeError, match="address_mode"):
+            TextureObject.from_descriptor(resource=res, texture_descriptor=td)
+    finally:
+        arr.close()
+
+
+def test_address_mode_rejects_empty_tuple(init_cuda):
+    arr = Array.from_descriptor(
+        shape=(8, 8), format=ArrayFormat.FLOAT32, num_channels=1
+    )
+    try:
+        res = ResourceDescriptor.from_array(arr)
+        td = TextureDescriptor(address_mode=())
+        with pytest.raises(ValueError, match="address_mode tuple must have 1-3"):
+            TextureObject.from_descriptor(resource=res, texture_descriptor=td)
+    finally:
+        arr.close()
+
+
+def test_address_mode_rejects_too_long_tuple(init_cuda):
+    arr = Array.from_descriptor(
+        shape=(8, 8), format=ArrayFormat.FLOAT32, num_channels=1
+    )
+    try:
+        res = ResourceDescriptor.from_array(arr)
+        td = TextureDescriptor(  # four entries: one more than the 1-3 range allows
+            address_mode=(
+                AddressMode.WRAP, AddressMode.WRAP, AddressMode.WRAP, AddressMode.WRAP
+            )
+        )
+        with pytest.raises(ValueError, match="address_mode tuple must have 1-3"):
+            TextureObject.from_descriptor(resource=res, texture_descriptor=td)
+    finally:  # always release the Array, even when an assertion fails
+        arr.close()
+
+
+def test_address_mode_rejects_non_addressmode_entry(init_cuda):
+    arr = Array.from_descriptor(
+        shape=(8, 8), format=ArrayFormat.FLOAT32, num_channels=1
+    )
+    try:
+        res = ResourceDescriptor.from_array(arr)
+        td = TextureDescriptor(address_mode=(AddressMode.WRAP, "bad", AddressMode.CLAMP))
+        with pytest.raises(TypeError, match=r"address_mode\[1\]"):  # message names the bad index
+            TextureObject.from_descriptor(resource=res, texture_descriptor=td)
+    finally:  # always release the Array, even when an assertion fails
+        arr.close()
+
+
+def test_texture_object_keeps_backing_array_alive(init_cuda):
+    """Dropping the local references to the backing Array and the
+    ResourceDescriptor must NOT invalidate an existing TextureObject. The
+    TextureObject holds a strong ref through its _source_ref slot."""
+    arr = Array.from_descriptor(
+        shape=(8, 8), format=ArrayFormat.FLOAT32, num_channels=1
+    )
+    res = ResourceDescriptor.from_array(arr)
+    tex = TextureObject.from_descriptor(
+        resource=res, texture_descriptor=TextureDescriptor()
+    )
+    # Verify the keepalive chain via gc referents: TextureObject -> _source_ref
+    # -> ResourceDescriptor -> _source -> Array. gc.get_referents only reports
+    # direct referents, so the chain is walked one hop at a time below.
+    arr_id = id(arr)  # ids are compared after `del`; valid only if the objects survive
+    res_id = id(res)
+    del arr, res
+    gc.collect()
+
+    referents = gc.get_referents(tex)
+    res_refs = [r for r in referents if id(r) == res_id]
+    assert len(res_refs) == 1, (
+        f"TextureObject should still reference the ResourceDescriptor; "
+        f"got referents {referents!r}"
+    )
+    res_back = res_refs[0]
+    arr_refs = [r for r in gc.get_referents(res_back) if id(r) == arr_id]
+    assert len(arr_refs) == 1, "ResourceDescriptor should still reference its Array"
+
+    # The handle must still be non-zero after the del / gc.collect() above.
+    assert tex.handle != 0
+    tex.close()
+
+
+def test_surface_object_keeps_backing_array_alive(init_cuda):
+    arr = Array.from_descriptor(
+        shape=(8, 8),
+        format=ArrayFormat.UINT8,
+        num_channels=4,
+        surface_load_store=True,
+    )
+    surf = SurfaceObject.from_array(arr)
+    arr_id = id(arr)  # compared after `del`; valid only if the Array survives
+    del arr
+    gc.collect()
+
+    # The surface keeps a ResourceDescriptor alive, which keeps the Array alive.
+    # This test never holds that descriptor itself, so find it by type, not id.
+    referents = gc.get_referents(surf)
+    res_objs = [r for r in referents if isinstance(r, ResourceDescriptor)]
+    assert len(res_objs) == 1
+    arr_refs = [r for r in gc.get_referents(res_objs[0]) if id(r) == arr_id]
+    assert len(arr_refs) == 1, (
+        "SurfaceObject should still reference its backing Array via the ResourceDescriptor"
+    )
+    assert surf.handle != 0
+    surf.close()

From cd68c994b51212d6190f02a7d6dc5bbca0611456 Mon Sep 17 00:00:00 2001
From: Rob Parolin <rparolin@nvidia.com>
Date: Fri, 15 May 2026 16:06:12 -0700
Subject: [PATCH 10/17] Add 9 cuda.core texture/surface examples (refs #467)

These graphical examples demonstrate the new Array, TextureObject,
SurfaceObject, MipmappedArray, and ResourceDescriptor APIs in increasing
order of complexity. All use the existing GraphicsResource + GL PBO
pattern for display (matching gl_interop_plasma.py); CI is gated on
has_display so headless runners skip them.

Minimum-API examples:
- gl_interop_image_show.py    Hello-world for the stack: 64x64 Array,
                              TextureObject, key F toggles POINT/LINEAR.
                              Read this file first.
- gl_interop_texture_filter.py POINT vs LINEAR side-by-side on one Array
                              with two TextureObjects; mouse pan/zoom,
                              key M cycles AddressMode.

Simulation examples (Array + SurfaceObject + TextureObject ping-pong):
- gl_interop_reaction_diffusion.py Gray-Scott with FLOAT32 x 2 channels;
                              LINEAR + WRAP for toroidal diffusion.
- gl_interop_lenia.py         Continuous-state CA with bell-curve
                              convolution; FLOAT32 x 1 channel.
- gl_interop_fire.py          Canonical Doom fire (37-color indexed
                              palette, UINT8 intensity 0..36, gather
                              equivalent of the original scatter
                              algorithm); exercises ArrayFormat.UINT8.
- gl_interop_ocean.py         Animated Gerstner-wave ocean with normal
                              mapping via finite-difference texture
                              reads and Phong + Fresnel shading.

Visualization examples:
- gl_interop_mandelbrot.py    Real-time deep-zoom using a 1D Array as
                              a color LUT (TextureObject for palette
                              lookup, not simulation).
- gl_interop_mipmap_lod.py    Procedural mipmap pyramid built with a
                              SurfaceObject per level; trilinear
                              sampling via tex2DLod and TextureDescriptor
                              mipmap fields.
- gl_interop_sdf_volume.py    3D ray-marched gyroid via a 128^3 Array,
                              surf3Dwrite for bake, tex3D for trilinear
                              SDF sampling. Only example exercising the
                              3D side of the API.

Every public symbol added in this PR is exercised by at least one of
these examples.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 cuda_core/examples/gl_interop_fire.py         | 774 ++++++++++++++++
 cuda_core/examples/gl_interop_image_show.py   | 428 +++++++++
 cuda_core/examples/gl_interop_lenia.py        | 805 +++++++++++++++++
 cuda_core/examples/gl_interop_mandelbrot.py   | 692 +++++++++++++++
 cuda_core/examples/gl_interop_mipmap_lod.py   | 717 +++++++++++++++
 cuda_core/examples/gl_interop_ocean.py        | 836 ++++++++++++++++++
 .../examples/gl_interop_reaction_diffusion.py | 727 +++++++++++++++
 cuda_core/examples/gl_interop_sdf_volume.py   | 827 +++++++++++++++++
 .../examples/gl_interop_texture_filter.py     | 607 +++++++++++++
 .../example_tests/test_basic_examples.py      |   9 +
 10 files changed, 6422 insertions(+)
 create mode 100644 cuda_core/examples/gl_interop_fire.py
 create mode 100644 cuda_core/examples/gl_interop_image_show.py
 create mode 100644 cuda_core/examples/gl_interop_lenia.py
 create mode 100644 cuda_core/examples/gl_interop_mandelbrot.py
 create mode 100644 cuda_core/examples/gl_interop_mipmap_lod.py
 create mode 100644 cuda_core/examples/gl_interop_ocean.py
 create mode 100644 cuda_core/examples/gl_interop_reaction_diffusion.py
 create mode 100644 cuda_core/examples/gl_interop_sdf_volume.py
 create mode 100644 cuda_core/examples/gl_interop_texture_filter.py

diff --git a/cuda_core/examples/gl_interop_fire.py b/cuda_core/examples/gl_interop_fire.py
new file mode 100644
index 00000000000..c8f2c9165b6
--- /dev/null
+++ b/cuda_core/examples/gl_interop_fire.py
@@ -0,0 +1,774 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# ################################################################################
+#
+# This example demonstrates cuda.core.Array, TextureObject, and SurfaceObject
+# in combination with GraphicsResource for CUDA/OpenGL interop: a classic
+# "Doom-style" procedural fire effect. An integer heat field lives on a pair of
+# ping-ponged UINT8 CUDA Arrays; each frame the field is advected upward with a
+# horizontal jitter and a small decay, then colorized through a 1D fire-palette
+# TextureObject straight into an OpenGL PBO. Requires pyglet.
+#
+# ################################################################################
+
+# What this example teaches
+# =========================
+# - How to combine a 2D UINT8 Array (the heat field) and a 1D RGBA8 Array (the
+#   color palette) under the same texture/surface API.
+# - How to ping-pong a scalar field via Array + SurfaceObject writes and
+#   TextureObject reads, similar to the reaction-diffusion example but with a
+#   single channel.
+# - How to use TextureObject(NORMALIZED_FLOAT) on a UINT8 palette so a
+#   tex1D<float4> lookup returns RGBA in [0, 1] -- no manual unpacking needed.
+# - How to wire mouse / keyboard events into a CUDA simulation without
+#   blocking the event loop.
+#
+# How it works
+# ============
+# The heat field is WIDTH x HEIGHT integer intensities in [0, 36]. Each frame:
+#
+#   1. step kernel: for every pixel,
+#        - if the mouse is held and the pixel is in the torch disc, write max;
+#        - else if y is the bottom row, write max intensity when ambient
+#          injection is on and zero when it is off ("the embers");
+#        - otherwise read a horizontally-jittered sample from the row "below"
+#          (i.e. one texel toward the bottom of the screen) and subtract a
+#          small decay. This is what creates the upward-flickering motion.
+#   2. colorize kernel: per pixel, sample the heat, look it up in a 1D RGBA8
+#      fire palette via tex1D<float4>, and write RGBA bytes into the PBO.
+#
+#   PING-PONG (two single-channel UINT8 Arrays)
+#   ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#   +-------------+   tex2D read      +-------------+
+#   |   heat_a    | ----------------> |             |
+#   | (UINT8 x 1) |                   |  step_fire  |
+#   +-------------+                   |   kernel    |
+#                                     |             |
+#   +-------------+   surf2Dwrite     |             |
+#   |   heat_b    | <---------------- |             |
+#   | (UINT8 x 1) |                   +-------------+
+#   +-------------+
+#       (swap)
+#
+# Orientation
+# -----------
+# OpenGL displays texel row 0 at the bottom of the window. The fullscreen quad
+# in create_display_resources() flips t so that kernel y=0 lands at the TOP of
+# the screen -- this lets the kernel keep the intuitive "inject at y = h-1,
+# advect from y+1 -> y" convention while the visible flames rise upward.
+# Mouse coordinates from pyglet (y=0 at window bottom) are flipped to the
+# kernel's y-down convention on entry.
+#
+# surf2Dwrite x-in-bytes
+# ----------------------
+# `surf2Dwrite` takes the x coordinate in BYTES, not in elements. The heat
+# surface here is UINT8 x 1 (one byte per texel), so the byte offset is just
+# `x`; a float surface would need `x * sizeof(float)` = `x * 4` instead.
+#
+# What you should see
+# ===================
+# A flickering wall of doom-style fire rising from the bottom of the window.
+# Hold the mouse button and drag to paint a torch of heat at the cursor.
+# Press SPACE to toggle the ambient embers along the bottom row (the fire
+# will die out when ambient is OFF). Press R to clear the heat field.
+# Press Escape or close the window to exit. The window title shows FPS and
+# whether ambient injection is currently on.
+#
+
+# /// script
+# dependencies = ["cuda_bindings", "cuda_core>0.6.0", "pyglet"]
+# ///
+
+import ctypes
+import sys
+import time
+
+import numpy as np
+
+from cuda.core import (
+    AddressMode,
+    Array,
+    ArrayFormat,
+    Device,
+    FilterMode,
+    GraphicsResource,
+    LaunchConfig,
+    Program,
+    ProgramOptions,
+    ReadMode,
+    ResourceDescriptor,
+    SurfaceObject,
+    TextureDescriptor,
+    TextureObject,
+    launch,
+)
+
+# ---------------------------------------------------------------------------
+# Simulation parameters (feel free to change these)
+# ---------------------------------------------------------------------------
+# Window dimensions (what the user sees).
+WINDOW_WIDTH = 640
+WINDOW_HEIGHT = 480
+
+# Simulation dimensions (the heat-field grid). Doom's actual screen was
+# 320x200; we use 320x100 so the canonical decay rate of ~1 intensity unit
+# per row (random {0, 1, 2}, average 1) produces flames that reach ~36% of
+# the screen height -- the recognizable "tall licking flames" look.
+# NEAREST-filtered upscale to the 640x480 window stretches vertically 4.8x,
+# giving the chunky retro pixel-doubled appearance.
+WIDTH = 320
+HEIGHT = 100
+
+# Canonical Doom fire palette: 37 hand-tuned colors (intensity 0..36 -> RGB).
+# Source: https://github.com/tiagomenegaz/doom-fire (and Fabien Sanglard's
+# analysis of the original PSX Doom fire effect).
+PALETTE_SIZE = 37
+MAX_INTENSITY = 36
+TORCH_RADIUS = 12  # pixel radius of the mouse-painted hot disc (sim space)
+
+
+# ============================= Helper functions =============================
+#
+# The functions below set up CUDA and OpenGL. If you're here to learn about
+# Array/TextureObject/SurfaceObject, skip ahead to main() -- the interesting
+# part is there. These helpers exist so that main() reads like a short story
+# instead of a wall of boilerplate.
+# ============================================================================
+
+
+def setup_cuda():
+    """Compile both kernels; return (device, stream, kernels, configs). Exits if SM < 3.0."""
+    dev = Device(0)
+    dev.set_current()
+
+    # SurfaceObject requires surface load/store, which has existed since SM 2.0,
+    # but bindless surface objects (cuSurfObjectCreate) require SM 3.0+.
+    cc = dev.compute_capability
+    if cc.major < 3:
+        print(
+            "This example requires a GPU with compute capability >= 3.0 for "
+            f"bindless surface objects. Found sm_{cc.major}{cc.minor}.",
+            file=sys.stderr,
+        )
+        sys.exit(1)
+
+    stream = dev.create_stream()
+
+    # Compile as C++ so the templated tex1D<float4> / tex2D<float> overloads
+    # resolve.
+    program_options = ProgramOptions(std="c++17", arch=f"sm_{dev.arch}")
+    prog = Program(KERNEL_SOURCE, code_type="c++", options=program_options)
+    mod = prog.compile(
+        "cubin",
+        name_expressions=("step_fire", "colorize_fire"),
+    )
+
+    kernels = {
+        "step": mod.get_kernel("step_fire"),
+        "colorize": mod.get_kernel("colorize_fire"),
+    }
+
+    block = (16, 16, 1)
+    grid = (
+        (WIDTH + block[0] - 1) // block[0],  # ceil(WIDTH / block_x): partial blocks cover the edge
+        (HEIGHT + block[1] - 1) // block[1],  # ceil(HEIGHT / block_y)
+        1,
+    )
+    config = LaunchConfig(grid=grid, block=block)
+    # Both kernels are pixel-parallel over a WIDTH x HEIGHT grid, so they share one config.
+    configs = {"step": config, "colorize": config}
+
+    return dev, stream, kernels, configs
+
+
+def create_window():
+    """Open a pyglet window and return (window, gl_module, pyglet); exit(1) if pyglet is missing."""
+    try:
+        import pyglet  # deferred import: a missing pyglet produces the message below
+        from pyglet.gl import gl as _gl
+    except ImportError:
+        print(
+            "This example requires pyglet >= 2.0.\nInstall it with:  pip install pyglet",
+            file=sys.stderr,
+        )
+        sys.exit(1)
+
+    window = pyglet.window.Window(
+        WINDOW_WIDTH,
+        WINDOW_HEIGHT,
+        caption="cuda.core Array/Texture/Surface - Doom Fire",
+        vsync=False,
+    )
+    return window, _gl, pyglet
+
+
+def create_display_resources(gl, width, height):
+    """Create the GL objects needed to show a texture on screen.
+
+    Standard OpenGL boilerplate for a textured fullscreen quad. The texcoord
+    `t` is flipped versus the plasma example so that kernel y=0 lands at the
+    TOP of the screen. That lets the fire kernel keep the intuitive
+    "inject at the largest y, advect upward" convention while the visible
+    flames rise toward the top.
+
+    Returns (shader_program, vertex_array_id, texture_id).
+    """
+    from pyglet.graphics.shader import Shader, ShaderProgram
+
+    vert = Shader(VERTEX_SHADER_SOURCE, "vertex")
+    frag = Shader(FRAGMENT_SHADER_SOURCE, "fragment")
+    shader_prog = ShaderProgram(vert, frag)
+
+    # Fullscreen quad (two triangles covering the entire window). Note the
+    # flipped t coordinates compared to gl_interop_plasma: (-1, -1) gets t=1
+    # so screen-bottom samples the kernel's largest-y row.
+    quad_verts = np.array(
+        [
+            # x,  y,    s, t      (position + texture coordinate)
+            -1, -1, 0, 1,
+             1, -1, 1, 1,
+             1,  1, 1, 0,
+            -1, -1, 0, 1,
+             1,  1, 1, 0,
+            -1,  1, 0, 0,
+        ],
+        dtype=np.float32,
+    )
+
+    vao = ctypes.c_uint(0)
+    gl.glGenVertexArrays(1, ctypes.byref(vao))
+    gl.glBindVertexArray(vao.value)
+
+    vbo = ctypes.c_uint(0)
+    gl.glGenBuffers(1, ctypes.byref(vbo))
+    gl.glBindBuffer(gl.GL_ARRAY_BUFFER, vbo.value)
+    gl.glBufferData(
+        gl.GL_ARRAY_BUFFER,
+        quad_verts.nbytes,
+        quad_verts.ctypes.data_as(ctypes.c_void_p),
+        gl.GL_STATIC_DRAW,
+    )
+
+    stride = 4 * 4  # 4 floats * 4 bytes = 16 bytes per vertex; texcoord starts at byte 8
+    pos_loc = gl.glGetAttribLocation(shader_prog.id, b"position")
+    gl.glEnableVertexAttribArray(pos_loc)
+    gl.glVertexAttribPointer(pos_loc, 2, gl.GL_FLOAT, gl.GL_FALSE, stride, ctypes.c_void_p(0))
+
+    tc_loc = gl.glGetAttribLocation(shader_prog.id, b"texcoord")
+    gl.glEnableVertexAttribArray(tc_loc)
+    gl.glVertexAttribPointer(tc_loc, 2, gl.GL_FLOAT, gl.GL_FALSE, stride, ctypes.c_void_p(8))
+
+    gl.glBindVertexArray(0)
+
+    # Allocate RGBA8 texture storage with no initial data (filled each frame from the PBO).
+    tex = ctypes.c_uint(0)
+    gl.glGenTextures(1, ctypes.byref(tex))
+    gl.glBindTexture(gl.GL_TEXTURE_2D, tex.value)
+    # NEAREST upscale: makes the low-res simulation render with crisp,
+    # blocky pixels instead of bilinear-blended mush. Critical to the
+    # Doom-fire look.
+    gl.glTexParameteri(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_MIN_FILTER, gl.GL_NEAREST)
+    gl.glTexParameteri(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_MAG_FILTER, gl.GL_NEAREST)
+    gl.glTexImage2D(
+        gl.GL_TEXTURE_2D,
+        0,  # mipmap level
+        gl.GL_RGBA8,
+        width,
+        height,
+        0,  # border (must be 0)
+        gl.GL_RGBA,
+        gl.GL_UNSIGNED_BYTE,
+        None,  # no initial pixel data
+    )
+
+    return shader_prog, vao.value, tex.value
+
+
+def create_pixel_buffer(gl, width, height):
+    """Create a Pixel Buffer Object (PBO) -- the bridge between CUDA and OpenGL.
+
+    Returns (pbo_gl_name, size_in_bytes).
+    """
+    pbo = ctypes.c_uint(0)
+    gl.glGenBuffers(1, ctypes.byref(pbo))
+    gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, pbo.value)
+    nbytes = width * height * 4  # RGBA, 1 byte per channel
+    gl.glBufferData(gl.GL_PIXEL_UNPACK_BUFFER, nbytes, None, gl.GL_DYNAMIC_DRAW)  # None: allocate only
+    gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, 0)  # unbind the PBO again
+    return pbo.value, nbytes
+
+
+def copy_pbo_to_texture(gl, pbo_id, tex_id, width, height):
+    """Copy pixel data from the PBO into the GL texture (GPU-to-GPU)."""
+    gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, pbo_id)
+    gl.glBindTexture(gl.GL_TEXTURE_2D, tex_id)
+    gl.glTexSubImage2D(
+        gl.GL_TEXTURE_2D,
+        0,  # mipmap level
+        0,  # x offset
+        0,  # y offset
+        width,
+        height,
+        gl.GL_RGBA,
+        gl.GL_UNSIGNED_BYTE,
+        None,  # None = read from the currently bound PBO, not from CPU
+    )
+    gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, 0)  # unbind the PBO again
+
+
+def draw_fullscreen_quad(gl, shader_prog, vao_id, tex_id):
+    """Draw the texture to the screen using the fullscreen quad."""
+    gl.glUseProgram(shader_prog.id)
+    gl.glBindTexture(gl.GL_TEXTURE_2D, tex_id)
+    gl.glBindVertexArray(vao_id)
+    gl.glDrawArrays(gl.GL_TRIANGLES, 0, 6)  # 6 vertices = 2 triangles
+    gl.glBindVertexArray(0)  # unbind the VAO and the program again
+    gl.glUseProgram(0)
+
+
+def make_heat_arrays():
+    """Allocate two single-channel UINT8 ping-pong Arrays for the heat field.
+
+    Intensity is an integer in [0, 36] indexing the canonical Doom palette.
+    UINT8 is exactly one byte per texel -- surf2Dwrite x-coord = x * 1.
+    """
+    arr_a = Array.from_descriptor(
+        shape=(WIDTH, HEIGHT),  # width-first order
+        format=ArrayFormat.UINT8,
+        num_channels=1,
+        surface_load_store=True,  # opt-in flag needed to bind the Array as a SurfaceObject
+    )
+    arr_b = Array.from_descriptor(
+        shape=(WIDTH, HEIGHT),
+        format=ArrayFormat.UINT8,
+        num_channels=1,
+        surface_load_store=True,
+    )
+    return arr_a, arr_b
+
+
+def make_heat_texture(arr):
+    """Bind `arr` as a TextureObject configured for POINT + CLAMP reads.
+
+    POINT filtering is what gives Doom fire its chunky retro look. LINEAR
+    smooths the per-frame horizontal jitter into a uniform glow that
+    doesn't read as fire.
+    """
+    res_desc = ResourceDescriptor.from_array(arr)
+    tex_desc = TextureDescriptor(
+        address_mode=AddressMode.CLAMP,
+        filter_mode=FilterMode.POINT,
+        read_mode=ReadMode.ELEMENT_TYPE,  # fetch the stored element type, no float normalization
+        # Non-normalized: the step kernel addresses texels in pixel space.
+        normalized_coords=False,
+    )
+    return TextureObject.from_descriptor(resource=res_desc, texture_descriptor=tex_desc)
+
+
+def build_fire_palette():
+    """Return the canonical Doom fire palette as a (37, 4) uint8 array.
+
+    The 37 entries map intensity 0 (black) -> 36 (white). Each entry is
+    indexed by the integer intensity in the heat field.
+
+    Source: Fabien Sanglard's PSX Doom analysis, reproduced in
+    https://github.com/tiagomenegaz/doom-fire.
+    """
+    rgb = [
+        (  7,   7,   7), ( 31,   7,   7), ( 47,  15,   7), ( 71,  15,   7),
+        ( 87,  23,   7), (103,  31,   7), (119,  31,   7), (143,  39,   7),
+        (159,  47,   7), (175,  63,   7), (191,  71,   7), (199,  71,   7),
+        (223,  79,   7), (223,  87,   7), (223,  87,   7), (215,  95,   7),
+        (215,  95,   7), (215, 103,  15), (207, 111,  15), (207, 119,  15),
+        (207, 127,  15), (207, 135,  23), (199, 135,  23), (199, 143,  23),
+        (199, 151,  31), (191, 159,  31), (191, 159,  31), (191, 167,  39),
+        (191, 167,  39), (191, 175,  47), (183, 175,  47), (183, 183,  47),
+        (183, 183,  55), (207, 207, 111), (223, 223, 159), (239, 239, 199),
+        (255, 255, 255),
+    ]
+    # Index 0 (the "no fire" color) is rendered as pure black so dead pixels
+    # don't glow. The canonical (7, 7, 7) reads as a dim background which is
+    # less dramatic against the dark window.
+    rgb[0] = (0, 0, 0)
+    assert len(rgb) == PALETTE_SIZE
+    rgba = np.empty((PALETTE_SIZE, 4), dtype=np.uint8)
+    rgba[:, :3] = np.array(rgb, dtype=np.uint8)
+    rgba[:, 3] = 255  # alpha channel: fully opaque for every entry
+    return rgba
+
+
+def make_palette_array_and_texture(stream):
+    """Allocate the 1D RGBA8 palette Array, upload, and bind as a texture.
+
+    Returns (palette_array, palette_texture). Both must be closed by the
+    caller (or used inside `with` blocks).
+    """
+    palette = build_fire_palette()  # shape (PALETTE_SIZE, 4), uint8
+    arr = Array.from_descriptor(
+        shape=(PALETTE_SIZE,),
+        format=ArrayFormat.UINT8,
+        num_channels=4,
+    )
+    # 1D Array bytes match a flat (PALETTE_SIZE * 4) uint8 buffer.
+    arr.copy_from(np.ascontiguousarray(palette), stream=stream)
+
+    res_desc = ResourceDescriptor.from_array(arr)
+    tex_desc = TextureDescriptor(
+        address_mode=AddressMode.CLAMP,
+        # POINT keeps the palette stops as discrete color bands -- the
+        # classic Doom fire palette is indexed, not gradient-blended.
+        filter_mode=FilterMode.POINT,
+        # NORMALIZED_FLOAT: tex1D<float4> returns each UINT8 channel as a
+        # float in [0, 1], so the colorize kernel can multiply by 255 and
+        # store directly without manual unpacking.
+        read_mode=ReadMode.NORMALIZED_FLOAT,
+        # Normalized: the LUT is addressed with a coordinate in [0, 1]
+        # rather than a texel index. Combined with POINT filtering above,
+        # each lookup returns exactly one palette entry (no blending).
+        normalized_coords=True,
+    )
+    tex = TextureObject.from_descriptor(resource=res_desc, texture_descriptor=tex_desc)
+    return arr, tex
+
+
+# ================================== main() ==================================
+
+
+def main():
+    # --- Step 1: Set up CUDA (compile kernels, create stream) ---
+    dev, stream, kernels, configs = setup_cuda()
+
+    # --- Step 2: Open a window ---
+    window, gl, pyglet = create_window()
+
+    # --- Step 3: Create GL resources for drawing a texture to screen ---
+    #     (Standard OpenGL boilerplate -- not CUDA-specific.)
+    shader_prog, quad_vao, tex_id = create_display_resources(gl, WIDTH, HEIGHT)
+
+    # --- Step 4: Create the Pixel Buffer Object (PBO) ---
+    pbo_id, _ = create_pixel_buffer(gl, WIDTH, HEIGHT)
+
+    # --- Step 5: Register the PBO with CUDA ---
+    resource = GraphicsResource.from_gl_buffer(pbo_id, flags="write_discard")
+
+    # --- Step 6: Allocate heat-field Arrays, palette Array, and the five
+    #             bindless handles (three textures + two surfaces). We hold
+    #             them open for the lifetime of the window and release in
+    #             on_close(), matching the reaction-diffusion example. (Using
+    #             `with` blocks here would close everything before the pyglet
+    #             event loop has a chance to use them.)
+    arr_a, arr_b = make_heat_arrays()
+    palette_arr, palette_tex = make_palette_array_and_texture(stream)
+    tex_a = make_heat_texture(arr_a)
+    tex_b = make_heat_texture(arr_b)
+    surf_a = SurfaceObject.from_array(arr_a)
+    surf_b = SurfaceObject.from_array(arr_b)
+
+    # Mutable state shared by the event handlers; clear_field() seeds the heat.
+    state = {
+        "current": "a",            # which array holds the latest heat field
+        "frame_index": 0,           # passed into the step kernel as `t`
+        "ambient": True,            # SPACE toggles bottom-row injection
+        "mouse_down": False,
+        "mouse_x": 0,
+        "mouse_y": 0,
+    }
+
+    def current_read_write():
+        if state["current"] == "a":
+            return tex_a, surf_b, "b"  # read a, write b, next current = b
+        return tex_b, surf_a, "a"
+
+    def clear_field():
+        """Zero both heat arrays and seed the bottom row at full intensity.
+
+        Array.copy_from is the simplest reset path -- a dedicated clear
+        kernel would be faster but is unnecessary for an interactive demo.
+        The bottom row is set to MAX_INTENSITY so the very first frame
+        already has a fire source to advect from.
+        """
+        seed = np.zeros((HEIGHT, WIDTH), dtype=np.uint8)
+        seed[HEIGHT - 1, :] = MAX_INTENSITY  # canonical Doom fire source
+        arr_a.copy_from(np.ascontiguousarray(seed), stream=stream)
+        arr_b.copy_from(np.ascontiguousarray(seed), stream=stream)
+        state["current"] = "a"
+
+    # Seed at startup so frame 1 already has a source row.
+    clear_field()
+    stream.sync()
+
+    # --- Step 7: Render loop ---
+    start_time = time.monotonic()
+    frame_count = 0
+    fps_time = start_time
+
+    @window.event
+    def on_key_press(symbol, _modifiers):
+        key = pyglet.window.key
+        if symbol == key.ESCAPE:
+            window.close()
+            return
+        if symbol == key.SPACE:
+            state["ambient"] = not state["ambient"]
+            return
+        if symbol == key.R:
+            clear_field()
+            return
+
+    # Map window coords (WINDOW_WIDTH x WINDOW_HEIGHT, y=0 at bottom) to
+    # simulation coords (WIDTH x HEIGHT, y=0 at top).
+    def _window_to_sim(x, y):
+        sx = int(x * WIDTH / WINDOW_WIDTH)
+        sy = int((WINDOW_HEIGHT - 1 - y) * HEIGHT / WINDOW_HEIGHT)
+        return sx, sy
+
+    @window.event
+    def on_mouse_press(x, y, _button, _modifiers):
+        state["mouse_down"] = True
+        state["mouse_x"], state["mouse_y"] = _window_to_sim(x, y)
+
+    @window.event
+    def on_mouse_release(_x, _y, _button, _modifiers):
+        state["mouse_down"] = False
+
+    @window.event
+    def on_mouse_drag(x, y, _dx, _dy, _buttons, _modifiers):
+        state["mouse_down"] = True
+        state["mouse_x"], state["mouse_y"] = _window_to_sim(x, y)
+
+    @window.event
+    def on_draw():
+        nonlocal frame_count, fps_time
+
+        window.clear()
+
+        # (a) Advance the heat field by one step.
+        tex_read, surf_write, next_current = current_read_write()
+        launch(
+            stream,
+            configs["step"],
+            kernels["step"],
+            np.uint64(tex_read.handle),
+            np.uint64(surf_write.handle),
+            np.int32(WIDTH),
+            np.int32(HEIGHT),
+            np.uint32(state["frame_index"]),
+            np.int32(state["mouse_x"]),
+            np.int32(state["mouse_y"]),
+            np.int32(1 if state["mouse_down"] else 0),
+            np.int32(1 if state["ambient"] else 0),
+        )
+        state["current"] = next_current
+        state["frame_index"] += 1
+
+        # (b) Colorize the latest state into the OpenGL PBO.
+        tex_heat = tex_a if state["current"] == "a" else tex_b
+        with resource.map(stream=stream) as buf:
+            launch(
+                stream,
+                configs["colorize"],
+                kernels["colorize"],
+                np.uint64(tex_heat.handle),
+                np.uint64(palette_tex.handle),
+                buf.handle,
+                np.int32(WIDTH),
+                np.int32(HEIGHT),
+            )
+        # Unmap happens automatically when the `with` block exits.
+
+        # (c) Tell OpenGL to copy the PBO contents into our texture.
+        copy_pbo_to_texture(gl, pbo_id, tex_id, WIDTH, HEIGHT)
+
+        # (d) Draw the texture to the screen.
+        draw_fullscreen_quad(gl, shader_prog, quad_vao, tex_id)
+
+        # FPS counter (shown in window title)
+        frame_count += 1
+        now = time.monotonic()
+        if now - fps_time >= 1.0:
+            fps = frame_count / (now - fps_time)
+            ambient_label = "on" if state["ambient"] else "off"
+            window.set_caption(
+                "cuda.core Array/Texture/Surface - Doom Fire"
+                f" ({WIDTH}x{HEIGHT}, {fps:.0f} FPS,"
+                f" ambient {ambient_label})"
+            )
+            frame_count = 0
+            fps_time = now
+
+    @window.event
+    def on_close():
+        # Release everything explicitly (pyglet owns the event loop, so `with`
+        # blocks are not an option): the GL registration and texture/surface
+        # handles first, then the Arrays backing them, and the stream last.
+        resource.close()
+        tex_a.close()
+        tex_b.close()
+        surf_a.close()
+        surf_b.close()
+        palette_tex.close()
+        palette_arr.close()
+        arr_a.close()
+        arr_b.close()
+        stream.close()
+
+    pyglet.app.run(interval=0)
+
+
+# ======================== GPU code (CUDA + GLSL) ============================
+#
+# These source strings are kept at the bottom of the file so they don't
+# distract from the Python logic above. The important things to know:
+#
+#   - KERNEL_SOURCE contains two CUDA C++ kernels:
+#       * step_fire     -- advances the heat field. Reads previous state via a
+#                          TextureObject (POINT + CLAMP, non-normalized) and
+#                          writes the next state via a SurfaceObject. Bakes
+#                          the bottom-row injection, mouse torch, and upward
+#                          jittered advection into a single pass.
+#       * colorize_fire -- per pixel: read heat from the heat TextureObject,
+#                          look up the fire palette via tex1D<float4>, write
+#                          RGBA bytes to the OpenGL PBO.
+#
+#   - VERTEX_SHADER_SOURCE / FRAGMENT_SHADER_SOURCE are GLSL. They draw a
+#     texture onto a rectangle covering the entire window. The quad's t
+#     coordinate is flipped versus the plasma example so that y=0 maps to the
+#     top of the screen (see create_display_resources for why).
+#
+# ============================================================================
+
+KERNEL_SOURCE = r"""
+// Small, deterministic, GPU-friendly hash. Returns a value in [0, 1).
+// Used both for bottom-row ember intensity and for the per-pixel jitter that
+// gives the fire its characteristic horizontal flicker.
+__device__ __forceinline__ float hash3(unsigned int x, unsigned int y,
+                                       unsigned int t) {
+    unsigned int h = x * 374761393u + y * 668265263u + t * 2246822519u;
+    h = (h ^ (h >> 13)) * 1274126177u;
+    h ^= (h >> 16);
+    return (float)(h & 0x00ffffffu) / (float)0x01000000u;
+}
+
+// Canonical Doom-fire step (gather form of the original scatter algorithm).
+//
+// Reference scatter (one cell per JS source row):
+//     decay = random in {0, 1, 2}
+//     below = state[x, y+1]
+//     new = max(0, below - decay)
+//     state[x - decay, y] = new        // writes LEFT of source -> leftward lean
+//
+// Equivalent gather (one CUDA thread per destination cell):
+//     decay = hash(x, y, t) in {0, 1, 2}
+//     below = state[x + decay, y+1]    // reads from the right-shifted source
+//     new = max(0, below - decay)
+//     state[x, y] = new
+//
+// The right-shifted gather reads the same data the leftward-shifted scatter
+// would have produced.
+
+extern "C"
+__global__
+void step_fire(cudaTextureObject_t tex_read,
+               cudaSurfaceObject_t surf_write,
+               int width, int height,
+               unsigned int t,
+               int mouse_x, int mouse_y, int mouse_active,
+               int ambient_on) {
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    if (x >= width || y >= height) return;
+
+    const int MAX_I = 36;
+
+    // 1) Mouse torch: a hot disc painted at the cursor (overrides everything).
+    if (mouse_active) {
+        int dx = x - mouse_x;
+        int dy = y - mouse_y;
+        if (dx * dx + dy * dy <= 12 * 12) {  // matches host TORCH_RADIUS
+            surf2Dwrite((unsigned char)MAX_I, surf_write, x, y);
+            return;
+        }
+    }
+
+    // 2) Bottom row is the steady fire source. Hardcoded to MAX_I when the
+    //    ambient ember bed is on; zero otherwise (lets the fire die down).
+    if (y == height - 1) {
+        surf2Dwrite((unsigned char)(ambient_on ? MAX_I : 0),
+                    surf_write, x, y);
+        return;
+    }
+
+    // 3) Gather from the row below with random {0, 1, 2} horizontal shift
+    //    and matching intensity decay -- the canonical Doom-fire update.
+    float jitter_h = hash3((unsigned int)x, (unsigned int)y, t);
+    int decay = (int)(jitter_h * 3.0f);             // 0, 1, or 2
+    int src_x = x + decay;
+    if (src_x >= width) src_x = width - 1;
+    unsigned char below = tex2D<unsigned char>(tex_read,
+                                               (float)src_x + 0.5f,
+                                               (float)y + 1.5f);
+    int new_i = (int)below - decay;
+    if (new_i < 0) new_i = 0;
+
+    // UINT8 is 1 byte, so surf2Dwrite's x argument is already the byte offset.
+    surf2Dwrite((unsigned char)new_i, surf_write, x, y);
+}
+
+extern "C"
+__global__
+void colorize_fire(cudaTextureObject_t tex_heat,
+                   cudaTextureObject_t palette_tex,
+                   unsigned char* output,
+                   int width, int height) {
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    if (x >= width || y >= height) return;
+
+    // Heat texture is UINT8 + ELEMENT_TYPE: tex2D<unsigned char> returns the
+    // raw intensity byte (0..36).
+    unsigned char h = tex2D<unsigned char>(tex_heat,
+                                           (float)x + 0.5f,
+                                           (float)y + 0.5f);
+
+    // Palette texture is 1D normalized RGBA8 with POINT filtering and 37
+    // entries. Index i lands at coord (i + 0.5) / 37 -- the texel center,
+    // which POINT samples exactly.
+    const float palette_size = 37.0f;
+    float u = ((float)h + 0.5f) / palette_size;
+    float4 c = tex1D<float4>(palette_tex, u);
+
+    int idx = (y * width + x) * 4;
+    output[idx + 0] = (unsigned char)(c.x * 255.0f);
+    output[idx + 1] = (unsigned char)(c.y * 255.0f);
+    output[idx + 2] = (unsigned char)(c.z * 255.0f);
+    output[idx + 3] = 255;
+}
+"""
+
+# GLSL shaders -- these just display a texture on a fullscreen rectangle.
+# Nothing CUDA-specific here.
+
+VERTEX_SHADER_SOURCE = """#version 330 core
+in vec2 position;
+in vec2 texcoord;
+out vec2 v_texcoord;
+void main() {
+    gl_Position = vec4(position, 0.0, 1.0);
+    v_texcoord = texcoord;
+}
+"""
+
+FRAGMENT_SHADER_SOURCE = """#version 330 core
+in vec2 v_texcoord;
+out vec4 fragColor;
+uniform sampler2D tex;
+void main() {
+    fragColor = texture(tex, v_texcoord);
+}
+"""
+
+
+if __name__ == "__main__":
+    main()
diff --git a/cuda_core/examples/gl_interop_image_show.py b/cuda_core/examples/gl_interop_image_show.py
new file mode 100644
index 00000000000..4bdd55e1569
--- /dev/null
+++ b/cuda_core/examples/gl_interop_image_show.py
@@ -0,0 +1,428 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# ################################################################################
+#
+# Minimal "Hello World" for the cuda.core texture/surface stack.
+#
+# Allocates a small `Array`, fills it with a procedural image once, binds it
+# as a `TextureObject`, and uses a single CUDA kernel to sample that texture
+# at every screen pixel (with a scale + rotation transform) and write the
+# result into an OpenGL PBO for display.
+#
+# Nothing else: no `SurfaceObject`, no ping-pong, no simulation, no mipmaps.
+# If you have never touched the new APIs before, open this file first.
+#
+# ################################################################################
+#
+# What this example teaches
+# =========================
+# - Allocate an `Array` and upload data into it with `Array.copy_from`.
+# - Build a `TextureObject` from a `ResourceDescriptor` + `TextureDescriptor`.
+# - The visual difference between `FilterMode.POINT` and `FilterMode.LINEAR`
+#   (press F to toggle live).
+# - That filter mode is baked into the `TextureDescriptor` at creation time,
+#   so changing it requires destroying and rebuilding the `TextureObject`.
+#
+# How it works
+# ============
+#   Startup (once):
+#     +-------------------+   copy_from   +----------+
+#     | host numpy image  | ------------> |  Array   |  (UINT8 RGBA, 64x64)
+#     +-------------------+               +----+-----+
+#                                              |
+#                                              v
+#                                       +-------------+
+#                                       | TextureObj  |  (filter mode = POINT)
+#                                       +-------------+
+#
+#   Each frame:
+#     - kernel `sample_image` reads from the TextureObject at a transformed
+#       (u, v) per screen pixel and writes RGBA bytes to the GL PBO.
+#     - OpenGL copies the PBO into a screen texture and draws it.
+#
+# What you should see
+# ===================
+# A 64x64 procedural test pattern (checkerboard + one color-gradient strip +
+# diagonal hairlines) drawn at ~75% of the window height; CLAMP addressing
+# smears its edge texels across the rest. Press F to toggle POINT (blocky)
+# vs LINEAR (smooth) sampling, R to start/stop a slow rotation, Esc to quit.
+#
+
+# /// script
+# dependencies = ["cuda_bindings", "cuda_core>0.6.0", "pyglet"]
+# ///
+
+import ctypes
+import math
+import sys
+import time
+
+import numpy as np
+
+from cuda.core import (
+    AddressMode,
+    Array,
+    ArrayFormat,
+    Device,
+    FilterMode,
+    GraphicsResource,
+    LaunchConfig,
+    Program,
+    ProgramOptions,
+    ReadMode,
+    ResourceDescriptor,
+    TextureDescriptor,
+    TextureObject,
+    launch,
+)
+
+# ---------------------------------------------------------------------------
+# Constants
+# ---------------------------------------------------------------------------
+WIDTH = 640
+HEIGHT = 480
+IMAGE_SIZE = 64  # the source Array is IMAGE_SIZE x IMAGE_SIZE RGBA8
+
+
+# ============================= Helper functions =============================
+
+
+def make_test_image(size):
+    """Build a (size, size, 4) uint8 RGBA test pattern.
+
+    Layers, later ones overwriting earlier ones: a black/white checkerboard
+    with cells of max(1, size // 8) pixels (so size < 8 cannot divide by
+    zero), a red-to-green vertical gradient over the left third, and two red
+    diagonal hairlines. Hard edges make POINT vs LINEAR filtering obvious.
+    """
+    img = np.zeros((size, size, 4), dtype=np.uint8)
+    # Checkerboard: a pixel is white when its cell row + cell column is odd.
+    # Built as one boolean mask instead of a size*size Python loop.
+    cells = max(1, size // 8)
+    cell_index = np.arange(size) // cells
+    odd = ((cell_index[:, None] + cell_index[None, :]) & 1).astype(bool)
+    img[odd, :3] = 255
+    # Gradient strip down the left third: red fades out, green fades in.
+    # Broadcasting a (size, 1) column replaces the explicit repeat().
+    strip = size // 3
+    img[:, :strip, 0] = np.linspace(255, 0, size, dtype=np.uint8)[:, None]
+    img[:, :strip, 1] = np.linspace(0, 255, size, dtype=np.uint8)[:, None]
+    img[:, :strip, 2] = 128
+    # Two red hairlines: the main diagonal and a copy shifted 4 px right.
+    diag = np.arange(size)
+    img[diag, diag] = (255, 0, 0, 255)
+    img[diag[:-4], diag[:-4] + 4] = (255, 0, 0, 255)
+    img[:, :, 3] = 255  # opaque
+    return img
+
+
+def setup_cuda():
+    """Compile the kernel and return (device, stream, kernel, launch_config)."""
+    dev = Device(0)
+    dev.set_current()
+    stream = dev.create_stream()
+
+    program_options = ProgramOptions(std="c++17", arch=f"sm_{dev.arch}")
+    prog = Program(KERNEL_SOURCE, code_type="c++", options=program_options)
+    mod = prog.compile("cubin", name_expressions=("sample_image",))
+    kernel = mod.get_kernel("sample_image")
+
+    block = (16, 16, 1)
+    grid = (
+        (WIDTH + block[0] - 1) // block[0],
+        (HEIGHT + block[1] - 1) // block[1],
+        1,
+    )
+    config = LaunchConfig(grid=grid, block=block)
+    return dev, stream, kernel, config
+
+
+def create_window():
+    """Open a pyglet window. Returns (window, gl_module, pyglet_module)."""
+    try:
+        import pyglet
+        from pyglet.gl import gl as _gl
+    except ImportError:
+        print(
+            "This example requires pyglet >= 2.0.\nInstall it with:  pip install pyglet",
+            file=sys.stderr,
+        )
+        sys.exit(1)
+
+    window = pyglet.window.Window(
+        WIDTH,
+        HEIGHT,
+        caption="cuda.core Array + TextureObject - Image Show",
+        vsync=False,
+    )
+    return window, _gl, pyglet
+
+
+def create_display_resources(gl, width, height):
+    """Standard pyglet boilerplate: shader, fullscreen quad, screen texture."""
+    from pyglet.graphics.shader import Shader, ShaderProgram
+
+    vert = Shader(VERTEX_SHADER_SOURCE, "vertex")
+    frag = Shader(FRAGMENT_SHADER_SOURCE, "fragment")
+    shader_prog = ShaderProgram(vert, frag)
+
+    quad_verts = np.array(
+        [
+            -1, -1, 0, 0,
+             1, -1, 1, 0,
+             1,  1, 1, 1,
+            -1, -1, 0, 0,
+             1,  1, 1, 1,
+            -1,  1, 0, 1,
+        ],
+        dtype=np.float32,
+    )
+
+    vao = ctypes.c_uint(0)
+    gl.glGenVertexArrays(1, ctypes.byref(vao))
+    gl.glBindVertexArray(vao.value)
+
+    vbo = ctypes.c_uint(0)
+    gl.glGenBuffers(1, ctypes.byref(vbo))
+    gl.glBindBuffer(gl.GL_ARRAY_BUFFER, vbo.value)
+    gl.glBufferData(
+        gl.GL_ARRAY_BUFFER,
+        quad_verts.nbytes,
+        quad_verts.ctypes.data_as(ctypes.c_void_p),
+        gl.GL_STATIC_DRAW,
+    )
+
+    stride = 4 * 4
+    pos_loc = gl.glGetAttribLocation(shader_prog.id, b"position")
+    gl.glEnableVertexAttribArray(pos_loc)
+    gl.glVertexAttribPointer(pos_loc, 2, gl.GL_FLOAT, gl.GL_FALSE, stride, ctypes.c_void_p(0))
+    tc_loc = gl.glGetAttribLocation(shader_prog.id, b"texcoord")
+    gl.glEnableVertexAttribArray(tc_loc)
+    gl.glVertexAttribPointer(tc_loc, 2, gl.GL_FLOAT, gl.GL_FALSE, stride, ctypes.c_void_p(8))
+    gl.glBindVertexArray(0)
+
+    tex = ctypes.c_uint(0)
+    gl.glGenTextures(1, ctypes.byref(tex))
+    gl.glBindTexture(gl.GL_TEXTURE_2D, tex.value)
+    gl.glTexParameteri(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_MIN_FILTER, gl.GL_NEAREST)
+    gl.glTexParameteri(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_MAG_FILTER, gl.GL_NEAREST)
+    gl.glTexImage2D(
+        gl.GL_TEXTURE_2D, 0, gl.GL_RGBA8, width, height, 0,
+        gl.GL_RGBA, gl.GL_UNSIGNED_BYTE, None,
+    )
+    return shader_prog, vao.value, tex.value
+
+
+def create_pixel_buffer(gl, width, height):
+    """Create the GL PBO that CUDA writes RGBA pixels into each frame."""
+    pbo = ctypes.c_uint(0)
+    gl.glGenBuffers(1, ctypes.byref(pbo))
+    gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, pbo.value)
+    nbytes = width * height * 4
+    gl.glBufferData(gl.GL_PIXEL_UNPACK_BUFFER, nbytes, None, gl.GL_DYNAMIC_DRAW)
+    gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, 0)
+    return pbo.value
+
+
+def copy_pbo_to_texture(gl, pbo_id, tex_id, width, height):
+    gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, pbo_id)
+    gl.glBindTexture(gl.GL_TEXTURE_2D, tex_id)
+    gl.glTexSubImage2D(
+        gl.GL_TEXTURE_2D, 0, 0, 0, width, height,
+        gl.GL_RGBA, gl.GL_UNSIGNED_BYTE, None,
+    )
+    gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, 0)
+
+
+def draw_fullscreen_quad(gl, shader_prog, vao_id, tex_id):
+    gl.glUseProgram(shader_prog.id)
+    gl.glBindTexture(gl.GL_TEXTURE_2D, tex_id)
+    gl.glBindVertexArray(vao_id)
+    gl.glDrawArrays(gl.GL_TRIANGLES, 0, 6)
+    gl.glBindVertexArray(0)
+    gl.glUseProgram(0)
+
+
+def make_texture(arr, filter_mode):
+    """Return a new `TextureObject` that samples `arr` using `filter_mode`.
+
+    The filter mode is fixed when the object is created, so switching modes
+    means closing the old TextureObject and calling this helper again.
+    """
+    res_desc = ResourceDescriptor.from_array(arr)
+    tex_desc = TextureDescriptor(
+        address_mode=AddressMode.CLAMP,  # out-of-range u/v reuse the edge texel
+        filter_mode=filter_mode,
+        # UINT8 source + NORMALIZED_FLOAT means tex2D<float4> returns each
+        # channel as a float in [0, 1]; sample_image rescales it to a byte.
+        read_mode=ReadMode.NORMALIZED_FLOAT,
+        normalized_coords=True,  # kernel addresses the image as [0, 1] x [0, 1]
+    )
+    return TextureObject.from_descriptor(resource=res_desc, texture_descriptor=tex_desc)
+
+
+# ================================== main() ==================================
+
+
+def main():
+    # --- Step 1: Set up CUDA (compile kernel, create stream) ---
+    dev, stream, kernel, config = setup_cuda()
+
+    # --- Step 2: Open a window ---
+    window, gl, pyglet = create_window()
+
+    # --- Step 3: Create GL resources (shader, fullscreen quad, screen tex) ---
+    shader_prog, quad_vao, screen_tex = create_display_resources(gl, WIDTH, HEIGHT)
+
+    # --- Step 4: Create the PBO that CUDA will write into ---
+    pbo_id = create_pixel_buffer(gl, WIDTH, HEIGHT)
+    resource = GraphicsResource.from_gl_buffer(pbo_id, flags="write_discard")
+
+    # --- Step 5: Allocate the source `Array` and upload the test pattern ---
+    arr = Array.from_descriptor(
+        shape=(IMAGE_SIZE, IMAGE_SIZE),
+        format=ArrayFormat.UINT8,
+        num_channels=4,
+    )
+    host_image = make_test_image(IMAGE_SIZE)
+    arr.copy_from(np.ascontiguousarray(host_image), stream=stream)
+    stream.sync()
+
+    # --- Step 6: Bind the Array as a TextureObject (initially POINT) ---
+    state = {"filter": FilterMode.POINT, "rotate": False, "angle": 0.0}
+    tex = make_texture(arr, state["filter"])
+
+    @window.event
+    def on_key_press(symbol, _modifiers):
+        key = pyglet.window.key
+        nonlocal tex
+        if symbol == key.ESCAPE:
+            window.close()
+        elif symbol == key.F:
+            # Filter mode is baked at TextureObject creation time. Swapping
+            # it means closing the old one and building a new one.
+            state["filter"] = (
+                FilterMode.LINEAR if state["filter"] == FilterMode.POINT
+                else FilterMode.POINT
+            )
+            tex.close()
+            tex = make_texture(arr, state["filter"])
+        elif symbol == key.R:
+            state["rotate"] = not state["rotate"]
+
+    # --- Step 7: Render loop ---
+    start = time.monotonic()
+    last_t = start
+    frame_count = 0
+    fps_time = start
+
+    @window.event
+    def on_draw():
+        nonlocal frame_count, fps_time, last_t
+        now = time.monotonic()
+        if state["rotate"]:
+            state["angle"] += (now - last_t) * 0.5  # rad/sec
+        last_t = now
+
+        window.clear()
+        with resource.map(stream=stream) as buf:
+            launch(
+                stream,
+                config,
+                kernel,
+                np.uint64(tex.handle),
+                buf.handle,
+                np.int32(WIDTH),
+                np.int32(HEIGHT),
+                np.float32(state["angle"]),
+            )
+        copy_pbo_to_texture(gl, pbo_id, screen_tex, WIDTH, HEIGHT)
+        draw_fullscreen_quad(gl, shader_prog, quad_vao, screen_tex)
+
+        frame_count += 1
+        if now - fps_time >= 1.0:
+            fps = frame_count / (now - fps_time)
+            window.set_caption(
+                f"cuda.core Array + TextureObject - Image Show "
+                f"(filter={state['filter'].name}, "
+                f"rotate={'on' if state['rotate'] else 'off'}, "
+                f"{fps:.0f} FPS)"
+            )
+            frame_count = 0
+            fps_time = now
+
+    @window.event
+    def on_close():
+        tex.close()
+        arr.close()
+        resource.close()
+        stream.close()
+
+    pyglet.app.run(interval=0)
+
+
+# ============================== GPU code (kernel) ============================
+
+KERNEL_SOURCE = r"""
+extern "C"
+__global__
+void sample_image(cudaTextureObject_t tex,
+                  unsigned char* output,
+                  int width, int height,
+                  float angle) {
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    if (x >= width || y >= height) return;
+
+    // Center the screen pixel around (0, 0) in [-aspect, aspect] x [-1, 1].
+    float aspect = (float)width / (float)height;
+    float sx = ((float)x / (float)width  - 0.5f) * 2.0f * aspect;
+    float sy = ((float)y / (float)height - 0.5f) * 2.0f;
+
+    // Inverse-rotate the screen point: rotating the image by +angle means
+    // each output pixel reads from the source rotated by -angle.
+    float c = cosf(-angle), s = sinf(-angle);
+    float rx = c * sx - s * sy;
+    float ry = s * sx + c * sy;
+
+    // Map rotated screen point to the [0, 1] x [0, 1] texture domain so the
+    // image (drawn centered, fitting ~75% of the window height) lands on it.
+    const float scale = 0.75f;
+    float u = (rx / (2.0f * scale)) + 0.5f;
+    float v = (ry / (2.0f * scale)) + 0.5f;
+
+    // AddressMode.CLAMP means out-of-range u/v sample the edge texel.
+    float4 col = tex2D<float4>(tex, u, v);
+
+    int idx = (y * width + x) * 4;
+    output[idx + 0] = (unsigned char)(col.x * 255.0f);
+    output[idx + 1] = (unsigned char)(col.y * 255.0f);
+    output[idx + 2] = (unsigned char)(col.z * 255.0f);
+    output[idx + 3] = 255;
+}
+"""
+
+VERTEX_SHADER_SOURCE = """#version 330 core
+in vec2 position;
+in vec2 texcoord;
+out vec2 v_texcoord;
+void main() {
+    gl_Position = vec4(position, 0.0, 1.0);
+    v_texcoord = texcoord;
+}
+"""
+
+FRAGMENT_SHADER_SOURCE = """#version 330 core
+in vec2 v_texcoord;
+out vec4 fragColor;
+uniform sampler2D tex;
+void main() {
+    fragColor = texture(tex, v_texcoord);
+}
+"""
+
+
+if __name__ == "__main__":
+    main()
diff --git a/cuda_core/examples/gl_interop_lenia.py b/cuda_core/examples/gl_interop_lenia.py
new file mode 100644
index 00000000000..c1772514a70
--- /dev/null
+++ b/cuda_core/examples/gl_interop_lenia.py
@@ -0,0 +1,805 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# ################################################################################
+#
+# This example demonstrates cuda.core.Array, TextureObject, and SurfaceObject
+# in combination with GraphicsResource for CUDA/OpenGL interop. A Lenia
+# continuous cellular automaton is ping-ponged between two CUDA arrays each
+# frame: a TextureObject provides smooth (LINEAR + WRAP) sampled reads through
+# a large bell-shaped neighborhood kernel, and a SurfaceObject provides typed
+# writes. The final state is colorized straight into an OpenGL PBO. Requires
+# pyglet.
+#
+# ################################################################################
+
+# What this example teaches
+# =========================
+# - How to drive a wide-radius convolution from a TextureObject configured for
+#   LINEAR + WRAP + normalized coordinates. The same Array is then bound as a
+#   SurfaceObject for the typed write back, requiring `surface_load_store=True`
+#   at allocation time.
+# - How a single-channel `float` Array differs from the multi-channel layout
+#   used in the Gray-Scott example: `num_channels=1`, `tex2D<float>` reads, and
+#   a 4-byte x-stride in `surf2Dwrite`.
+# - How to host-precompute a normalization constant for a stencil with a
+#   variable-shape support (the bell-curve neighborhood), then pass it as a
+#   plain float kernel argument.
+#
+# How it works
+# ============
+# Lenia (Bert Wang-Chak Chan, 2018) generalizes Conway's Game of Life to
+# continuous space, time, and state. Each cell holds a real value in [0, 1].
+# Per step, every cell:
+#
+#   1. Integrates a smooth bell-shaped neighborhood kernel K against the
+#      current state to produce a "potential" U:
+#
+#          U(x) = sum over offsets (dx, dy) inside a disk of radius R of
+#                  K(|(dx, dy)|) * state(x + (dx, dy))
+#                 divided by  sum of K  (host-precomputed).
+#
+#      K(r) = exp(-((r / R) - mu_K)^2 / (2 * sigma_K^2)) for r <= R.
+#
+#   2. Applies the growth function G and updates the state:
+#
+#          state_new = clamp(state_old + dt * (2 * exp(-(U - mu)^2 /
+#                            (2 * sigma^2)) - 1),  0,  1).
+#
+# Two single-channel `float` arrays are ping-ponged each frame: a
+# TextureObject reads one (sampled with LINEAR + WRAP so the disk wraps
+# toroidally) and a SurfaceObject writes the other.
+#
+#   PING-PONG (two arrays, swap each step)
+#   ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#   +--------------+   tex2D<float>    +------------------+
+#   |   arr_a      | ----------------> |                  |
+#   |    state     |                   |  convolve_lenia  |
+#   +--------------+                   |     kernel       |
+#                                      |  (+ growth fn)   |
+#   +--------------+   surf2Dwrite     |                  |
+#   |   arr_b      | <---------------- |                  |
+#   |    state     |                   +------------------+
+#   +--------------+
+#       (swap)
+#
+# After the step we run a separate `colorize_lenia` kernel that samples the
+# new state and writes RGBA bytes straight into the OpenGL PBO via
+# GraphicsResource. No data ever travels across the PCIe bus during the frame.
+#
+# Why LINEAR + WRAP + normalized coords?
+# --------------------------------------
+# Lenia's neighborhood radius (R = 13) is wide enough that boundary handling
+# really matters. AddressMode.WRAP gives a toroidal world for free, and it is
+# only supported in normalized coordinate mode (see the CUDA Programming
+# Guide). LINEAR filtering is essentially free on the hardware, and a tap that
+# lands on a texel center (as these do) returns that texel's value unblended.
+# Sample coordinates are `(x + dx + 0.5) / W`; values < 0 or > 1 are fine,
+# WRAP handles them.
+#
+# Channel byte width in surf2Dwrite
+# ---------------------------------
+# `surf2Dwrite` takes the x coordinate in BYTES, not in elements. For a
+# single-channel `float` surface that means `x * sizeof(float)` = `x * 4`.
+# (The Gray-Scott example uses 8 because it stores `float2`.)
+#
+# One step per frame
+# ------------------
+# Each step convolves a (2R+1)^2 = 729-tap neighborhood for every pixel, which
+# is much heavier than a Gray-Scott 5-point Laplacian. With dt = 0.1 the
+# dynamics are slow enough that one step per displayed frame is plenty. There
+# is no `N_STEPS` loop.
+#
+# What you should see
+# ===================
+# A window showing soft, glider-like blobs drifting across the field on a
+# teal-on-black palette. Press R to reseed with a new Gaussian blob, 1 to
+# clear the field, and Escape to exit. The window title shows the current
+# FPS.
+#
+
+# /// script
+# dependencies = ["cuda_bindings", "cuda_core>0.6.0", "pyglet"]
+# ///
+
+import ctypes
+import math
+import sys
+import time
+
+import numpy as np
+
+from cuda.core import (
+    AddressMode,
+    Array,
+    ArrayFormat,
+    Device,
+    FilterMode,
+    GraphicsResource,
+    LaunchConfig,
+    Program,
+    ProgramOptions,
+    ReadMode,
+    ResourceDescriptor,
+    SurfaceObject,
+    TextureDescriptor,
+    TextureObject,
+    launch,
+)
+
+# ---------------------------------------------------------------------------
+# Simulation parameters (feel free to change these)
+# ---------------------------------------------------------------------------
+WIDTH = 256
+HEIGHT = 256
+
+# Neighborhood / kernel shape
+R = 13  # convolution radius in pixels (texture-space)
+MU_K = 0.5  # bell center for the neighborhood weight K(r/R)
+SIGMA_K = 0.15  # bell width for K
+
+# Growth function shape
+MU = 0.15  # bell center for the growth function G(U)
+SIGMA = 0.015  # bell width for G
+
+DT = 0.1  # time step
+
+# Initial blob radius and peak for the Gaussian seed.
+# The radius must be large relative to the neighborhood radius R=13 so the
+# kernel-integrated potential U lands near the growth bell's center mu=0.15.
+# With SEED_RADIUS=36, U at the blob's centre starts near mu and the field
+# survives the first step; smaller seeds collapse to zero within one frame
+# because U is far outside the narrow (sigma=0.015) growth bell.
+SEED_RADIUS = 36.0
+SEED_PEAK = 0.5
+
+# Seed modes (kept in sync with the seed_blob kernel)
+SEED_MODE_CLEAR = 0
+SEED_MODE_BLOB = 1
+
+
+# ============================= Helper functions =============================
+#
+# The functions below set up CUDA and OpenGL. If you're here to learn about
+# Array/TextureObject/SurfaceObject, skip ahead to main() -- the interesting
+# part is there. These helpers exist so that main() reads like a short story
+# instead of a wall of boilerplate.
+# ============================================================================
+
+
+def compute_kernel_norm(radius, mu_k, sigma_k):
+    """Precompute 1 / (sum of K(r)) for the bell-shaped neighborhood weight.
+
+    Mirrors exactly what the device kernel does so the convolution is energy-
+    preserving: walks the (2R+1)x(2R+1) box, accumulates
+    `exp(-(r/R - mu_k)^2 / (2*sigma_k^2))` for `r <= R`, and returns the
+    reciprocal sum as a float32.
+    """
+    inv_two_sigma2 = 1.0 / (2.0 * sigma_k * sigma_k)
+    inv_r = 1.0 / float(radius)
+    total = 0.0
+    for dy in range(-radius, radius + 1):
+        for dx in range(-radius, radius + 1):
+            r = math.sqrt(dx * dx + dy * dy)
+            if r > radius:
+                continue
+            rn = r * inv_r - mu_k
+            total += math.exp(-(rn * rn) * inv_two_sigma2)
+    if total <= 0.0:
+        raise RuntimeError("kernel normalization sum collapsed to zero")
+    return np.float32(1.0 / total)
+
+
+def setup_cuda():
+    """Compile the CUDA kernels and return (device, stream, kernels, configs).
+
+    Returns a dict of kernels keyed by name and matching LaunchConfigs.
+    """
+    dev = Device(0)
+    dev.set_current()
+
+    # SurfaceObject requires surface load/store, which has existed since SM 2.0,
+    # but bindless surface objects (cuSurfObjectCreate) require SM 3.0+.
+    cc = dev.compute_capability
+    if cc.major < 3:
+        print(
+            "This example requires a GPU with compute capability >= 3.0 for "
+            f"bindless surface objects. Found sm_{cc.major}{cc.minor}.",
+            file=sys.stderr,
+        )
+        sys.exit(1)
+
+    stream = dev.create_stream()
+
+    # Compile as C++ so the templated tex2D<float> overload resolves.
+    program_options = ProgramOptions(std="c++17", arch=f"sm_{dev.arch}")
+    prog = Program(KERNEL_SOURCE, code_type="c++", options=program_options)
+    mod = prog.compile(
+        "cubin",
+        name_expressions=("convolve_lenia", "colorize_lenia", "seed_blob"),
+    )
+
+    kernels = {
+        "step": mod.get_kernel("convolve_lenia"),
+        "colorize": mod.get_kernel("colorize_lenia"),
+        "seed": mod.get_kernel("seed_blob"),
+    }
+
+    block = (16, 16, 1)
+    grid = (
+        (WIDTH + block[0] - 1) // block[0],
+        (HEIGHT + block[1] - 1) // block[1],
+        1,
+    )
+    config = LaunchConfig(grid=grid, block=block)
+    # All three kernels are pixel-parallel over a WIDTH x HEIGHT grid, so they
+    # can share a launch config.
+    configs = {"step": config, "colorize": config, "seed": config}
+
+    return dev, stream, kernels, configs
+
+
+def create_window():
+    """Open a pyglet window and return (window, gl_module, pyglet)."""
+    try:
+        import pyglet
+        from pyglet.gl import gl as _gl
+    except ImportError:
+        print(
+            "This example requires pyglet >= 2.0.\nInstall it with:  pip install pyglet",
+            file=sys.stderr,
+        )
+        sys.exit(1)
+
+    window = pyglet.window.Window(
+        WIDTH,
+        HEIGHT,
+        caption="cuda.core Array/Texture/Surface - Lenia",
+        vsync=False,
+    )
+    return window, _gl, pyglet
+
+
+def create_display_resources(gl, width, height):
+    """Create the GL objects needed to show a texture on screen.
+
+    This sets up a shader program, a fullscreen quad, and an empty texture.
+    None of this is CUDA-specific -- it's standard OpenGL boilerplate for
+    rendering a textured quad.
+
+    Returns (shader_program, vertex_array_id, texture_id). The shader_program
+    is a pyglet ShaderProgram object (must be kept alive).
+    """
+    from pyglet.graphics.shader import Shader, ShaderProgram
+
+    # Shader program -- just passes texture coordinates through
+    vert = Shader(VERTEX_SHADER_SOURCE, "vertex")
+    frag = Shader(FRAGMENT_SHADER_SOURCE, "fragment")
+    shader_prog = ShaderProgram(vert, frag)
+
+    # Fullscreen quad (two triangles covering the entire window)
+    quad_verts = np.array(
+        [
+            # x,  y,    s, t      (position + texture coordinate)
+            -1,
+            -1,
+            0,
+            0,
+            1,
+            -1,
+            1,
+            0,
+            1,
+            1,
+            1,
+            1,
+            -1,
+            -1,
+            0,
+            0,
+            1,
+            1,
+            1,
+            1,
+            -1,
+            1,
+            0,
+            1,
+        ],
+        dtype=np.float32,
+    )
+
+    vao = ctypes.c_uint(0)
+    gl.glGenVertexArrays(1, ctypes.byref(vao))
+    gl.glBindVertexArray(vao.value)
+
+    vbo = ctypes.c_uint(0)
+    gl.glGenBuffers(1, ctypes.byref(vbo))
+    gl.glBindBuffer(gl.GL_ARRAY_BUFFER, vbo.value)
+    gl.glBufferData(
+        gl.GL_ARRAY_BUFFER,
+        quad_verts.nbytes,
+        quad_verts.ctypes.data_as(ctypes.c_void_p),
+        gl.GL_STATIC_DRAW,
+    )
+
+    stride = 4 * 4  # 4 floats * 4 bytes each = 16 bytes per vertex
+    pos_loc = gl.glGetAttribLocation(shader_prog.id, b"position")
+    gl.glEnableVertexAttribArray(pos_loc)
+    gl.glVertexAttribPointer(pos_loc, 2, gl.GL_FLOAT, gl.GL_FALSE, stride, ctypes.c_void_p(0))
+
+    tc_loc = gl.glGetAttribLocation(shader_prog.id, b"texcoord")
+    gl.glEnableVertexAttribArray(tc_loc)
+    gl.glVertexAttribPointer(tc_loc, 2, gl.GL_FLOAT, gl.GL_FALSE, stride, ctypes.c_void_p(8))
+
+    gl.glBindVertexArray(0)
+
+    # Empty texture (will be filled each frame from the PBO)
+    tex = ctypes.c_uint(0)
+    gl.glGenTextures(1, ctypes.byref(tex))
+    gl.glBindTexture(gl.GL_TEXTURE_2D, tex.value)
+    gl.glTexParameteri(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_MIN_FILTER, gl.GL_LINEAR)
+    gl.glTexParameteri(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_MAG_FILTER, gl.GL_LINEAR)
+    gl.glTexImage2D(
+        gl.GL_TEXTURE_2D,
+        0,
+        gl.GL_RGBA8,
+        width,
+        height,
+        0,
+        gl.GL_RGBA,
+        gl.GL_UNSIGNED_BYTE,
+        None,
+    )
+
+    return shader_prog, vao.value, tex.value
+
+
+def create_pixel_buffer(gl, width, height):
+    """Allocate an uninitialized RGBA8 Pixel Buffer Object of width x height.
+
+    The PBO is the bridge between CUDA and OpenGL: OpenGL reads from it when
+    uploading pixels to a texture, and once registered with CUDA a kernel
+    can write directly into it. The unpack binding is reset to 0 on return.
+    Returns (pbo_gl_name, size_in_bytes).
+    """
+    unpack_target = gl.GL_PIXEL_UNPACK_BUFFER
+    buffer_name = ctypes.c_uint(0)
+    gl.glGenBuffers(1, ctypes.byref(buffer_name))
+    gl.glBindBuffer(unpack_target, buffer_name.value)
+    size_in_bytes = width * height * 4  # RGBA, 1 byte per channel
+    gl.glBufferData(unpack_target, size_in_bytes, None, gl.GL_DYNAMIC_DRAW)
+    gl.glBindBuffer(unpack_target, 0)
+    return buffer_name.value, size_in_bytes
+
+
+def copy_pbo_to_texture(gl, pbo_id, tex_id, width, height):
+    """Upload the PBO's RGBA8 pixels into level 0 of the GL texture (GPU-to-GPU).
+
+    The final `None` passed to glTexSubImage2D means "read from the currently
+    bound PBO, not from CPU memory". The unpack binding is reset to 0
+    afterwards; the texture is left bound to GL_TEXTURE_2D.
+    """
+    unpack_target = gl.GL_PIXEL_UNPACK_BUFFER
+    tex_target = gl.GL_TEXTURE_2D
+    level, x_offset, y_offset = 0, 0, 0
+    pixel_layout = (gl.GL_RGBA, gl.GL_UNSIGNED_BYTE)
+
+    gl.glBindBuffer(unpack_target, pbo_id)
+    gl.glBindTexture(tex_target, tex_id)
+    gl.glTexSubImage2D(tex_target, level, x_offset, y_offset, width, height, *pixel_layout, None)
+    gl.glBindBuffer(unpack_target, 0)
+
+
+def draw_fullscreen_quad(gl, shader_prog, vao_id, tex_id):
+    """Draw texture `tex_id` with `shader_prog` on the fullscreen quad in `vao_id`."""
+    gl.glUseProgram(shader_prog.id)
+    gl.glBindTexture(gl.GL_TEXTURE_2D, tex_id)  # left bound after the call
+    gl.glBindVertexArray(vao_id)
+    gl.glDrawArrays(gl.GL_TRIANGLES, 0, 6)  # 6 vertices = 2 triangles
+    gl.glBindVertexArray(0)  # leave no VAO bound
+    gl.glUseProgram(0)  # leave no program active
+
+
+def make_state_arrays():
+    """Create the pair of ping-pong state Arrays used by the simulation.
+
+    Both are WIDTH x HEIGHT with one FLOAT32 channel and are created with
+    `surface_load_store=True` -- the flag that lets the same Array be bound
+    as a TextureObject (sampled reads) and a SurfaceObject (typed writes).
+
+    Returns (arr_a, arr_b).
+    """
+    descriptor = {
+        "shape": (WIDTH, HEIGHT),
+        "format": ArrayFormat.FLOAT32,
+        "num_channels": 1,
+        "surface_load_store": True,
+    }
+    # Two independent allocations built from the same descriptor.
+    first = Array.from_descriptor(**descriptor)
+    second = Array.from_descriptor(**descriptor)
+    return first, second
+
+
+def make_texture(arr):
+    """Create a TextureObject over `arr`: LINEAR filter, WRAP addressing, normalized coords."""
+    source = ResourceDescriptor.from_array(arr)
+    # WRAP/MIRROR addressing modes require normalized coordinates.
+    sampling = TextureDescriptor(
+        normalized_coords=True,
+        address_mode=AddressMode.WRAP,
+        filter_mode=FilterMode.LINEAR,
+        read_mode=ReadMode.ELEMENT_TYPE,
+    )
+    return TextureObject.from_descriptor(resource=source, texture_descriptor=sampling)
+
+
+def seed_state(stream, kernels, configs, write_surf, mode, seed_value):
+    """Launch the `seed` kernel to re-initialize the array behind `write_surf`.
+
+    `mode = SEED_MODE_CLEAR` zeroes the field; `mode = SEED_MODE_BLOB` places a
+    Gaussian blob with peak ~SEED_PEAK at the center, jittered by `seed_value`
+    so successive reseeds give different patterns.
+
+    `write_surf` must be a long-lived SurfaceObject owned by the caller, not a
+    fresh one: `launch` is async, so a SurfaceObject created in a `with` block
+    that exits right after `launch` returns would have its handle destroyed
+    before the kernel actually runs against it.
+    """
+    # Same order as seed_blob's parameters in KERNEL_SOURCE, which is
+    # presumably the kernel stored under kernels["seed"].
+    kernel_args = (
+        np.uint64(write_surf.handle),
+        np.int32(WIDTH),
+        np.int32(HEIGHT),
+        np.int32(mode),
+        np.uint32(seed_value),
+        np.float32(SEED_RADIUS),
+        np.float32(SEED_PEAK),
+    )
+    launch(stream, configs["seed"], kernels["seed"], *kernel_args)
+
+
+# ================================== main() ==================================
+
+
+def main():
+    """Run the interactive Lenia viewer until the window is closed."""
+    # --- Step 1: Set up CUDA (compile kernels, create stream) ---
+    dev, stream, kernels, configs = setup_cuda()
+
+    # --- Step 2: Open a window ---
+    window, gl, pyglet = create_window()
+
+    # --- Step 3: Create GL resources for drawing a texture to screen ---
+    #     (Standard OpenGL boilerplate -- not CUDA-specific.)
+    shader_prog, quad_vao, tex_id = create_display_resources(gl, WIDTH, HEIGHT)
+
+    # --- Step 4: Create the Pixel Buffer Object (PBO) ---
+    #     The PBO is GPU memory owned by OpenGL. It's the bridge between the
+    #     two worlds: CUDA writes into it, OpenGL reads from it.
+    pbo_id, _ = create_pixel_buffer(gl, WIDTH, HEIGHT)
+
+    # --- Step 5: Register the PBO with CUDA ---
+    resource = GraphicsResource.from_gl_buffer(pbo_id, flags="write_discard")
+
+    # --- Step 6: Allocate the two ping-pong state Arrays ---
+    #     Both are single-channel `float` with `surface_load_store=True` so
+    #     they can be bound as SurfaceObjects.
+    arr_a, arr_b = make_state_arrays()
+
+    # --- Step 7: Pre-create the four bindless handles ---
+    #     Creating these once is much cheaper than rebuilding them every
+    #     step. The simulation loop just picks which read/write pair to use.
+    tex_a = make_texture(arr_a)
+    tex_b = make_texture(arr_b)
+    surf_a = SurfaceObject.from_array(arr_a)
+    surf_b = SurfaceObject.from_array(arr_b)
+
+    # --- Step 8: Precompute the bell-curve normalization constant ---
+    #     The neighborhood weight K(r) is unnormalized in the kernel; we
+    #     divide by sum(K) so the convolution is a weighted mean rather than
+    #     an unbounded integral. Doing this on the host once at startup is
+    #     much cheaper than redoing it on the device every step.
+    inv_weight_sum = compute_kernel_norm(R, MU_K, SIGMA_K)
+
+    # --- Step 9: Seed an initial Gaussian blob into arr_a (writes via surf_a) ---
+    seed_state(stream, kernels, configs, surf_a, SEED_MODE_BLOB, seed_value=0)
+    state = {"current": "a", "seed": 0}  # after seeding, arr_a is the "current" state
+
+    # --- Step 10: Render loop ---
+    start_time = time.monotonic()
+    frame_count = 0
+    fps_time = start_time
+
+    def current_read_write():
+        if state["current"] == "a":
+            return tex_a, surf_b, "b"  # read a, write b, next current = b
+        return tex_b, surf_a, "a"
+
+    @window.event
+    def on_key_press(symbol, _modifiers):
+        key = pyglet.window.key
+        if symbol == key.ESCAPE:
+            window.dispatch_event("on_close")  # release CUDA handles first, then the window closes
+            return True  # handled: keep pyglet's default ESC handler from dispatching on_close again
+        if symbol == key.R:
+            # Reseed with a new Gaussian blob; bump the seed so the jitter
+            # pattern changes each time.
+            state["seed"] += 1
+            seed_state(stream, kernels, configs, surf_a, SEED_MODE_BLOB, state["seed"])
+            state["current"] = "a"
+            return
+        if symbol == key._1:
+            # Clear the field. Useful to confirm the simulation is quiet when
+            # the state is zero.
+            seed_state(stream, kernels, configs, surf_a, SEED_MODE_CLEAR, 0)
+            state["current"] = "a"
+            return
+
+    @window.event
+    def on_draw():
+        nonlocal frame_count, fps_time
+
+        window.clear()
+
+        # (a) Run one Lenia step. The convolution kernel reads the current
+        #     state via a TextureObject (LINEAR + WRAP gives toroidal
+        #     wrapping at the border), evaluates the growth function, and
+        #     writes the new state via a SurfaceObject. One step per frame
+        #     is intentional: dt = 0.1 is small, and the (2R+1)^2 = 729-tap
+        #     stencil is heavy enough that going faster would not help.
+        tex_read, surf_write, next_current = current_read_write()
+        launch(
+            stream,
+            configs["step"],
+            kernels["step"],
+            np.uint64(tex_read.handle),
+            np.uint64(surf_write.handle),
+            np.int32(WIDTH),
+            np.int32(HEIGHT),
+            np.int32(R),
+            np.float32(MU_K),
+            np.float32(SIGMA_K),
+            np.float32(MU),
+            np.float32(SIGMA),
+            np.float32(DT),
+            inv_weight_sum,
+        )
+        state["current"] = next_current
+
+        # (b) Colorize the latest state into the OpenGL PBO.
+        tex_read = tex_a if state["current"] == "a" else tex_b
+        with resource.map(stream=stream) as buf:
+            launch(
+                stream,
+                configs["colorize"],
+                kernels["colorize"],
+                np.uint64(tex_read.handle),
+                buf.handle,
+                np.int32(WIDTH),
+                np.int32(HEIGHT),
+            )
+        # Unmap happens automatically when the `with` block exits.
+
+        # (c) Tell OpenGL to copy the PBO contents into our texture.
+        copy_pbo_to_texture(gl, pbo_id, tex_id, WIDTH, HEIGHT)
+
+        # (d) Draw the texture to the screen.
+        draw_fullscreen_quad(gl, shader_prog, quad_vao, tex_id)
+
+        # FPS counter (shown in window title)
+        frame_count += 1
+        now = time.monotonic()
+        if now - fps_time >= 1.0:
+            fps = frame_count / (now - fps_time)
+            window.set_caption(
+                "cuda.core Array/Texture/Surface - Lenia"
+                f" ({WIDTH}x{HEIGHT}, R={R}, {fps:.0f} FPS)"
+            )
+            frame_count = 0
+            fps_time = now
+
+    @window.event
+    def on_close():
+        # pyglet owns the event loop, so release explicitly. `launch` is async:
+        # wait for in-flight kernels before destroying the handles they use.
+        stream.sync()
+        resource.close()
+        tex_a.close()
+        tex_b.close()
+        surf_a.close()
+        surf_b.close()
+        arr_a.close()
+        arr_b.close()
+        stream.close()
+
+    pyglet.app.run(interval=0)
+
+
+# ======================== GPU code (CUDA + GLSL) ============================
+#
+# These source strings are kept at the bottom of the file so they don't
+# distract from the Python logic above. The important things to know:
+#
+#   - KERNEL_SOURCE contains three CUDA C++ kernels:
+#       * seed_blob       -- sets the initial state via SurfaceObject writes.
+#                            Either clears the field (mode = 0) or paints a
+#                            Gaussian blob centered in the field (mode = 1).
+#       * convolve_lenia  -- reads previous state via TextureObject (with
+#                            LINEAR + WRAP bilinear filtering), integrates a
+#                            bell-shaped neighborhood K(r/R) to produce the
+#                            potential U, applies the growth function G(U),
+#                            and writes the next state via SurfaceObject.
+#       * colorize_lenia  -- reads the new state via TextureObject and writes
+#                            RGBA bytes into the OpenGL PBO using a simple
+#                            teal-on-black gradient.
+#
+#   - VERTEX_SHADER_SOURCE / FRAGMENT_SHADER_SOURCE are GLSL. They draw a
+#     texture onto a rectangle covering the entire window. Nothing interesting.
+#
+# ============================================================================
+
+KERNEL_SOURCE = r"""
+// All kernels run one thread per output pixel and bounds-check at the top.
+// `surf2Dwrite` takes the x offset in BYTES; for a single-channel float
+// surface that means `x * sizeof(float)` = `x * 4`.
+
+extern "C"
+__global__
+void seed_blob(cudaSurfaceObject_t surf,
+               int width, int height,
+               int mode,
+               unsigned int seed,
+               float radius,
+               float peak) {
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    if (x >= width || y >= height) return;
+
+    float value = 0.0f;
+    if (mode == 1) {
+        // Gaussian blob centered in the field with a small deterministic
+        // jitter that breaks symmetry differently on each reseed.
+        float cx = (float)(width  / 2);
+        float cy = (float)(height / 2);
+        float dx = (float)x - cx;
+        float dy = (float)y - cy;
+        float r2 = dx * dx + dy * dy;
+        float inv = 1.0f / (radius * radius);
+        value = peak * expf(-r2 * inv);
+
+        unsigned int h = (unsigned int)x * 374761393u +
+                         (unsigned int)y * 668265263u + seed * 2246822519u;
+        h = (h ^ (h >> 13)) * 1274126177u;
+        h = h ^ (h >> 16);
+        float noise = (h & 0xffffu) / 65535.0f;  // in [0, 1]
+        value += 0.02f * (noise - 0.5f);
+        if (value < 0.0f) value = 0.0f;
+        if (value > 1.0f) value = 1.0f;
+    }
+
+    // float is 4 bytes; surf2Dwrite takes the x offset in BYTES.
+    surf2Dwrite(value, surf, x * (int)sizeof(float), y);
+}
+
+extern "C"
+__global__
+void convolve_lenia(cudaTextureObject_t tex,
+                    cudaSurfaceObject_t surf,
+                    int width, int height,
+                    int R,
+                    float mu_k, float sigma_k,
+                    float mu, float sigma,
+                    float dt,
+                    float inv_weight_sum) {
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    if (x >= width || y >= height) return;
+
+    // Normalized texture coordinates: WRAP addressing requires them. The
+    // (x + dx + 0.5) / W idiom places the sample at the texel center; values
+    // outside [0, 1] are fine because WRAP wraps them toroidally.
+    float inv_w = 1.0f / (float)width;
+    float inv_h = 1.0f / (float)height;
+    float inv_R = 1.0f / (float)R;
+    float inv_two_sigma_k2 = 1.0f / (2.0f * sigma_k * sigma_k);
+    float inv_two_sigma2   = 1.0f / (2.0f * sigma     * sigma);
+
+    // Integrate the bell-shaped weight K(r/R) against the current state.
+    float U = 0.0f;
+    for (int dy = -R; dy <= R; ++dy) {
+        for (int dx = -R; dx <= R; ++dx) {
+            float fdx = (float)dx;
+            float fdy = (float)dy;
+            float r2 = fdx * fdx + fdy * fdy;
+            float r  = sqrtf(r2);
+            if (r > (float)R) continue;   // restrict to the disk
+            float rn = r * inv_R - mu_k;
+            float w  = expf(-(rn * rn) * inv_two_sigma_k2);
+
+            float sx = ((float)x + fdx + 0.5f) * inv_w;
+            float sy = ((float)y + fdy + 0.5f) * inv_h;
+            float s  = tex2D<float>(tex, sx, sy);
+            U += w * s;
+        }
+    }
+    U *= inv_weight_sum;   // host-precomputed 1 / sum(K)
+
+    // Read the current cell value (point sample at the texel center).
+    float sx0 = ((float)x + 0.5f) * inv_w;
+    float sy0 = ((float)y + 0.5f) * inv_h;
+    float state = tex2D<float>(tex, sx0, sy0);
+
+    // Growth function G(U) = 2 * exp(-(U - mu)^2 / (2 * sigma^2)) - 1,
+    // mapping U near mu to +1 (grow) and U far from mu to -1 (shrink).
+    float du = U - mu;
+    float G  = 2.0f * expf(-(du * du) * inv_two_sigma2) - 1.0f;
+
+    float new_state = state + dt * G;
+    if (new_state < 0.0f) new_state = 0.0f;
+    if (new_state > 1.0f) new_state = 1.0f;
+
+    surf2Dwrite(new_state, surf, x * (int)sizeof(float), y);
+}
+
+extern "C"
+__global__
+void colorize_lenia(cudaTextureObject_t tex,
+                    unsigned char* output,
+                    int width, int height) {
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    if (x >= width || y >= height) return;
+
+    float inv_w = 1.0f / (float)width;
+    float inv_h = 1.0f / (float)height;
+    float cx = ((float)x + 0.5f) * inv_w;
+    float cy = ((float)y + 0.5f) * inv_h;
+
+    float v = tex2D<float>(tex, cx, cy);
+    if (v < 0.0f) v = 0.0f;
+    if (v > 1.0f) v = 1.0f;
+
+    // Linear interpolation from a deep teal at v = 0 to a bright teal at
+    // v = 1. Two stops -- simple, easy to read, no LUT required.
+    //   (0, 15, 30, 255)  ->  (50, 200, 180, 255)
+    float r = (  0.0f + v * ( 50.0f -   0.0f));
+    float g = ( 15.0f + v * (200.0f -  15.0f));
+    float b = ( 30.0f + v * (180.0f -  30.0f));
+
+    int idx = (y * width + x) * 4;
+    output[idx + 0] = (unsigned char)r;
+    output[idx + 1] = (unsigned char)g;
+    output[idx + 2] = (unsigned char)b;
+    output[idx + 3] = 255;
+}
+"""
+
+# GLSL 3.30 pass-through shaders: the vertex stage forwards `texcoord` and the
+# fragment stage samples `tex` at it. Nothing CUDA-specific here.
+
+VERTEX_SHADER_SOURCE = """#version 330 core
+in vec2 position;
+in vec2 texcoord;
+out vec2 v_texcoord;
+void main() {
+    gl_Position = vec4(position, 0.0, 1.0);
+    v_texcoord = texcoord;
+}
+"""
+
+FRAGMENT_SHADER_SOURCE = """#version 330 core
+in vec2 v_texcoord;
+out vec4 fragColor;
+uniform sampler2D tex;
+void main() {
+    fragColor = texture(tex, v_texcoord);
+}
+"""
+
+
+if __name__ == "__main__":
+    main()
diff --git a/cuda_core/examples/gl_interop_mandelbrot.py b/cuda_core/examples/gl_interop_mandelbrot.py
new file mode 100644
index 00000000000..11abca54c22
--- /dev/null
+++ b/cuda_core/examples/gl_interop_mandelbrot.py
@@ -0,0 +1,692 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# ################################################################################
+#
+# This example demonstrates cuda.core.Array and TextureObject used as a *color
+# lookup table* (palette LUT) for a real-time Mandelbrot deep-zoom explorer.
+# A CUDA kernel computes smooth iteration counts and uses tex1D<float4> with
+# LINEAR + CLAMP + NORMALIZED_FLOAT sampling to read a 256-entry RGBA palette,
+# writing the final RGBA bytes straight into an OpenGL PBO via GraphicsResource.
+# Requires pyglet.
+#
+# ################################################################################
+
+# What this example teaches
+# =========================
+# - How to use a 1D cuda.core.Array as a palette and bind it via a
+#   TextureObject for hardware-filtered color lookups inside a kernel.
+# - How LINEAR + AddressMode.CLAMP + ReadMode.NORMALIZED_FLOAT + normalized
+#   coordinates give you a free `texture(palette, t)` style sampler that
+#   returns a float4 in [0, 1] regardless of the underlying storage format.
+# - How to drive a real-time interactive viewer: mouse pan, scroll-wheel zoom
+#   anchored at the cursor, and key-driven iteration cap.
+#
+# How it works
+# ============
+# The Mandelbrot set is defined by iterating z -> z^2 + c starting from
+# z = 0; pixels are colored by how quickly z escapes the disk of radius 2.
+#
+#     +---------+   ResourceDescriptor.from_array
+#     |  Array  | --------------------------------+
+#     | float4  |                                 v
+#     | size 256|                       +-------------------+
+#     +---------+                       |   TextureObject   |
+#       ^  copy_from(host)              |  (palette LUT)    |
+#       |                               +---------+---------+
+#     host palette                                |
+#     (numpy float32x4, 256 stops)                |
+#                                                 v
+#                                  tex1D<float4>(palette, t)
+#                                                 |
+#                                                 v
+#                                     +-----------------------+
+#                                     |  mandelbrot kernel    |
+#                                     |  (one thread / pixel) |
+#                                     +-----------+-----------+
+#                                                 |
+#                                                 v   GraphicsResource.map
+#                                     +-----------------------+
+#                                     |   OpenGL PBO (RGBA8)  |
+#                                     +-----------------------+
+#
+# Smooth iteration count
+# ----------------------
+# A plain integer escape count produces ugly banded colors. With a bailout
+# radius R = 2 (escape when |z|^2 > 4), we use the standard smooth formula:
+#
+#     mu = iter + 1 - log(log(|z|)) / log(2)
+#
+# At the escape step |z| > 2, so log(|z|) > log(2) > 0 and log(log(|z|)) is
+# finite. We compute this in double and cast to float for the palette lookup.
+#
+# Cursor-anchored zoom
+# --------------------
+# On scroll, we want the world point under the mouse cursor to remain under
+# the cursor after the zoom. We capture (wx, wy) under the cursor with the
+# old scale, multiply the scale by 0.9 (zoom in) or 1.1 (zoom out), then
+# back-solve cx, cy so the same screen pixel still maps to (wx, wy):
+#
+#     cx_new = wx - (mouse_x - W/2) * scale_new
+#     cy_new = wy - (mouse_y - H/2) * scale_new
+#
+# Why double precision for cx, cy, scale?
+# ---------------------------------------
+# Float32 runs out of mantissa bits around 1e6x zoom; double gets you to
+# roughly 1e13x before the pixel grid coarsens visibly. The kernel takes
+# cx, cy, scale as doubles and only narrows to float for the color lookup.
+#
+# Address mode note
+# -----------------
+# We use AddressMode.CLAMP. The `fmodf(mu * 0.02f, 1.0f)` cycling formula
+# already keeps the palette coordinate in [0, 1), so CLAMP never has to clamp
+# an out-of-range value. With LINEAR filtering it differs from WRAP only within
+# half a texel of either end, where WRAP would blend the last and first entries.
+#
+# What you should see
+# ===================
+# A window showing the Mandelbrot set. Drag with the left mouse button to
+# pan, scroll the wheel to zoom in/out at the cursor, press R to reset the
+# view, and `[`/`]` to lower/raise the iteration cap. The window title shows
+# the current zoom level, center, max_iter, and FPS. Close the window or
+# press Escape to exit.
+#
+
+# /// script
+# dependencies = ["cuda_bindings", "cuda_core>0.6.0", "pyglet"]
+# ///
+
+import ctypes
+import sys
+import time
+
+import numpy as np
+
+from cuda.core import (
+    AddressMode,
+    Array,
+    ArrayFormat,
+    Device,
+    FilterMode,
+    GraphicsResource,
+    LaunchConfig,
+    Program,
+    ProgramOptions,
+    ReadMode,
+    ResourceDescriptor,
+    TextureDescriptor,
+    TextureObject,
+    launch,
+)
+
+# ---------------------------------------------------------------------------
+# Window and viewer parameters (feel free to change these)
+# ---------------------------------------------------------------------------
+WIDTH = 1024
+HEIGHT = 768
+PALETTE_SIZE = 256
+
+# Default view: classic Mandelbrot framing centered slightly left of origin.
+DEFAULT_CX = -0.5
+DEFAULT_CY = 0.0
+DEFAULT_SCALE = 4.0 / HEIGHT  # world-units per pixel (4-unit-tall view)
+DEFAULT_MAX_ITER = 512
+
+# Bounds for [/] iteration adjust.
+MIN_MAX_ITER = 64
+MAX_MAX_ITER = 8192
+ITER_STEP = 64
+
+
+# ============================= Helper functions =============================
+#
+# The functions below set up CUDA and OpenGL. If you're here to learn about
+# Array/TextureObject as a palette LUT, skip ahead to main() -- the interesting
+# part is there. These helpers exist so that main() reads like a short story
+# instead of a wall of boilerplate.
+# ============================================================================
+
+
+def setup_cuda():
+    """Select GPU 0, build the `mandelbrot` kernel and pick a launch shape.
+
+    Exits the process with status 1 if the device's compute capability major
+    version is below 3. Returns (device, stream, kernel, config).
+    """
+    device = Device(0)
+    device.set_current()
+
+    # Bindless texture objects (cuTexObjectCreate) require SM 3.0+.
+    capability = device.compute_capability
+    if capability.major < 3:
+        message = (
+            "This example requires a GPU with compute capability >= 3.0 for "
+            f"bindless texture objects. Found sm_{capability.major}{capability.minor}."
+        )
+        print(message, file=sys.stderr)
+        sys.exit(1)
+
+    stream = device.create_stream()
+
+    # Compile as C++ so the templated tex1D<float4> overload resolves.
+    options = ProgramOptions(std="c++17", arch=f"sm_{device.arch}")
+    module = Program(KERNEL_SOURCE, code_type="c++", options=options).compile(
+        "cubin", name_expressions=("mandelbrot",)
+    )
+    kernel = module.get_kernel("mandelbrot")
+
+    # 16x16 threads per block; ceil-divide so the grid covers every pixel.
+    threads_x, threads_y = 16, 16
+    blocks = (-(-WIDTH // threads_x), -(-HEIGHT // threads_y), 1)
+    config = LaunchConfig(grid=blocks, block=(threads_x, threads_y, 1))
+    return device, stream, kernel, config
+
+
+def create_window():
+    """Import pyglet lazily and open the WIDTH x HEIGHT viewer window.
+
+    Prints an install hint to stderr and exits with status 1 when pyglet
+    cannot be imported. Returns (window, gl_module, pyglet).
+    """
+    try:
+        import pyglet
+        from pyglet.gl import gl as gl_module
+    except ImportError:
+        hint = "This example requires pyglet >= 2.0.\nInstall it with:  pip install pyglet"
+        print(hint, file=sys.stderr)
+        sys.exit(1)
+
+    title = "cuda.core Array/Texture - Mandelbrot Deep Zoom"
+    # vsync is disabled -- presumably so the FPS shown in the window title is
+    # not capped at the display refresh rate.
+    window = pyglet.window.Window(WIDTH, HEIGHT, caption=title, vsync=False)
+    return window, gl_module, pyglet
+
+
+def create_display_resources(gl, width, height):
+    """Create the GL objects needed to show a texture on screen.
+
+    This sets up a shader program, a fullscreen quad, and an empty texture.
+    None of this is CUDA-specific -- it's standard OpenGL boilerplate for
+    rendering a textured quad.
+
+    Returns (shader_program, vertex_array_id, texture_id). The shader_program
+    is a pyglet ShaderProgram object (must be kept alive).
+    """
+    from pyglet.graphics.shader import Shader, ShaderProgram
+
+    # Shader program -- just passes texture coordinates through
+    vert = Shader(VERTEX_SHADER_SOURCE, "vertex")
+    frag = Shader(FRAGMENT_SHADER_SOURCE, "fragment")
+    shader_prog = ShaderProgram(vert, frag)
+
+    # Fullscreen quad (two triangles covering the entire window)
+    quad_verts = np.array(
+        [
+            # x,  y,    s, t      (position + texture coordinate)
+            -1,  # triangle 1, vertex 0: position (-1, -1), texcoord (0, 0)
+            -1,
+            0,
+            0,
+            1,  # triangle 1, vertex 1: position (1, -1), texcoord (1, 0)
+            -1,
+            1,
+            0,
+            1,  # triangle 1, vertex 2: position (1, 1), texcoord (1, 1)
+            1,
+            1,
+            1,
+            -1,  # triangle 2, vertex 0: position (-1, -1), texcoord (0, 0)
+            -1,
+            0,
+            0,
+            1,  # triangle 2, vertex 1: position (1, 1), texcoord (1, 1)
+            1,
+            1,
+            1,
+            -1,  # triangle 2, vertex 2: position (-1, 1), texcoord (0, 1)
+            1,
+            0,
+            1,
+        ],
+        dtype=np.float32,
+    )
+
+    vao = ctypes.c_uint(0)
+    gl.glGenVertexArrays(1, ctypes.byref(vao))
+    gl.glBindVertexArray(vao.value)
+
+    vbo = ctypes.c_uint(0)  # only the VAO and texture names are returned, not this one
+    gl.glGenBuffers(1, ctypes.byref(vbo))
+    gl.glBindBuffer(gl.GL_ARRAY_BUFFER, vbo.value)
+    gl.glBufferData(
+        gl.GL_ARRAY_BUFFER,
+        quad_verts.nbytes,
+        quad_verts.ctypes.data_as(ctypes.c_void_p),
+        gl.GL_STATIC_DRAW,
+    )
+
+    stride = 4 * 4  # 4 floats * 4 bytes each = 16 bytes per vertex
+    pos_loc = gl.glGetAttribLocation(shader_prog.id, b"position")  # x, y at byte offset 0
+    gl.glEnableVertexAttribArray(pos_loc)
+    gl.glVertexAttribPointer(pos_loc, 2, gl.GL_FLOAT, gl.GL_FALSE, stride, ctypes.c_void_p(0))
+
+    tc_loc = gl.glGetAttribLocation(shader_prog.id, b"texcoord")  # s, t at byte offset 8, after x, y
+    gl.glEnableVertexAttribArray(tc_loc)
+    gl.glVertexAttribPointer(tc_loc, 2, gl.GL_FLOAT, gl.GL_FALSE, stride, ctypes.c_void_p(8))
+
+    gl.glBindVertexArray(0)
+
+    # Empty texture (will be filled each frame from the PBO)
+    tex = ctypes.c_uint(0)
+    gl.glGenTextures(1, ctypes.byref(tex))
+    gl.glBindTexture(gl.GL_TEXTURE_2D, tex.value)
+    gl.glTexParameteri(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_MIN_FILTER, gl.GL_LINEAR)
+    gl.glTexParameteri(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_MAG_FILTER, gl.GL_LINEAR)
+    gl.glTexImage2D(
+        gl.GL_TEXTURE_2D,
+        0,  # mip level
+        gl.GL_RGBA8,
+        width,
+        height,
+        0,  # border
+        gl.GL_RGBA,
+        gl.GL_UNSIGNED_BYTE,
+        None,  # no initial pixel data
+    )
+
+    return shader_prog, vao.value, tex.value
+
+
+def create_pixel_buffer(gl, width, height):
+    """Allocate an uninitialized RGBA8 Pixel Buffer Object of width x height.
+
+    The PBO is the bridge between CUDA and OpenGL: OpenGL reads from it when
+    uploading pixels to a texture, and once registered with CUDA a kernel
+    can write directly into it. The unpack binding is reset to 0 on return.
+    Returns (pbo_gl_name, size_in_bytes).
+    """
+    unpack_target = gl.GL_PIXEL_UNPACK_BUFFER
+    buffer_name = ctypes.c_uint(0)
+    gl.glGenBuffers(1, ctypes.byref(buffer_name))
+    gl.glBindBuffer(unpack_target, buffer_name.value)
+    size_in_bytes = width * height * 4  # RGBA, 1 byte per channel
+    gl.glBufferData(unpack_target, size_in_bytes, None, gl.GL_DYNAMIC_DRAW)
+    gl.glBindBuffer(unpack_target, 0)
+    return buffer_name.value, size_in_bytes
+
+
+def copy_pbo_to_texture(gl, pbo_id, tex_id, width, height):
+    """Upload the PBO's RGBA8 pixels into level 0 of the GL texture (GPU-to-GPU).
+
+    The final `None` passed to glTexSubImage2D means "read from the currently
+    bound PBO, not from CPU memory". The unpack binding is reset to 0
+    afterwards; the texture is left bound to GL_TEXTURE_2D.
+    """
+    unpack_target = gl.GL_PIXEL_UNPACK_BUFFER
+    tex_target = gl.GL_TEXTURE_2D
+    level, x_offset, y_offset = 0, 0, 0
+    pixel_layout = (gl.GL_RGBA, gl.GL_UNSIGNED_BYTE)
+
+    gl.glBindBuffer(unpack_target, pbo_id)
+    gl.glBindTexture(tex_target, tex_id)
+    gl.glTexSubImage2D(tex_target, level, x_offset, y_offset, width, height, *pixel_layout, None)
+    gl.glBindBuffer(unpack_target, 0)
+
+
+def draw_fullscreen_quad(gl, shader_prog, vao_id, tex_id):
+    """Draw texture `tex_id` with `shader_prog` on the fullscreen quad in `vao_id`."""
+    gl.glUseProgram(shader_prog.id)
+    gl.glBindTexture(gl.GL_TEXTURE_2D, tex_id)  # left bound after the call
+    gl.glBindVertexArray(vao_id)
+    gl.glDrawArrays(gl.GL_TRIANGLES, 0, 6)  # 6 vertices = 2 triangles
+    gl.glBindVertexArray(0)  # leave no VAO bound
+    gl.glUseProgram(0)  # leave no program active
+
+
+def build_palette():
+    """Build the PALETTE_SIZE-entry RGBA palette by lerping between color stops.
+
+    Returns a flat, C-contiguous float32 array of shape (PALETTE_SIZE * 4,),
+    each channel in [0, 1], suitable for Array.copy_from().
+    """
+    # Rows are (position, R, G, B, A) with ascending positions. The last stop
+    # is the one used by points that hit max_iter and don't escape.
+    stops = np.array(
+        [
+            [0.00, 0.02, 0.05, 0.30, 1.0],  # deep blue
+            [0.16, 0.10, 0.50, 0.90, 1.0],  # cyan
+            [0.42, 1.00, 0.95, 0.20, 1.0],  # yellow
+            [0.58, 1.00, 0.55, 0.10, 1.0],  # orange
+            [0.74, 0.95, 0.10, 0.10, 1.0],  # red
+            [0.90, 0.65, 0.10, 0.85, 1.0],  # magenta
+            [1.00, 0.00, 0.00, 0.00, 1.0],  # black
+        ],
+        dtype=np.float32,
+    )
+    positions, colors = stops[:, 0], stops[:, 1:]
+    last_segment = len(positions) - 2
+
+    pal = np.empty((PALETTE_SIZE, 4), dtype=np.float32)
+    for i in range(PALETTE_SIZE):
+        t = i / (PALETTE_SIZE - 1)
+        # Last stop at or before t, clamped so that j + 1 is a valid stop.
+        j = int(np.searchsorted(positions, t, side="right")) - 1
+        j = max(0, min(j, last_segment))
+        lo, hi = positions[j], positions[j + 1]
+        # Fraction of the way from stop j to stop j + 1 (0 for an empty span).
+        seg = (t - lo) / (hi - lo) if hi > lo else 0.0
+        start_color, end_color = colors[j], colors[j + 1]
+        pal[i] = start_color + seg * (end_color - start_color)
+
+    # Flatten to (PALETTE_SIZE * 4,) so the byte layout matches a
+    # float4 x PALETTE_SIZE 1D Array.
+    return np.ascontiguousarray(pal.reshape(-1), dtype=np.float32)
+
+
+def make_palette_texture(arr):
+    """Create the palette TextureObject over `arr`: LINEAR filter, CLAMP addressing."""
+    lut_resource = ResourceDescriptor.from_array(arr)
+    lut_sampling = TextureDescriptor(
+        address_mode=AddressMode.CLAMP,
+        filter_mode=FilterMode.LINEAR,
+        # NORMALIZED_FLOAT changes nothing for FLOAT32 storage (the data is
+        # already in [0, 1]); it is set to document the read mode a palette
+        # kept in UINT8 storage would need.
+        read_mode=ReadMode.NORMALIZED_FLOAT,
+        normalized_coords=True,
+    )
+    return TextureObject.from_descriptor(resource=lut_resource, texture_descriptor=lut_sampling)
+
+
+# ================================== main() ==================================
+
+
+def main():
+    # --- Step 1: Set up CUDA (compile kernel, create stream) ---
+    dev, stream, kernel, config = setup_cuda()
+
+    # --- Step 2: Open a window ---
+    window, gl, pyglet = create_window()
+
+    # --- Step 3: Create GL resources for drawing a texture to screen ---
+    #     (Standard OpenGL boilerplate -- not CUDA-specific.)
+    shader_prog, quad_vao, tex_id = create_display_resources(gl, WIDTH, HEIGHT)
+
+    # --- Step 4: Create the Pixel Buffer Object (PBO) ---
+    #     The PBO is GPU memory owned by OpenGL. It's the bridge between the
+    #     two worlds: CUDA writes into it, OpenGL reads from it.
+    pbo_id, _ = create_pixel_buffer(gl, WIDTH, HEIGHT)
+
+    # --- Step 5: Register the PBO with CUDA ---
+    resource = GraphicsResource.from_gl_buffer(pbo_id, flags="write_discard")
+
+    # --- Step 6: Build and upload the palette LUT ---
+    #     One 1D Array, 256 entries of float4 RGBA. The host-side palette is
+    #     a flat numpy float32 array; copy_from() does an async H2D copy, so
+    #     we sync the stream once afterwards to make sure the data has landed
+    #     before we start sampling from it in the render loop.
+    host_palette = build_palette()
+    palette_arr = Array.from_descriptor(
+        shape=(PALETTE_SIZE,),
+        format=ArrayFormat.FLOAT32,
+        num_channels=4,
+    )
+    palette_arr.copy_from(host_palette, stream=stream)
+    stream.sync()
+
+    # --- Step 7: Bind the palette Array as a TextureObject (LUT) ---
+    palette_tex = make_palette_texture(palette_arr)
+
+    # --- Step 8: Render loop ---
+    start_time = time.monotonic()
+    frame_count = 0
+    fps_time = start_time
+
+    # View state. cx, cy, scale are kept in Python floats (double precision)
+    # and converted to np.float64 on each kernel launch.
+    view = {
+        "cx": float(DEFAULT_CX),
+        "cy": float(DEFAULT_CY),
+        "scale": float(DEFAULT_SCALE),
+        "max_iter": int(DEFAULT_MAX_ITER),
+        # Pan-drag state (left mouse button).
+        "dragging": False,
+    }
+
+    def screen_to_world(mouse_x, mouse_y):
+        """Return the world-space (x, y) shown at window pixel (mouse_x, mouse_y).
+
+        Both pyglet's window and the rendered texture put their origin at the
+        bottom-left, so the y coordinate is used as-is (no flip).
+        """
+        offset_x = mouse_x - WIDTH / 2.0
+        offset_y = mouse_y - HEIGHT / 2.0
+        return view["cx"] + offset_x * view["scale"], view["cy"] + offset_y * view["scale"]
+
+    @window.event
+    def on_key_press(symbol, _modifiers):
+        """Keyboard controls: Escape closes, R resets the view, [ / ] step max_iter."""
+        keys = pyglet.window.key
+        if symbol == keys.ESCAPE:
+            window.close()
+        elif symbol == keys.R:
+            # Restore the view to its startup defaults.
+            view.update(
+                cx=float(DEFAULT_CX),
+                cy=float(DEFAULT_CY),
+                scale=float(DEFAULT_SCALE),
+                max_iter=int(DEFAULT_MAX_ITER),
+            )
+        elif symbol == keys.BRACKETLEFT:
+            view["max_iter"] = max(MIN_MAX_ITER, view["max_iter"] - ITER_STEP)
+        elif symbol == keys.BRACKETRIGHT:
+            view["max_iter"] = min(MAX_MAX_ITER, view["max_iter"] + ITER_STEP)
+
+    @window.event
+    def on_mouse_press(_x, _y, button, _modifiers):
+        if button == pyglet.window.mouse.LEFT:  # left button only
+            view["dragging"] = True  # NOTE(review): written here but never read in main()
+
+    @window.event
+    def on_mouse_release(_x, _y, button, _modifiers):
+        if button == pyglet.window.mouse.LEFT:  # left button only
+            view["dragging"] = False  # NOTE(review): written here but never read in main()
+
+    @window.event
+    def on_mouse_drag(_x, _y, dx, dy, buttons, _modifiers):
+        if buttons & pyglet.window.mouse.LEFT:
+            # Pan: move the center opposite to the cursor drag on BOTH axes so
+            # the scene follows the cursor. Screen and world share a bottom-left
+            # origin (see screen_to_world), so y needs the same sign as x.
+            view["cx"] -= dx * view["scale"]
+            view["cy"] -= dy * view["scale"]
+
+    @window.event
+    def on_mouse_scroll(x, y, _scroll_x, scroll_y):
+        # Cursor-anchored zoom: keep the world point under the cursor pinned.
+        wx, wy = screen_to_world(x, y)
+        factor = 0.9 ** scroll_y  # 1.0 (no zoom) for purely horizontal scrolls
+        view["scale"] *= factor
+        # Back-solve cx, cy so screen pixel (x, y) still maps to (wx, wy).
+        view["cx"] = wx - (x - WIDTH / 2.0) * view["scale"]
+        view["cy"] = wy - (y - HEIGHT / 2.0) * view["scale"]
+
+    @window.event
+    def on_draw():
+        """Render one frame: run the kernel into the mapped PBO, blit it, update the title."""
+        nonlocal frame_count, fps_time
+        window.clear()
+
+        # (a) Map the PBO so CUDA can write to it. This gives us a Buffer
+        #     whose .handle is a device pointer pointing into the GL PBO.
+        with resource.map(stream=stream) as buf:
+            launch(
+                stream,
+                config,
+                kernel,
+                np.uint64(palette_tex.handle),  # bindless texture handle
+                buf.handle,                     # output PBO (RGBA8)
+                np.int32(WIDTH),                # kernel `width`
+                np.int32(HEIGHT),               # kernel `height`
+                np.float64(view["cx"]),         # view center, real axis
+                np.float64(view["cy"]),         # view center, imaginary axis
+                np.float64(view["scale"]),      # world units per pixel
+                np.int32(view["max_iter"]),     # iteration cap
+            )
+        # Unmap happens automatically when the `with` block exits.
+
+        # (b) Tell OpenGL to copy the PBO contents into our texture.
+        copy_pbo_to_texture(gl, pbo_id, tex_id, WIDTH, HEIGHT)
+
+        # (c) Draw the texture to the screen.
+        draw_fullscreen_quad(gl, shader_prog, quad_vao, tex_id)
+
+        # FPS counter (shown in window title), refreshed about once per second.
+        frame_count += 1
+        now = time.monotonic()
+        if now - fps_time >= 1.0:
+            fps = frame_count / (now - fps_time)
+            zoom = 1.0 / view["scale"] if view["scale"] > 0 else 0.0
+            window.set_caption(
+                "cuda.core Array/Texture - Mandelbrot"
+                f" | zoom {zoom:.3e}x"
+                f" | center ({view['cx']:.6f}, {view['cy']:.6f})"
+                f" | iter {view['max_iter']}"
+                f" | {fps:.0f} FPS"
+            )
+            frame_count = 0
+            fps_time = now
+
+    @window.event
+    def on_close():
+        # Release everything we opened, dependents first: the GL registration,
+        # then the texture before the Array it samples, then the stream. Each is
+        # a context manager too, but pyglet owns the event loop so we close by hand.
+        resource.close()
+        palette_tex.close()
+        palette_arr.close()
+        stream.close()
+
+    pyglet.app.run(interval=0)
+
+
+# ======================== GPU code (CUDA + GLSL) ============================
+#
+# These source strings are kept at the bottom of the file so they don't
+# distract from the Python logic above. The important things to know:
+#
+#   - KERNEL_SOURCE is a single CUDA C++ kernel `mandelbrot` that computes a
+#     smooth iteration count per pixel and looks up the color via
+#     tex1D<float4>(palette, t). Coordinates and the scale factor are doubles
+#     to support deep zooms; only the color lookup runs in single precision.
+#
+#   - VERTEX_SHADER_SOURCE / FRAGMENT_SHADER_SOURCE are GLSL. They draw a
+#     texture onto a rectangle covering the entire window. Nothing interesting.
+#
+# ============================================================================
+
+KERNEL_SOURCE = r"""
+// Mandelbrot deep-zoom kernel with a TextureObject palette LUT.
+//
+// Each thread computes one pixel. Coordinates and scale are doubles so the
+// zoom doesn't quantize at modest depth. Once we have the smooth iteration
+// count we narrow to float and use tex1D<float4> to read the palette.
+
+extern "C"
+__global__
+void mandelbrot(cudaTextureObject_t palette,
+                unsigned char* output,
+                int width, int height,
+                double cx, double cy, double scale,
+                int max_iter) {
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    if (x >= width || y >= height) return;
+
+    // Map pixel -> complex plane (doubles).
+    double c_re = cx + ((double)x - 0.5 * (double)width)  * scale;
+    double c_im = cy + ((double)y - 0.5 * (double)height) * scale;
+
+    // Standard escape iteration with bailout radius 2 (compare squared norm
+    // against 4 to skip the sqrt in the inner loop).
+    double zr = 0.0;
+    double zi = 0.0;
+    double zr2 = 0.0;
+    double zi2 = 0.0;
+    int iter = 0;
+    while (iter < max_iter && (zr2 + zi2) <= 4.0) {
+        zi = 2.0 * zr * zi + c_im;
+        zr = zr2 - zi2 + c_re;
+        zr2 = zr * zr;
+        zi2 = zi * zi;
+        ++iter;
+    }
+
+    unsigned char r, g, b;
+    if (iter >= max_iter) {
+        // Inside the set (or close enough): solid black.
+        r = 0;
+        g = 0;
+        b = 0;
+    } else {
+        // Smooth iteration count:
+        //   mu = iter + 1 - log(log(|z|)) / log(2)
+        //      = iter + 1 - log(0.5 * log(|z|^2)) / log(2)
+        // At escape, |z|^2 > 4, so 0.5 * log(|z|^2) > log(2) > 0 -- the
+        // outer log is well-defined. Compute in double, narrow to float
+        // for the palette lookup.
+        double log_zn = 0.5 * log(zr2 + zi2);
+        double nu = log(log_zn) / log(2.0);
+        float mu = (float)((double)(iter + 1) - nu);
+
+        // Cycle through the palette: 0.02 controls how quickly we wrap
+        // through the gradient as the iteration count climbs.
+        float t = fmodf(mu * 0.02f, 1.0f);
+        if (t < 0.0f) t += 1.0f;  // fmodf can return negative for negative mu
+
+        float4 rgba = tex1D<float4>(palette, t);
+
+        // Clamp before narrowing to bytes.
+        float fr = rgba.x; if (fr < 0.0f) fr = 0.0f; if (fr > 1.0f) fr = 1.0f;
+        float fg = rgba.y; if (fg < 0.0f) fg = 0.0f; if (fg > 1.0f) fg = 1.0f;
+        float fb = rgba.z; if (fb < 0.0f) fb = 0.0f; if (fb > 1.0f) fb = 1.0f;
+        r = (unsigned char)(fr * 255.0f);
+        g = (unsigned char)(fg * 255.0f);
+        b = (unsigned char)(fb * 255.0f);
+    }
+
+    int idx = (y * width + x) * 4;
+    output[idx + 0] = r;
+    output[idx + 1] = g;
+    output[idx + 2] = b;
+    output[idx + 3] = 255;
+}
+"""
+
+# GLSL shaders -- these just display a texture on a fullscreen rectangle.
+# Nothing CUDA-specific here.
+
+VERTEX_SHADER_SOURCE = """#version 330 core
+in vec2 position;
+in vec2 texcoord;
+out vec2 v_texcoord;
+void main() {
+    gl_Position = vec4(position, 0.0, 1.0);
+    v_texcoord = texcoord;
+}
+"""
+
+FRAGMENT_SHADER_SOURCE = """#version 330 core
+in vec2 v_texcoord;
+out vec4 fragColor;
+uniform sampler2D tex;
+void main() {
+    fragColor = texture(tex, v_texcoord);
+}
+"""
+
+
+if __name__ == "__main__":
+    main()
diff --git a/cuda_core/examples/gl_interop_mipmap_lod.py b/cuda_core/examples/gl_interop_mipmap_lod.py
new file mode 100644
index 00000000000..38b09513464
--- /dev/null
+++ b/cuda_core/examples/gl_interop_mipmap_lod.py
@@ -0,0 +1,717 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# ################################################################################
+#
+# This example demonstrates the new cuda.core texture/surface stack:
+# MipmappedArray, SurfaceObject, and a TextureObject that does trilinear
+# (LINEAR mipmap + LINEAR filter) sampling with user-controlled LOD bias.
+# Requires pyglet.
+#
+# ################################################################################
+
+# What this example teaches
+# =========================
+# How to allocate a mipmap pyramid as a single MipmappedArray, populate each
+# level from a CUDA kernel by binding it as a SurfaceObject, and then sample
+# the whole pyramid from a TextureObject with manual LOD bias.
+#
+# How it works
+# ============
+# A mipmap pyramid is a stack of progressively-halved images of the same
+# texture. The base level (level 0) holds the highest-resolution version; each
+# subsequent level L is a 2x2 box-filtered downsample of level L-1:
+#
+#     level 0: 512 x 512   <- highest detail
+#     level 1: 256 x 256
+#     level 2: 128 x 128
+#     ...
+#     level 9:   1 x 1     <- a single average color
+#
+# At sample time, the GPU picks the mip level that best matches the on-screen
+# size of the texel, optionally blending between adjacent levels (trilinear).
+# Selecting a coarser level than the "right" one is called a positive LOD bias
+# and produces a softer/blurrier image; a negative bias selects finer levels
+# (sharper but more aliased when undersampled).
+#
+#   +----------------------+       +-----------------------+
+#   |   MipmappedArray     |       |   TextureObject       |
+#   | (single allocation,  | <---  | (samples the whole    |
+#   |  10 mip levels)      |       |  pyramid w/ trilinear |
+#   +----------------------+       |  filtering)           |
+#         ^      ^                 +-----------------------+
+#         |      |
+#         |      +---- one SurfaceObject per level, used at BUILD time only
+#         |            to let a kernel write pixels into that level.
+#         |
+#         +----------- get_level(L) returns a NON-OWNING Array view of level L;
+#                      the storage belongs to the parent MipmappedArray.
+#
+#   STARTUP -- one-time mipmap build
+#   ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#   1. Allocate MipmappedArray (10 levels, float4 RGBA, surface_load_store=True).
+#   2. Level 0: launch `seed_base` kernel -> SurfaceObject -> high-frequency
+#      procedural pattern.
+#   3. For L = 1..num_levels-1: launch `downsample` kernel:
+#        - reads level L-1 through a TextureObject (POINT-filtered)
+#        - writes level L   through a SurfaceObject
+#        - 4-sample box average of the parent's 2x2 footprint.
+#
+#   PER FRAME (render loop)
+#   ~~~~~~~~~~~~~~~~~~~~~~~
+#   The display TextureObject samples the whole pyramid with `tex2DLod`,
+#   where the LOD is computed per-pixel as `log2(zoom) + lod_bias`. The result
+#   is written to a GL PBO via GraphicsResource, then drawn as a textured quad.
+#
+# What you should see
+# ===================
+# A 512x512 procedural pattern (concentric rings + diagonal grid) shown
+# stretched across the window. Use the mouse wheel to zoom in/out (this
+# implicitly changes the LOD), and use the bracket keys `[` / `]` to add a
+# manual LOD bias on top of that. Press `R` to reset.
+#
+#   Mouse wheel       zoom in / out
+#   [                 LOD bias -= 0.25  (sharper, more aliased)
+#   ]                 LOD bias += 0.25  (blurrier, samples a coarser level)
+#   R                 reset zoom + bias
+#   Escape / close    quit
+#
+# The window title shows the current zoom, manual bias, and effective LOD.
+# Close the window or press Escape to exit.
+#
+
+# /// script
+# dependencies = ["cuda_bindings", "cuda_core>0.6.0", "pyglet"]
+# ///
+
+import ctypes
+import math
+import sys
+import time
+
+import numpy as np
+
+from cuda.core import (
+    AddressMode,
+    Array,
+    ArrayFormat,
+    Device,
+    FilterMode,
+    GraphicsResource,
+    LaunchConfig,
+    MipmappedArray,
+    Program,
+    ProgramOptions,
+    ReadMode,
+    ResourceDescriptor,
+    SurfaceObject,
+    TextureDescriptor,
+    TextureObject,
+    launch,
+)
+
+# ---------------------------------------------------------------------------
+# Configuration (feel free to change these)
+# ---------------------------------------------------------------------------
+WIDTH = 800
+HEIGHT = 600
+BASE_SIZE = 512  # Texture base-level edge length (must be a power of two).
+LOD_BIAS_STEP = 0.25
+
+
+# ============================= Helper functions =============================
+#
+# The functions below set up CUDA, OpenGL, and the mipmap pyramid. If you're
+# here to learn about MipmappedArray / SurfaceObject / mipmapped TextureObject,
+# you can skip straight to main() -- the interesting part is there. These
+# helpers exist so that main() reads like a short story.
+# ============================================================================
+
+
+def _check_compute_capability(dev):
+    """Exit with status 1 unless `dev` is sm_30+ (surface load/store + mipmapped arrays)."""
+    cap = dev.compute_capability
+    if cap.major >= 3:
+        return
+    print(
+        f"This example requires compute capability >= 3.0, got sm_{cap.major}{cap.minor}.",
+        file=sys.stderr,
+    )
+    sys.exit(1)
+
+
+def setup_cuda():
+    """Select device 0, compile KERNEL_SOURCE, and look up its three kernels.
+
+    Returns
+    -------
+    (dev, stream, kernels, arch_str)
+        kernels maps "seed_base", "downsample" and "display" to their
+        compiled kernel objects; arch_str is "sm_" followed by dev.arch.
+    """
+    kernel_names = ("seed_base", "downsample", "display")
+
+    dev = Device(0)
+    dev.set_current()
+    _check_compute_capability(dev)
+    stream = dev.create_stream()
+
+    # One program holds all three kernels, so a single compile suffices.
+    program = Program(
+        KERNEL_SOURCE,
+        code_type="c++",
+        options=ProgramOptions(std="c++17", arch=f"sm_{dev.arch}"),
+    )
+    module = program.compile("cubin", name_expressions=kernel_names)
+    kernels = {name: module.get_kernel(name) for name in kernel_names}
+    return dev, stream, kernels, f"sm_{dev.arch}"
+
+
+def build_mipmap_pyramid(mip, num_levels, stream, kernels):
+    """Populate every level of `mip` using SurfaceObject writes.
+
+    Strategy
+    --------
+    * Level 0 is filled directly by `seed_base`, which writes a procedural
+      pattern through a SurfaceObject bound to level 0.
+    * Each subsequent level L is filled by `downsample`, which reads level L-1
+      through a POINT-filtered TextureObject and box-averages a 2x2 footprint
+      into level L through a SurfaceObject.
+    * Kernel launches are asynchronous, but closing a TextureObject or
+      SurfaceObject happens on the host when its `with` block exits. We
+      therefore sync before leaving each `with` block so no handle is
+      destroyed while a kernel that uses it may still be in flight.
+    """
+    block = (16, 16, 1)
+
+    # ---- Level 0: seed the base image -------------------------------------
+    base_arr = mip.get_level(0)  # non-owning view; do NOT use a `with` block
+    with SurfaceObject.from_array(base_arr) as base_surf:
+        grid = (
+            (BASE_SIZE + block[0] - 1) // block[0],
+            (BASE_SIZE + block[1] - 1) // block[1],
+            1,
+        )
+        launch(
+            stream,
+            LaunchConfig(grid=grid, block=block),
+            kernels["seed_base"],
+            np.uint64(base_surf.handle),
+            np.int32(BASE_SIZE),
+            np.int32(BASE_SIZE),
+        )
+        stream.sync()  # kernel must finish before base_surf is destroyed
+    # base_arr (non-owning) is allowed to fall out of scope here; the parent
+    # MipmappedArray keeps the underlying storage alive.
+
+    # ---- Levels 1..N-1: box-filter downsample ------------------------------
+    # Each iteration reads level (L-1) through a temporary TextureObject and
+    # writes level L through a temporary SurfaceObject.
+    src_tex_desc = TextureDescriptor(
+        address_mode=AddressMode.CLAMP,
+        filter_mode=FilterMode.POINT,        # explicit per-texel reads
+        read_mode=ReadMode.ELEMENT_TYPE,
+        normalized_coords=False,             # integer pixel coordinates
+    )
+    for level in range(1, num_levels):
+        parent_size = BASE_SIZE >> (level - 1)
+        level_size = BASE_SIZE >> level
+        if level_size < 1:
+            break
+
+        src_arr = mip.get_level(level - 1)
+        dst_arr = mip.get_level(level)
+        src_res = ResourceDescriptor.from_array(src_arr)
+        with TextureObject.from_descriptor(
+            resource=src_res, texture_descriptor=src_tex_desc
+        ) as src_tex, SurfaceObject.from_array(dst_arr) as dst_surf:
+            grid = (
+                (level_size + block[0] - 1) // block[0],
+                (level_size + block[1] - 1) // block[1],
+                1,
+            )
+            launch(
+                stream,
+                LaunchConfig(grid=grid, block=block),
+                kernels["downsample"],
+                np.uint64(src_tex.handle),
+                np.uint64(dst_surf.handle),
+                np.int32(parent_size),
+                np.int32(level_size),
+            )
+            # Wait here: leaving the `with` destroys src_tex / dst_surf.
+            stream.sync()
+        # src_arr, dst_arr (non-owning) fall out of scope; storage stays
+        # alive via the parent MipmappedArray.
+
+
+def create_window():
+    """Import pyglet lazily, open the viewer window, return (window, gl_module, pyglet).
+
+    Exits with status 1 and an install hint on stderr if pyglet cannot be imported.
+    """
+    try:
+        import pyglet
+        from pyglet.gl import gl as _gl
+    except ImportError:
+        hint = "This example requires pyglet >= 2.0.\nInstall it with:  pip install pyglet"
+        print(hint, file=sys.stderr)
+        sys.exit(1)
+
+    title = "MipmappedArray Example - Mipmap LOD viewer"
+    # vsync is disabled -- presumably so the FPS readout is not capped at the refresh rate.
+    window = pyglet.window.Window(
+        WIDTH, HEIGHT, caption=title, vsync=False
+    )
+    return window, _gl, pyglet
+
+
+def create_display_resources(gl, width, height):
+    """Standard GL boilerplate: a shader program, a fullscreen quad, and an
+    empty texture that we'll repeatedly fill from a PBO. Not CUDA-specific.
+
+    Returns (shader_program, vertex_array_id, texture_id).
+    """
+    from pyglet.graphics.shader import Shader, ShaderProgram
+
+    vert = Shader(VERTEX_SHADER_SOURCE, "vertex")
+    frag = Shader(FRAGMENT_SHADER_SOURCE, "fragment")
+    shader_prog = ShaderProgram(vert, frag)
+
+    quad_verts = np.array(
+        [
+            # x,  y,    s, t      (position + texture coordinate)
+            -1, -1, 0, 0,
+             1, -1, 1, 0,
+             1,  1, 1, 1,
+            -1, -1, 0, 0,
+             1,  1, 1, 1,
+            -1,  1, 0, 1,
+        ],
+        dtype=np.float32,  # must match GL_FLOAT in the attribute pointers below
+    )
+
+    vao = ctypes.c_uint(0)
+    gl.glGenVertexArrays(1, ctypes.byref(vao))  # GL writes the new name into `vao`
+    gl.glBindVertexArray(vao.value)
+
+    vbo = ctypes.c_uint(0)
+    gl.glGenBuffers(1, ctypes.byref(vbo))
+    gl.glBindBuffer(gl.GL_ARRAY_BUFFER, vbo.value)
+    gl.glBufferData(
+        gl.GL_ARRAY_BUFFER,
+        quad_verts.nbytes,
+        quad_verts.ctypes.data_as(ctypes.c_void_p),
+        gl.GL_STATIC_DRAW,
+    )
+
+    stride = 4 * 4  # 4 floats * 4 bytes each
+    pos_loc = gl.glGetAttribLocation(shader_prog.id, b"position")
+    gl.glEnableVertexAttribArray(pos_loc)  # position (x, y) starts at byte offset 0
+    gl.glVertexAttribPointer(pos_loc, 2, gl.GL_FLOAT, gl.GL_FALSE, stride, ctypes.c_void_p(0))
+
+    tc_loc = gl.glGetAttribLocation(shader_prog.id, b"texcoord")
+    gl.glEnableVertexAttribArray(tc_loc)  # texcoord (s, t) follows x, y: byte offset 8
+    gl.glVertexAttribPointer(tc_loc, 2, gl.GL_FLOAT, gl.GL_FALSE, stride, ctypes.c_void_p(8))
+
+    gl.glBindVertexArray(0)  # unbind the VAO now that its layout is recorded
+
+    tex = ctypes.c_uint(0)
+    gl.glGenTextures(1, ctypes.byref(tex))
+    gl.glBindTexture(gl.GL_TEXTURE_2D, tex.value)
+    gl.glTexParameteri(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_MIN_FILTER, gl.GL_LINEAR)
+    gl.glTexParameteri(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_MAG_FILTER, gl.GL_LINEAR)
+    gl.glTexImage2D(
+        gl.GL_TEXTURE_2D,
+        0,  # mip level
+        gl.GL_RGBA8,
+        width,
+        height,
+        0,  # border
+        gl.GL_RGBA,
+        gl.GL_UNSIGNED_BYTE,
+        None,  # no initial pixels; the texture is filled later from the PBO
+    )
+
+    return shader_prog, vao.value, tex.value
+
+
+def create_pixel_buffer(gl, width, height):
+    """Create the RGBA8 Pixel Buffer Object (PBO) that bridges CUDA and OpenGL.
+
+    Returns (pbo_gl_name, size_in_bytes).
+    """
+    size_in_bytes = width * height * 4  # 4 bytes per RGBA8 pixel
+    handle = ctypes.c_uint(0)
+    gl.glGenBuffers(1, ctypes.byref(handle))
+    gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, handle.value)
+    gl.glBufferData(gl.GL_PIXEL_UNPACK_BUFFER, size_in_bytes, None, gl.GL_DYNAMIC_DRAW)
+    gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, 0)
+    return handle.value, size_in_bytes
+
+
+def copy_pbo_to_texture(gl, pbo_id, tex_id, width, height):
+    """Copy pixel data from the PBO into the GL texture (GPU-to-GPU)."""
+    gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, pbo_id)
+    gl.glBindTexture(gl.GL_TEXTURE_2D, tex_id)
+    gl.glTexSubImage2D(
+        gl.GL_TEXTURE_2D,
+        0,  # mip level
+        0,  # x offset
+        0,  # y offset
+        width,
+        height,
+        gl.GL_RGBA,
+        gl.GL_UNSIGNED_BYTE,
+        None,  # with a PBO bound to GL_PIXEL_UNPACK_BUFFER this is a byte offset into it (0)
+    )
+    gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, 0)  # unbind the PBO again
+
+
+def draw_fullscreen_quad(gl, shader_prog, vao_id, tex_id):
+    """Draw the texture to the screen using the fullscreen quad."""
+    gl.glUseProgram(shader_prog.id)
+    gl.glBindTexture(gl.GL_TEXTURE_2D, tex_id)
+    gl.glBindVertexArray(vao_id)
+    gl.glDrawArrays(gl.GL_TRIANGLES, 0, 6)  # 6 vertices = the quad's two triangles
+    gl.glBindVertexArray(0)  # unbind the VAO
+    gl.glUseProgram(0)  # unbind the shader program
+
+
+# ================================== main() ==================================
+
+
+def main():
+    # --- Step 1: Set up CUDA (compile kernels, create stream) ---
+    dev, stream, kernels, _arch = setup_cuda()
+
+    # --- Step 2: Allocate the mipmap pyramid and build every level ---
+    #     surface_load_store=True is required for kernel-side writes.
+    num_levels = int(math.log2(BASE_SIZE)) + 1
+    mip = MipmappedArray.from_descriptor(
+        shape=(BASE_SIZE, BASE_SIZE),
+        format=ArrayFormat.FLOAT32,
+        num_channels=4,
+        num_levels=num_levels,
+        surface_load_store=True,
+    )
+    build_mipmap_pyramid(mip, num_levels, stream, kernels)
+
+    # --- Step 3: Bind the WHOLE pyramid as a trilinear-filtered texture ---
+    #     Normalized coordinates (0..1) make zoom-by-uv simple. The texture
+    #     descriptor's mipmap_level_bias stays 0.0; the display kernel
+    #     receives the user-controlled bias as a kernel argument and folds
+    #     it into the tex2DLod call (avoids rebuilding the TextureObject
+    #     whenever the user changes the bias).
+    display_tex_desc = TextureDescriptor(
+        address_mode=AddressMode.WRAP,
+        filter_mode=FilterMode.LINEAR,
+        read_mode=ReadMode.ELEMENT_TYPE,
+        normalized_coords=True,
+        mipmap_filter_mode=FilterMode.LINEAR,    # trilinear
+        mipmap_level_bias=0.0,
+        min_mipmap_level_clamp=0.0,
+        max_mipmap_level_clamp=float(num_levels - 1),
+    )
+    display_tex = TextureObject.from_descriptor(
+        resource=ResourceDescriptor.from_mipmapped_array(mip),
+        texture_descriptor=display_tex_desc,
+    )
+
+    # --- Step 4: Open a window and set up the GL/CUDA bridge ---
+    window, gl, pyglet = create_window()
+    shader_prog, quad_vao, tex_id = create_display_resources(gl, WIDTH, HEIGHT)
+    pbo_id, _ = create_pixel_buffer(gl, WIDTH, HEIGHT)
+    resource = GraphicsResource.from_gl_buffer(pbo_id, flags="write_discard")
+
+    # --- Step 5: Render loop state ---
+    # `zoom` controls how big a texel is on screen: zoom > 1 stretches the
+    # texture and selects coarser mip levels (positive LOD); zoom < 1 shrinks
+    # the texture and selects finer levels. `lod_bias` is a manual offset
+    # added on top.
+    state = {"zoom": 1.0, "lod_bias": 0.0}
+    start_time = time.monotonic()
+    frame_count = [0]
+    fps_time = [start_time]
+
+    block = (16, 16, 1)
+    grid = (
+        (WIDTH + block[0] - 1) // block[0],
+        (HEIGHT + block[1] - 1) // block[1],
+        1,
+    )
+    config = LaunchConfig(grid=grid, block=block)
+
+    def effective_lod():
+        """Return log2(zoom) + lod_bias clamped to [0, num_levels - 1] for the title bar."""
+        top_level = float(num_levels - 1)
+        unclamped = math.log2(max(state["zoom"], 1e-6)) + state["lod_bias"]
+        return max(0.0, min(top_level, unclamped))
+
+    @window.event
+    def on_draw():
+        """Render one frame: sample the pyramid into the PBO, blit it, update the title."""
+        window.clear()
+        # (a) Map the PBO so CUDA can write into it.
+        with resource.map(stream=stream) as buf:
+            # (b) Launch the display kernel -- samples the mipmap and writes RGBA.
+            launch(
+                stream,
+                config,
+                kernels["display"],
+                buf.handle,                          # mapped PBO (output)
+                np.int32(WIDTH),
+                np.int32(HEIGHT),
+                np.uint64(display_tex.handle),       # texture over the whole pyramid
+                np.float32(state["zoom"]),
+                np.float32(state["lod_bias"]),
+                np.float32(float(num_levels - 1)),   # highest valid mip level
+            )
+        # (c) Unmap happens automatically; cuGraphicsUnmapResources serializes
+        #     the CUDA work against subsequent OpenGL use.
+
+        copy_pbo_to_texture(gl, pbo_id, tex_id, WIDTH, HEIGHT)
+        draw_fullscreen_quad(gl, shader_prog, quad_vao, tex_id)
+
+        frame_count[0] += 1
+        now = time.monotonic()
+        if now - fps_time[0] >= 1.0:  # refresh the title about once per second
+            fps = frame_count[0] / (now - fps_time[0])
+            window.set_caption(
+                f"MipmappedArray LOD viewer "
+                f"({WIDTH}x{HEIGHT}, {fps:.0f} FPS) -- "
+                f"zoom={state['zoom']:.2f}, "
+                f"bias={state['lod_bias']:+.2f}, "
+                f"LOD={effective_lod():.2f}"
+            )
+            frame_count[0] = 0
+            fps_time[0] = now
+
+    @window.event
+    def on_mouse_scroll(x, y, scroll_x, scroll_y):
+        """Scale zoom by 1.125 per vertical wheel step, clamped to [1/64, 64]."""
+        if scroll_y != 0:
+            min_zoom, max_zoom = 1.0 / 64.0, 64.0
+            scaled = state["zoom"] * 1.125 ** scroll_y
+            state["zoom"] = max(min_zoom, min(max_zoom, scaled))
+
+    @window.event
+    def on_key_press(symbol, modifiers):
+        """Handle `[` / `]` (step the LOD bias) and `R` (reset zoom and bias)."""
+        keys = pyglet.window.key
+        # The manual bias is kept within +/- num_levels.
+        bias_limit = float(num_levels)
+        if symbol == keys.R:
+            state.update(zoom=1.0, lod_bias=0.0)
+        elif symbol == keys.BRACKETLEFT:
+            lowered = state["lod_bias"] - LOD_BIAS_STEP
+            state["lod_bias"] = max(-bias_limit, lowered)
+        elif symbol == keys.BRACKETRIGHT:
+            raised = state["lod_bias"] + LOD_BIAS_STEP
+            state["lod_bias"] = min(bias_limit, raised)
+
+    @window.event
+    def on_close():
+        # Release CUDA-side resources in reverse construction order. GL
+        # objects clean up via pyglet on window close.
+        resource.close()  # GraphicsResource registered for the PBO (created last)
+        display_tex.close()  # TextureObject built over `mip`
+        mip.close()  # the MipmappedArray itself
+        stream.close()  # stream from setup_cuda() (created first)
+
+    pyglet.app.run(interval=0)
+
+
+# ======================== GPU code (CUDA + GLSL) ============================
+#
+# Three CUDA kernels are concatenated into one program string so they share a
+# single NVRTC compile. All three operate on float4 RGBA pixels.
+#
+#   seed_base   -- writes a high-frequency procedural pattern to level 0 via a
+#                  SurfaceObject. NOTE: surf2Dwrite's x-coordinate is in BYTES,
+#                  not in elements, so we multiply by sizeof(float4) every time.
+#
+#   downsample  -- reads level L-1 through a POINT-filtered TextureObject and
+#                  writes the 2x2 box average to level L through a SurfaceObject.
+#                  tex2D with non-normalized coords needs the +0.5 half-texel
+#                  offset to hit exact texel centers.
+#
+#   display     -- samples the WHOLE mipmap pyramid with tex2DLod, where the
+#                  per-thread LOD is `clamp(log2(zoom) + lod_bias, 0, maxLod)`.
+#                  Writes 8-bit RGBA into the PBO.
+#
+# GLSL shaders at the very bottom just draw a textured quad. Nothing CUDA-
+# specific there.
+#
+# ============================================================================
+
+KERNEL_SOURCE = r"""
+// --------------------------------------------------------------------------
+// Helper: clamp v to [a, b]; callers are expected to pass a <= b.
+// --------------------------------------------------------------------------
+__device__ __forceinline__ float clampf(float v, float a, float b) {
+    return fminf(fmaxf(v, a), b);
+}
+
+// CUDA has no builtin "fract"; this returns v - floorf(v) (used by seed_base).
+__device__ __forceinline__ float fracf(float v) {
+    return v - floorf(v);
+}
+
+// --------------------------------------------------------------------------
+// seed_base: write a procedural high-frequency pattern to level 0.
+//
+// surf is a SurfaceObject bound to the level-0 Array (float4 RGBA). The
+// pattern is a colorful blend of concentric rings, an axis-aligned grid, and
+// an angular sweep, designed to have plenty of fine detail so the difference
+// between mip levels is visually obvious.
+// --------------------------------------------------------------------------
+extern "C" __global__
+void seed_base(cudaSurfaceObject_t surf, int width, int height) {
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    if (x >= width || y >= height) return;
+
+    float u = ((float)x + 0.5f) / (float)width;
+    float v = ((float)y + 0.5f) / (float)height;
+
+    // Concentric rings centered on the image.
+    float cx = u - 0.5f;
+    float cy = v - 0.5f;
+    float r = sqrtf(cx * cx + cy * cy);
+    float rings = 0.5f + 0.5f * sinf(r * 80.0f);
+
+    // Axis-aligned grid -- one thin line per 1/16 of the image along u and v.
+    float gx = fabsf(fracf(u * 16.0f) - 0.5f);
+    float gy = fabsf(fracf(v * 16.0f) - 0.5f);
+    float grid = (gx < 0.05f || gy < 0.05f) ? 1.0f : 0.0f;
+
+    // Angular sweep gives the rings some color variation.
+    float theta = atan2f(cy, cx);
+    float sweep = 0.5f + 0.5f * sinf(theta * 6.0f);
+
+    // Combine into an RGBA color. Keep values in [0, 1].
+    float red   = clampf(rings * (0.4f + 0.6f * sweep) + 0.3f * grid, 0.0f, 1.0f);
+    float green = clampf(rings * (0.6f - 0.4f * sweep) + 0.3f * grid, 0.0f, 1.0f);
+    float blue  = clampf(0.4f + 0.4f * sweep + 0.5f * grid,            0.0f, 1.0f);
+    float alpha = 1.0f;
+
+    float4 px = make_float4(red, green, blue, alpha);
+
+    // Surface writes index x in BYTES (this is the classic gotcha).
+    surf2Dwrite<float4>(px, surf, x * (int)sizeof(float4), y);
+}
+
+// --------------------------------------------------------------------------
+// downsample: box-filter a 2x2 footprint of the parent level into one texel.
+//
+// src is a POINT-filtered TextureObject bound to level (L-1).
+// dst is a SurfaceObject bound to level L.
+// dst_size is the edge length of level L (levels are treated as square) and
+// drives the bounds check. src_size is accepted for the caller's convenience
+// but is never read; the parent level is presumably 2 * dst_size on a side.
+//
+// Texture coordinates: tex2D with non-normalized coords returns texel (i, j)
+// when sampled at (i + 0.5, j + 0.5). So for output texel (x, y) the four
+// parent texels live at parent-coords (2x + 0.5, 2y + 0.5), (2x + 1.5, ...).
+// --------------------------------------------------------------------------
+extern "C" __global__
+void downsample(cudaTextureObject_t src,
+                cudaSurfaceObject_t dst,
+                int src_size,
+                int dst_size) {
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    if (x >= dst_size || y >= dst_size) return;
+
+    float fx = 2.0f * (float)x;
+    float fy = 2.0f * (float)y;
+
+    float4 a = tex2D<float4>(src, fx + 0.5f, fy + 0.5f);
+    float4 b = tex2D<float4>(src, fx + 1.5f, fy + 0.5f);
+    float4 c = tex2D<float4>(src, fx + 0.5f, fy + 1.5f);
+    float4 d = tex2D<float4>(src, fx + 1.5f, fy + 1.5f);
+
+    float4 px;
+    px.x = 0.25f * (a.x + b.x + c.x + d.x);
+    px.y = 0.25f * (a.y + b.y + c.y + d.y);
+    px.z = 0.25f * (a.z + b.z + c.z + d.z);
+    px.w = 0.25f * (a.w + b.w + c.w + d.w);
+
+    // src_size is never read; the cast only silences the unused-parameter warning.
+    (void)src_size;
+
+    surf2Dwrite<float4>(px, dst, x * (int)sizeof(float4), y);
+}
+
+// --------------------------------------------------------------------------
+// display: per-pixel mipmap sample with manual LOD bias.
+//
+// tex is a TextureObject built from the whole MipmappedArray (LINEAR +
+// LINEAR mipmap filter, normalized coords). For each output pixel we compute
+// a single per-thread LOD from `zoom` and `lod_bias`, then sample with
+// tex2DLod. Output is written as RGBA8 into a linear byte buffer.
+// --------------------------------------------------------------------------
+extern "C" __global__
+void display(unsigned char *output,
+             int width,
+             int height,
+             cudaTextureObject_t tex,
+             float zoom,
+             float lod_bias,
+             float max_lod) {
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    if (x >= width || y >= height) return;
+
+    // Normalized window coords in [0, 1].
+    float u = ((float)x + 0.5f) / (float)width;
+    float v = ((float)y + 0.5f) / (float)height;
+
+    // Zoom around the window center so the user sees the effect symmetrically.
+    u = (u - 0.5f) * zoom + 0.5f;
+    v = (v - 0.5f) * zoom + 0.5f;
+
+    // LOD: zoom > 1 widens the UV window, so each screen pixel spans more
+    // texels (the texture is minified), which calls for a coarser (higher)
+    // mip level. log2(zoom) yields exactly that. lod_bias is added on top,
+    // and the final value is clamped to the legal range.
+    float lod = log2f(fmaxf(zoom, 1e-6f)) + lod_bias;
+    lod = clampf(lod, 0.0f, max_lod);
+
+    float4 c = tex2DLod<float4>(tex, u, v, lod);
+
+    int idx = (y * width + x) * 4;
+    output[idx + 0] = (unsigned char)(clampf(c.x, 0.0f, 1.0f) * 255.0f);
+    output[idx + 1] = (unsigned char)(clampf(c.y, 0.0f, 1.0f) * 255.0f);
+    output[idx + 2] = (unsigned char)(clampf(c.z, 0.0f, 1.0f) * 255.0f);
+    output[idx + 3] = 255;
+}
+"""
+
+# GLSL shaders -- these just display a texture on a fullscreen rectangle.
+# Nothing CUDA-specific here.
+
+VERTEX_SHADER_SOURCE = """#version 330 core
+in vec2 position;
+in vec2 texcoord;
+out vec2 v_texcoord;
+void main() {
+    gl_Position = vec4(position, 0.0, 1.0);
+    v_texcoord = texcoord;
+}
+"""
+
+FRAGMENT_SHADER_SOURCE = """#version 330 core
+in vec2 v_texcoord;
+out vec4 fragColor;
+uniform sampler2D tex;
+void main() {
+    fragColor = texture(tex, v_texcoord);
+}
+"""
+
+
+if __name__ == "__main__":
+    main()
diff --git a/cuda_core/examples/gl_interop_ocean.py b/cuda_core/examples/gl_interop_ocean.py
new file mode 100644
index 00000000000..177e7b8d320
--- /dev/null
+++ b/cuda_core/examples/gl_interop_ocean.py
@@ -0,0 +1,836 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# ################################################################################
+#
+# This example demonstrates cuda.core.Array, TextureObject, and SurfaceObject
+# in combination with GraphicsResource for CUDA/OpenGL interop. A real-time
+# Gerstner-wave ocean is rebuilt every frame: a heightmap Array is rewritten
+# through a SurfaceObject, sampled through a TextureObject with LINEAR + WRAP
+# filtering for normal estimation, and shaded with Phong + Fresnel sky
+# reflection straight into an OpenGL PBO. Requires pyglet.
+#
+# ################################################################################
+
+# What this example teaches
+# =========================
+# - How to use a CUDA Array as a typed heightmap that is simultaneously
+#   written by one kernel (via SurfaceObject) and sampled by another (via
+#   TextureObject) within the same frame.
+# - How LINEAR filtering + WRAP addressing + normalized coordinates gives
+#   essentially-free bilinear neighbor lookups for finite-difference normal
+#   estimation on a tiling heightmap.
+# - How to compose Array/TextureObject/SurfaceObject with GraphicsResource so
+#   the entire render path never leaves the GPU.
+#
+# How it works
+# ============
+# Gerstner waves are a sum of N moving sinusoids with directional vectors --
+# a classic ocean approximation that looks shockingly close to FFT ocean at a
+# glance without any external library dependencies. For each heightmap texel:
+#
+#     h(x, z, t) = sum_i  A_i * sin( D_i . (x, z) * k_i  -  w_i * t  +  phi_i )
+#
+# where k_i = 2*pi / wavelength_i and w_i = sqrt(g * k_i) is the dispersion
+# relation for deep-water gravity waves. We bake 12 waves with hand-picked
+# directions / wavelengths / amplitudes / phases into the kernel as constant
+# arrays. Weather presets just scale amplitude and speed at the host level.
+#
+#   PER FRAME (all on GPU)
+#   ~~~~~~~~~~~~~~~~~~~~~~
+#   +-----------------+   surf2Dwrite   +--------------+
+#   |   update_height | --------------> |  heightmap   |
+#   |     kernel      |                 |    Array     |
+#   +-----------------+                 |  (FLOAT32)   |
+#                                       +--------------+
+#                                              |
+#                                              | tex2D<float> (LINEAR + WRAP)
+#                                              v
+#                                       +-----------------+    write RGBA8
+#                                       |  render_ocean   | ----------------> PBO
+#                                       |     kernel      |
+#                                       +-----------------+
+#
+# Why LINEAR + WRAP + normalized coords?
+# --------------------------------------
+# WRAP / MIRROR addressing modes require normalized coordinates (see the CUDA
+# Programming Guide). WRAP repeats the heightmap tile out to the horizon for
+# free (the hand-picked waves are not periodic over one tile, so tile borders
+# are not perfectly seamless). LINEAR filtering gives the four-tap finite-
+# difference normal estimate bilinear smoothing at no extra ALU cost.
+#
+# Channel byte width in surf2Dwrite
+# ---------------------------------
+# surf2Dwrite takes the x coordinate in BYTES, not in elements. For a
+# single-channel float surface that means `x * sizeof(float)` = `x * 4`.
+# Getting this wrong silently corrupts every other column.
+#
+# What you should see
+# ===================
+# A window showing a real-time animated ocean rendered with Phong shading and
+# a Fresnel-modulated sky reflection. Drag with the left mouse button to
+# orbit, scroll to zoom, press 1/2/3 to switch weather presets (calm /
+# breezy / stormy), press P to pause animation, Escape to exit. Window title
+# shows preset name and FPS.
+#
+
+# /// script
+# dependencies = ["cuda_bindings", "cuda_core>0.6.0", "pyglet"]
+# ///
+
+import ctypes
+import math
+import sys
+import time
+
+import numpy as np
+
+from cuda.core import (
+    AddressMode,
+    Array,
+    ArrayFormat,
+    Device,
+    FilterMode,
+    GraphicsResource,
+    LaunchConfig,
+    Program,
+    ProgramOptions,
+    ReadMode,
+    ResourceDescriptor,
+    SurfaceObject,
+    TextureDescriptor,
+    TextureObject,
+    launch,
+)
+
+# ---------------------------------------------------------------------------
+# Window and heightmap dimensions (feel free to change these)
+# ---------------------------------------------------------------------------
+WIDTH = 1024
+HEIGHT = 768
+GRID = 512  # heightmap resolution (GRID x GRID texels)
+
+# Weather presets: (amplitude_scale, speed_scale, label), keyed by digit key.
+# These are applied as multiplicative scalars on top of the per-wave
+# amplitudes baked into the kernel and the angular frequencies it derives
+# from the baked-in wavelengths, so one compiled binary renders every preset.
+PRESETS = {
+    "1": (0.35, 0.7, "calm"),
+    "2": (1.00, 1.0, "breezy"),
+    "3": (1.85, 1.4, "stormy"),
+}
+DEFAULT_PRESET = "2"
+
+# Initial camera (orbit-around-origin) parameters.
+INITIAL_YAW = 0.6        # radians around world-y
+INITIAL_PITCH = 0.35     # radians above the horizon (small positive = looking down)
+INITIAL_DISTANCE = 5.0   # camera distance from origin
+PITCH_LIMIT = 1.4        # clamp |pitch| to keep basis non-degenerate (< pi/2)
+ZOOM_MIN = 1.5           # closest camera distance reachable by scrolling
+ZOOM_MAX = 30.0          # farthest camera distance reachable by scrolling
+
+
+# ============================= Helper functions =============================
+#
+# The functions below set up CUDA and OpenGL. If you're here to learn about
+# Array/TextureObject/SurfaceObject, skip ahead to main() -- the interesting
+# part is there. These helpers exist so that main() reads like a short story
+# instead of a wall of boilerplate.
+# ============================================================================
+
+
+def setup_cuda():
+    """Build the CUDA side of the demo: device, stream, kernels, launch configs.
+
+    Returns (device, stream, kernels, configs). `kernels` and `configs` are
+    dicts keyed by "update" (update_height, launched over the GRID x GRID
+    heightmap) and "render" (render_ocean, launched over the WIDTH x HEIGHT
+    output pixels). Exits with status 1 on devices older than sm_30.
+    """
+    device = Device(0)
+    device.set_current()
+
+    # Surface load/store itself dates back to SM 2.0, but the bindless
+    # surface objects used here (cuSurfObjectCreate) need SM 3.0 or newer.
+    capability = device.compute_capability
+    if capability.major < 3:
+        message = (
+            "This example requires a GPU with compute capability >= 3.0 for "
+            f"bindless surface objects. Found sm_{capability.major}{capability.minor}."
+        )
+        print(message, file=sys.stderr)
+        sys.exit(1)
+
+    stream = device.create_stream()
+
+    # Compile as C++ so the templated tex2D<float> overload resolves.
+    entry_points = ("update_height", "render_ocean")
+    options = ProgramOptions(std="c++17", arch=f"sm_{device.arch}")
+    module = Program(KERNEL_SOURCE, code_type="c++", options=options).compile(
+        "cubin",
+        name_expressions=entry_points,
+    )
+
+    kernels = {
+        "update": module.get_kernel("update_height"),
+        "render": module.get_kernel("render_ocean"),
+    }
+
+    # Round each grid dimension up so partially filled blocks are still launched.
+    block = (16, 16, 1)
+
+    def grid_for(extent_x, extent_y):
+        return (
+            (extent_x + block[0] - 1) // block[0],
+            (extent_y + block[1] - 1) // block[1],
+            1,
+        )
+
+    configs = {
+        "update": LaunchConfig(grid=grid_for(GRID, GRID), block=block),
+        "render": LaunchConfig(grid=grid_for(WIDTH, HEIGHT), block=block),
+    }
+    return device, stream, kernels, configs
+
+
+def create_window():
+    """Import pyglet lazily, open the demo window, return (window, gl, pyglet).
+
+    Prints an install hint to stderr and exits with status 1 when pyglet
+    cannot be imported.
+    """
+    try:
+        import pyglet
+        from pyglet.gl import gl as _gl
+    except ImportError:
+        hint = "This example requires pyglet >= 2.0.\nInstall it with:  pip install pyglet"
+        print(hint, file=sys.stderr)
+        sys.exit(1)
+
+    title = "cuda.core Array/Texture/Surface - Gerstner Ocean"
+    window = pyglet.window.Window(
+        WIDTH, HEIGHT, caption=title, vsync=False
+    )
+    return window, _gl, pyglet
+
+
+def create_display_resources(gl, width, height):
+    """Create the GL objects needed to show a texture on screen.
+
+    Standard OpenGL boilerplate -- not CUDA-specific. Returns
+    (shader_program, vao_id, tex_id); the quad's VBO id is not returned.
+    The shader_program is a pyglet ShaderProgram object (must be kept alive).
+    """
+    from pyglet.graphics.shader import Shader, ShaderProgram
+
+    vert = Shader(VERTEX_SHADER_SOURCE, "vertex")
+    frag = Shader(FRAGMENT_SHADER_SOURCE, "fragment")
+    shader_prog = ShaderProgram(vert, frag)
+
+    # Fullscreen quad: two triangles, each vertex laid out as (x, y, u, v).
+    quad_verts = np.array(
+        [
+            -1, -1, 0, 0,
+             1, -1, 1, 0,
+             1,  1, 1, 1,
+            -1, -1, 0, 0,
+             1,  1, 1, 1,
+            -1,  1, 0, 1,
+        ],
+        dtype=np.float32,
+    )
+
+    vao = ctypes.c_uint(0)
+    gl.glGenVertexArrays(1, ctypes.byref(vao))
+    gl.glBindVertexArray(vao.value)
+
+    vbo = ctypes.c_uint(0)
+    gl.glGenBuffers(1, ctypes.byref(vbo))
+    gl.glBindBuffer(gl.GL_ARRAY_BUFFER, vbo.value)
+    gl.glBufferData(
+        gl.GL_ARRAY_BUFFER,
+        quad_verts.nbytes,
+        quad_verts.ctypes.data_as(ctypes.c_void_p),
+        gl.GL_STATIC_DRAW,
+    )
+
+    stride = 4 * 4  # 4 float32 values per vertex * 4 bytes each
+    pos_loc = gl.glGetAttribLocation(shader_prog.id, b"position")
+    gl.glEnableVertexAttribArray(pos_loc)
+    gl.glVertexAttribPointer(pos_loc, 2, gl.GL_FLOAT, gl.GL_FALSE, stride, ctypes.c_void_p(0))
+    tc_loc = gl.glGetAttribLocation(shader_prog.id, b"texcoord")
+    gl.glEnableVertexAttribArray(tc_loc)
+    gl.glVertexAttribPointer(tc_loc, 2, gl.GL_FLOAT, gl.GL_FALSE, stride, ctypes.c_void_p(8))
+    gl.glBindVertexArray(0)
+
+    tex = ctypes.c_uint(0)
+    gl.glGenTextures(1, ctypes.byref(tex))
+    gl.glBindTexture(gl.GL_TEXTURE_2D, tex.value)
+    gl.glTexParameteri(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_MIN_FILTER, gl.GL_LINEAR)
+    gl.glTexParameteri(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_MAG_FILTER, gl.GL_LINEAR)
+    gl.glTexImage2D(
+        gl.GL_TEXTURE_2D, 0, gl.GL_RGBA8, width, height, 0,
+        gl.GL_RGBA, gl.GL_UNSIGNED_BYTE, None,
+    )
+    return shader_prog, vao.value, tex.value
+
+
+def create_pixel_buffer(gl, width, height):
+    """Create a PBO sized for one RGBA8 frame and return (pbo_id, nbytes)."""
+    pbo = ctypes.c_uint(0)
+    gl.glGenBuffers(1, ctypes.byref(pbo))
+    gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, pbo.value)
+    nbytes = width * height * 4  # 4 bytes per RGBA8 pixel
+    gl.glBufferData(gl.GL_PIXEL_UNPACK_BUFFER, nbytes, None, gl.GL_DYNAMIC_DRAW)
+    gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, 0)
+    return pbo.value, nbytes
+
+
+def copy_pbo_to_texture(gl, pbo_id, tex_id, width, height):
+    """Copy a width x height RGBA8 frame from the PBO into the GL texture (GPU-to-GPU)."""
+    gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, pbo_id)
+    gl.glBindTexture(gl.GL_TEXTURE_2D, tex_id)
+    gl.glTexSubImage2D(
+        gl.GL_TEXTURE_2D, 0, 0, 0, width, height,
+        gl.GL_RGBA, gl.GL_UNSIGNED_BYTE, None,
+    )
+    gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, 0)  # leave no PBO bound afterwards
+
+
+def draw_fullscreen_quad(gl, shader_prog, vao_id, tex_id):
+    """Draw `tex_id` with `shader_prog` using the six quad vertices in `vao_id`."""
+    gl.glUseProgram(shader_prog.id)
+    gl.glBindTexture(gl.GL_TEXTURE_2D, tex_id)
+    gl.glBindVertexArray(vao_id)
+    gl.glDrawArrays(gl.GL_TRIANGLES, 0, 6)  # two triangles
+    gl.glBindVertexArray(0)
+    gl.glUseProgram(0)
+
+
+def make_heightmap_array():
+    """Allocate the GRID x GRID one-channel FLOAT32 Array with surface access on."""
+    layout = {"shape": (GRID, GRID), "format": ArrayFormat.FLOAT32}
+    return Array.from_descriptor(
+        num_channels=1,
+        surface_load_store=True,
+        **layout,
+    )
+
+
+def make_height_texture(arr):
+    """Wrap `arr` in a TextureObject that samples LINEAR + WRAP + normalized."""
+    source = ResourceDescriptor.from_array(arr)
+    sampling = TextureDescriptor(
+        # WRAP/MIRROR addressing modes require normalized coordinates.
+        normalized_coords=True,
+        address_mode=AddressMode.WRAP,
+        filter_mode=FilterMode.LINEAR,
+        read_mode=ReadMode.ELEMENT_TYPE,
+    )
+    return TextureObject.from_descriptor(resource=source, texture_descriptor=sampling)
+
+
+def orbit_camera_position(yaw, pitch, distance):
+    """Return the world-space (x, y, z) of a camera orbiting the origin.
+
+    World up is +y and `pitch` is the elevation above the xz-plane, so
+    pitch=0 sits on the horizon and larger positive values climb toward
+    overhead. `yaw` sweeps the camera around the y axis; yaw=0 places it
+    on the +z side.
+    """
+    # Radius of the orbit circle once the camera is lifted by `pitch`.
+    ground_radius = distance * math.cos(pitch)
+    return (
+        ground_radius * math.sin(yaw),
+        distance * math.sin(pitch),
+        ground_radius * math.cos(yaw),
+    )
+
+
+# ================================== main() ==================================
+
+
+def main():
+    # --- Step 1: Set up CUDA (compile kernels, create stream) ---
+    dev, stream, kernels, configs = setup_cuda()
+
+    # --- Step 2: Open a window ---
+    window, gl, pyglet = create_window()
+
+    # --- Step 3: Create GL resources for drawing a texture to screen ---
+    shader_prog, quad_vao, tex_id = create_display_resources(gl, WIDTH, HEIGHT)
+
+    # --- Step 4: Create the Pixel Buffer Object (PBO) ---
+    pbo_id, _ = create_pixel_buffer(gl, WIDTH, HEIGHT)
+
+    # --- Step 5: Register the PBO with CUDA ---
+    resource = GraphicsResource.from_gl_buffer(pbo_id, flags="write_discard")
+
+    # --- Step 6: Allocate the heightmap Array and build its texture/surface ---
+    #     We pre-create both the TextureObject (read path) and the
+    #     SurfaceObject (write path) once and reuse them every frame. Creating
+    #     them inside the per-frame loop would work but adds per-frame overhead
+    #     and risks lifetime issues with async kernel launches.
+    height_arr = make_heightmap_array()
+    height_tex = make_height_texture(height_arr)
+    height_surf = SurfaceObject.from_array(height_arr)
+
+    # --- Step 7: Camera + animation state ---
+    state = {
+        "preset": DEFAULT_PRESET,
+        "yaw": INITIAL_YAW,
+        "pitch": INITIAL_PITCH,
+        "distance": INITIAL_DISTANCE,
+        "drag": False,
+        "paused": False,
+        "t_anim": 0.0,
+        "t_prev": time.monotonic(),
+    }
+
+    # --- Step 8: Render loop ---
+    frame_count = 0
+    fps_time = state["t_prev"]
+
+    @window.event
+    def on_draw():
+        """Advance the animation, run both kernels, and present the frame."""
+        nonlocal frame_count, fps_time
+
+        window.clear()
+
+        # The wall clock always ticks, but the animation clock only follows it
+        # while unpaused, so resuming continues from the frozen ocean state.
+        tick = time.monotonic()
+        elapsed = tick - state["t_prev"]
+        state["t_prev"] = tick
+        if not state["paused"]:
+            state["t_anim"] += elapsed
+        anim_time = state["t_anim"]
+
+        amplitude, speed, preset_label = PRESETS[state["preset"]]
+
+        # (a) Regenerate the heightmap for the current animation time.
+        update_args = (
+            np.uint64(height_surf.handle),
+            np.int32(GRID),
+            np.int32(GRID),
+            np.float32(anim_time),
+            np.float32(amplitude),
+            np.float32(speed),
+        )
+        launch(stream, configs["update"], kernels["update"], *update_args)
+
+        # (b) Shade the scene: sample the heightmap through the texture and
+        #     write RGBA8 into the OpenGL PBO, which is mapped for the
+        #     duration of this launch only.
+        cam_x, cam_y, cam_z = orbit_camera_position(
+            state["yaw"], state["pitch"], state["distance"]
+        )
+        with resource.map(stream=stream) as buf:
+            launch(
+                stream,
+                configs["render"],
+                kernels["render"],
+                np.uint64(height_tex.handle),
+                buf.handle,
+                np.int32(WIDTH),
+                np.int32(HEIGHT),
+                np.float32(cam_x),
+                np.float32(cam_y),
+                np.float32(cam_z),
+                np.float32(anim_time),
+            )
+        # Leaving the `with` block unmaps the PBO again.
+
+        # (c) PBO -> GL texture (GPU-to-GPU).
+        copy_pbo_to_texture(gl, pbo_id, tex_id, WIDTH, HEIGHT)
+
+        # (d) Draw the texture to the screen.
+        draw_fullscreen_quad(gl, shader_prog, quad_vao, tex_id)
+
+        # Refresh the FPS readout in the window title about once per second.
+        frame_count += 1
+        window_span = tick - fps_time
+        if window_span < 1.0:
+            return
+        fps = frame_count / window_span
+        pause_tag = " [paused]" if state["paused"] else ""
+        window.set_caption(
+            "cuda.core Array/Texture/Surface - Gerstner Ocean"
+            f" [{preset_label}]{pause_tag} ({WIDTH}x{HEIGHT}, {fps:.0f} FPS)"
+        )
+        frame_count = 0
+        fps_time = tick
+
+    # --- Mouse: drag to orbit, scroll to zoom ------------------------------
+    @window.event
+    def on_mouse_press(x, y, button, modifiers):
+        # Record that the left button went down; other buttons change nothing.
+        state["drag"] = state["drag"] or button == pyglet.window.mouse.LEFT
+
+    @window.event
+    def on_mouse_release(x, y, button, modifiers):
+        # Clear the flag when the left button comes up; other buttons change nothing.
+        state["drag"] = state["drag"] and button != pyglet.window.mouse.LEFT
+
+    @window.event
+    def on_mouse_drag(x, y, dx, dy, buttons, modifiers):
+        """Orbit the camera while the left mouse button is held."""
+        if not (buttons & pyglet.window.mouse.LEFT):
+            return
+        # Rotate yaw on horizontal drag, pitch on vertical drag. The yaw
+        # direction matches the camera moving with the cursor.
+        state["yaw"] -= dx * 0.005
+        state["pitch"] -= dy * 0.005
+        # Keep the camera above the water plane and short of the zenith:
+        # render_ocean documents cam_y > 0 as a precondition, and
+        # cam_y = distance * sin(pitch) turns negative as soon as pitch
+        # does. 0.05 rad is a small floor just above the horizon.
+        state["pitch"] = max(0.05, min(PITCH_LIMIT, state["pitch"]))
+
+    @window.event
+    def on_mouse_scroll(x, y, scroll_x, scroll_y):
+        # Each unit of positive scroll_y divides the camera distance by 1.1;
+        # the result is kept inside [ZOOM_MIN, ZOOM_MAX].
+        scaled = state["distance"] * 1.1 ** (-scroll_y)
+        state["distance"] = max(ZOOM_MIN, min(ZOOM_MAX, scaled))
+
+    # --- Keyboard: 1/2/3 weather presets, P pauses, Escape exits ----------
+    @window.event
+    def on_key_press(symbol, modifiers):
+        """Handle Escape (quit), P (pause toggle) and 1/2/3 (weather preset)."""
+        key = pyglet.window.key
+        if symbol == key.ESCAPE:
+            # Quit through on_close so the CUDA cleanup handler runs before
+            # the window (and its GL context) is torn down; window.close()
+            # does not dispatch on_close by itself. Returning True keeps
+            # pyglet's default Escape handling from dispatching it again.
+            window.dispatch_event("on_close")
+            return True
+        if symbol == key.P:
+            state["paused"] = not state["paused"]
+            return None
+        digit_to_preset = {key._1: "1", key._2: "2", key._3: "3"}
+        state["preset"] = digit_to_preset.get(symbol, state["preset"])
+        return None
+
+    @window.event
+    def on_close():
+        # Release CUDA resources in reverse order of creation (views before Array).
+        height_surf.close()
+        height_tex.close()
+        height_arr.close()
+        resource.close()
+        stream.close()
+
+    pyglet.app.run(interval=0)
+
+
+# ======================== GPU code (CUDA + GLSL) ============================
+#
+# KERNEL_SOURCE contains two CUDA C++ kernels:
+#   - update_height: per-heightmap-texel. Sums 12 Gerstner waves and writes
+#                    one float per texel via SurfaceObject.
+#   - render_ocean:  per-screen-pixel. Builds a camera ray, intersects the
+#                    ocean plane (y=0), samples the heightmap via
+#                    TextureObject (LINEAR + WRAP), estimates the normal via
+#                    finite differences, and shades with Phong + Fresnel sky
+#                    reflection. Misses go to a vertical sky gradient.
+#
+# VERTEX_SHADER_SOURCE / FRAGMENT_SHADER_SOURCE are plain GLSL that draws a
+# texture on a fullscreen quad -- nothing CUDA-specific.
+# ============================================================================
+
+KERNEL_SOURCE = r"""
+// ---------------------------------------------------------------------------
+// Wave bank: 12 Gerstner-ish waves with hand-picked parameters.
+//
+// Wavelengths span 0.05 .. 1.0 world units. Amplitudes decrease with
+// frequency so that long swells dominate and short ripples ride on top
+// (a rough Phillips/JONSWAP-style envelope, but coarsely hand-tuned for
+// visual punch rather than physical accuracy).
+//
+// Directions are spread non-uniformly around the unit circle to avoid the
+// streaky-grid look you get from evenly-spaced directions.
+// ---------------------------------------------------------------------------
+__constant__ float c_dirx[12] = {
+    1.000f,  0.866f,  0.500f,  0.000f, -0.500f, -0.866f,
+   -1.000f, -0.940f, -0.500f,  0.174f,  0.643f,  0.940f
+};
+__constant__ float c_dirz[12] = {
+    0.000f,  0.500f,  0.866f,  1.000f,  0.866f,  0.500f,
+    0.000f,  0.342f,  0.866f,  0.985f,  0.766f,  0.342f
+};
+__constant__ float c_wavelen[12] = {
+    1.000f, 0.730f, 0.520f, 0.380f, 0.260f, 0.190f,
+    0.140f, 0.105f, 0.085f, 0.070f, 0.058f, 0.050f
+};
+__constant__ float c_amp[12] = {
+    0.080f, 0.060f, 0.045f, 0.034f, 0.025f, 0.018f,
+    0.013f, 0.010f, 0.0075f, 0.0055f, 0.0040f, 0.0030f
+};
+__constant__ float c_phase[12] = {
+    0.00f, 1.20f, 2.10f, 0.40f, 3.70f, 5.10f,
+    2.65f, 4.85f, 1.55f, 6.05f, 3.20f, 0.95f
+};
+
+// Deep-water dispersion: w = sqrt(g * k), with k = 2*pi / wavelength.
+__device__ __forceinline__ float angular_freq(float wavelength) {
+    const float G = 9.81f;  // g in the formula above
+    float k = 6.2831853f / wavelength;  // wavenumber: 2*pi / wavelength
+    return sqrtf(G * k);
+}
+
+// World extent (in world units) covered by one tile of the heightmap.
+// The texture WRAPs, so the heightmap repeats every tile_extent() world units.
+__device__ __forceinline__ float tile_extent() { return 4.0f; }
+
+// ---------------------------------------------------------------------------
+// Tiny vec3 helpers. Kept inline + __forceinline__ so they stay free.
+// ---------------------------------------------------------------------------
+struct V3 { float x, y, z; };
+
+__device__ __forceinline__ V3 v3(float x, float y, float z) {
+    V3 r; r.x = x; r.y = y; r.z = z; return r;
+}
+__device__ __forceinline__ V3 v_add(V3 a, V3 b) {
+    return v3(a.x + b.x, a.y + b.y, a.z + b.z);
+}
+__device__ __forceinline__ V3 v_sub(V3 a, V3 b) {
+    return v3(a.x - b.x, a.y - b.y, a.z - b.z);
+}
+__device__ __forceinline__ V3 v_scale(V3 a, float s) {
+    return v3(a.x * s, a.y * s, a.z * s);
+}
+__device__ __forceinline__ float v_dot(V3 a, V3 b) {
+    return a.x * b.x + a.y * b.y + a.z * b.z;
+}
+__device__ __forceinline__ V3 v_cross(V3 a, V3 b) {
+    return v3(a.y * b.z - a.z * b.y,
+              a.z * b.x - a.x * b.z,
+              a.x * b.y - a.y * b.x);
+}
+__device__ __forceinline__ V3 v_normalize(V3 a) {
+    float inv = rsqrtf(fmaxf(v_dot(a, a), 1e-20f));  // floor keeps rsqrtf away from 0
+    return v_scale(a, inv);
+}
+
+// ---------------------------------------------------------------------------
+// update_height: each thread computes one heightmap texel.
+//
+// Sums the 12 Gerstner waves at world position (x, z), using the
+// amplitude_scale and speed_scale knobs to switch between weather presets
+// without recompiling the kernel. Writes one float via surf2Dwrite.
+// ---------------------------------------------------------------------------
+extern "C" __global__
+void update_height(cudaSurfaceObject_t surf,
+                   int width, int height,
+                   float t,
+                   float amp_scale, float speed_scale) {
+    int ix = blockIdx.x * blockDim.x + threadIdx.x;
+    int iy = blockIdx.y * blockDim.y + threadIdx.y;
+    if (ix >= width || iy >= height) return;
+
+    // Map the center of texel (ix, iy) to world (x, z) within one tile.
+    float inv_w = 1.0f / (float)width;
+    float inv_h = 1.0f / (float)height;
+    float te = tile_extent();
+    float wx = ((float)ix + 0.5f) * inv_w * te;
+    float wz = ((float)iy + 0.5f) * inv_h * te;
+
+    float h = 0.0f;
+    #pragma unroll
+    for (int i = 0; i < 12; ++i) {
+        float k = 6.2831853f / c_wavelen[i];  // wavenumber: 2*pi / wavelength
+        float w = angular_freq(c_wavelen[i]) * speed_scale;
+        float arg = (c_dirx[i] * wx + c_dirz[i] * wz) * k - w * t + c_phase[i];
+        h += c_amp[i] * sinf(arg);
+    }
+    h *= amp_scale;  // one global amplitude factor applied to the summed height
+
+    // Single-channel float surface: byte offset is x * sizeof(float).
+    surf2Dwrite(h, surf, ix * (int)sizeof(float), iy);
+}
+
+// ---------------------------------------------------------------------------
+// Sample the heightmap at a world position. Texture is normalized + WRAP,
+// so we just divide world coords by tile_extent. WRAP gives us the tiling
+// for free at the horizon.
+// ---------------------------------------------------------------------------
+__device__ __forceinline__ float sample_height(cudaTextureObject_t tex,
+                                               float wx, float wz) {
+    float inv_te = 1.0f / tile_extent();
+    return tex2D<float>(tex, wx * inv_te, wz * inv_te);
+}
+
+// ---------------------------------------------------------------------------
+// Sky gradient: a vertical interpolation from a soft horizon to a deeper
+// overhead blue. `up_angle` is in [-1, 1] (the y component of the ray dir).
+// ---------------------------------------------------------------------------
+__device__ __forceinline__ V3 sky_color(float up_angle) {
+    // Clamp to [0, 1] so straight-down rays still get a horizon color.
+    float a = fmaxf(0.0f, fminf(1.0f, up_angle));
+    // Soft pale-blue horizon
+    V3 horizon = v3(0.70f, 0.82f, 0.92f);
+    // Deeper blue overhead
+    V3 zenith  = v3(0.18f, 0.34f, 0.62f);
+    // Curve so the gradient isn't linear -- horizon stays brighter longer.
+    float t = powf(a, 0.6f);
+    return v_add(v_scale(horizon, 1.0f - t), v_scale(zenith, t));
+}
+
+// ---------------------------------------------------------------------------
+// render_ocean: each thread shades one screen pixel.
+//
+// 1. Reconstruct the camera basis from cam_pos (orbiting origin, world-up).
+// 2. Build a perspective ray through the pixel.
+// 3. Intersect ray with y = 0 plane; if it misses, return sky gradient.
+// 4. Sample heightmap at hit point; finite-difference for the normal.
+// 5. Phong diffuse + specular, blended with Fresnel sky reflection.
+// 6. Write RGBA8 into the OpenGL PBO.
+// ---------------------------------------------------------------------------
+extern "C" __global__
+void render_ocean(cudaTextureObject_t tex,
+                  unsigned char* out,
+                  int w, int h,
+                  float cam_x, float cam_y, float cam_z,
+                  float /*t*/) {
+    int px = blockIdx.x * blockDim.x + threadIdx.x;
+    int py = blockIdx.y * blockDim.y + threadIdx.y;
+    if (px >= w || py >= h) return;
+
+    // ---- Camera basis ----
+    // Forward looks from cam_pos toward origin. World up is +y.
+    // cam_y > 0 guarantees forward.y < 0 and the cross product with world-up
+    // is well-defined (the pitch is clamped on the host side).
+    V3 cam_pos = v3(cam_x, cam_y, cam_z);
+    V3 forward = v_normalize(v_sub(v3(0.0f, 0.0f, 0.0f), cam_pos));
+    V3 world_up = v3(0.0f, 1.0f, 0.0f);
+    V3 right = v_normalize(v_cross(forward, world_up));
+    V3 cam_up = v_cross(right, forward);
+
+    // ---- Pixel ray (perspective) ----
+    float aspect = (float)w / (float)h;
+    float fov = 1.0472f;                 // 60 degrees vertical FoV
+    float scale = tanf(fov * 0.5f);
+    float ndc_x = (2.0f * ((float)px + 0.5f) / (float)w - 1.0f) * aspect * scale;
+    float ndc_y = (1.0f - 2.0f * ((float)py + 0.5f) / (float)h) * scale;
+    V3 dir = v_normalize(v_add(v_add(forward,
+                                     v_scale(right, ndc_x)),
+                               v_scale(cam_up, ndc_y)));
+
+    // ---- Background sky if the ray misses the ocean plane ----
+    // The ocean is the y=0 plane; we only count hits with rays going downward
+    // (dir.y < 0). Anything else is sky. A small eps avoids near-horizontal
+    // rays producing absurd hit distances.
+    V3 col;
+    const float HIT_EPS = 1e-3f;
+    if (dir.y > -HIT_EPS) {
+        col = sky_color(dir.y);
+    } else {
+        // ---- Hit the ocean plane ----
+        float t_hit = -cam_y / dir.y;
+        if (t_hit <= 0.0f) {
+            // Camera under the surface -- treat as sky to avoid garbage.
+            col = sky_color(dir.y);
+        } else {
+            V3 p = v_add(cam_pos, v_scale(dir, t_hit));
+
+            // ---- Sample heightmap; estimate normal via finite differences ----
+            // The heightmap tiles every tile_extent() world units (WRAP), so
+            // we use a small world-space epsilon. Four taps -> central
+            // differences in x and z.
+            const float FD = 0.01f;
+            float h_c = sample_height(tex, p.x,       p.z);
+            float h_xp = sample_height(tex, p.x + FD, p.z);
+            float h_xm = sample_height(tex, p.x - FD, p.z);
+            float h_zp = sample_height(tex, p.x,      p.z + FD);
+            float h_zm = sample_height(tex, p.x,      p.z - FD);
+            float dh_dx = (h_xp - h_xm) / (2.0f * FD);
+            float dh_dz = (h_zp - h_zm) / (2.0f * FD);
+            // Normal of the surface y = h(x, z) is (-dh/dx, 1, -dh/dz).
+            V3 N = v_normalize(v3(-dh_dx, 1.0f, -dh_dz));
+
+            // ---- Lighting ----
+            V3 L = v_normalize(v3(0.55f, 0.65f, 0.35f));   // sun: high+side
+            V3 V = v_normalize(v_sub(cam_pos, p));         // view direction
+            // Reflect L about N: R = 2*(N.L)*N - L
+            float ndotl = fmaxf(0.0f, v_dot(N, L));
+            V3 R = v_normalize(v_sub(v_scale(N, 2.0f * v_dot(N, L)), L));
+
+            // Phong specular highlight on wave crests.
+            float spec = powf(fmaxf(0.0f, v_dot(R, V)), 32.0f);
+
+            // Diffuse: deep-sea blue-green.
+            V3 deep = v3(0.04f, 0.18f, 0.28f);
+            V3 shallow = v3(0.10f, 0.32f, 0.42f);
+            // Tiny height-based shading bias so crests look slightly brighter.
+            float tint = 0.5f + 0.5f * fmaxf(-1.0f, fminf(1.0f, h_c * 6.0f));
+            V3 base = v_add(v_scale(deep, 1.0f - tint),
+                            v_scale(shallow, tint));
+
+            // Diffuse term + ambient.
+            V3 diffuse = v_add(v_scale(base, 0.18f),
+                               v_scale(base, 0.82f * ndotl));
+
+            // Fresnel-modulated sky reflection. Sample the sky in the
+            // reflected-view direction so reflections of overhead show
+            // overhead colors, etc. View reflection: Rv = 2*(N.V)*N - V.
+            float ndotv = fmaxf(0.0f, v_dot(N, V));
+            V3 Rv = v_normalize(v_sub(v_scale(N, 2.0f * v_dot(N, V)), V));
+            V3 reflected_sky = sky_color(fmaxf(0.0f, Rv.y));
+            float F = powf(1.0f - ndotv, 5.0f);
+            // Clamp Fresnel to [0, 1]; note that these comparisons do not catch NaN.
+            if (F < 0.0f) F = 0.0f;
+            if (F > 1.0f) F = 1.0f;
+
+            // Blend: more reflection at grazing angles.
+            V3 lit = v_add(v_scale(diffuse, 1.0f - F),
+                           v_scale(reflected_sky, F));
+
+            // Add specular highlight (sun color).
+            V3 sun_col = v3(1.0f, 0.96f, 0.85f);
+            col = v_add(lit, v_scale(sun_col, spec));
+        }
+    }
+
+    // ---- Tonemap + write ----
+    // Simple Reinhard-ish curve keeps highlights in [0, 1].
+    col.x = col.x / (1.0f + col.x);
+    col.y = col.y / (1.0f + col.y);
+    col.z = col.z / (1.0f + col.z);
+
+    int idx = (py * w + px) * 4;
+    out[idx + 0] = (unsigned char)(fmaxf(0.0f, fminf(1.0f, col.x)) * 255.0f);
+    out[idx + 1] = (unsigned char)(fmaxf(0.0f, fminf(1.0f, col.y)) * 255.0f);
+    out[idx + 2] = (unsigned char)(fmaxf(0.0f, fminf(1.0f, col.z)) * 255.0f);
+    out[idx + 3] = 255;
+}
+"""
+
+# GLSL shaders -- these just display a texture on a fullscreen rectangle.
+# Nothing CUDA-specific here.
+
+VERTEX_SHADER_SOURCE = """#version 330 core
+in vec2 position;
+in vec2 texcoord;
+out vec2 v_texcoord;
+void main() {
+    gl_Position = vec4(position, 0.0, 1.0);
+    v_texcoord = texcoord;
+}
+"""
+
+FRAGMENT_SHADER_SOURCE = """#version 330 core
+in vec2 v_texcoord;
+out vec4 fragColor;
+uniform sampler2D tex;
+void main() {
+    fragColor = texture(tex, v_texcoord);
+}
+"""
+
+
+if __name__ == "__main__":
+    main()
diff --git a/cuda_core/examples/gl_interop_reaction_diffusion.py b/cuda_core/examples/gl_interop_reaction_diffusion.py
new file mode 100644
index 00000000000..b30603721a1
--- /dev/null
+++ b/cuda_core/examples/gl_interop_reaction_diffusion.py
@@ -0,0 +1,727 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# ################################################################################
+#
+# This example demonstrates cuda.core.Array, TextureObject, and SurfaceObject
+# in combination with GraphicsResource for CUDA/OpenGL interop. A Gray-Scott
+# reaction-diffusion simulation is ping-ponged between two CUDA arrays each
+# frame: a TextureObject provides smooth (LINEAR + WRAP) sampled reads, and a
+# SurfaceObject provides typed writes. The final state is colorized straight
+# into an OpenGL PBO. Requires pyglet.
+#
+# ################################################################################
+
+# What this example teaches
+# =========================
+# - How to allocate a CUDA Array with `surface_load_store=True` so the same
+#   memory can be bound as both a TextureObject (for sampled reads) and a
+#   SurfaceObject (for typed writes).
+# - How to use FilterMode.LINEAR + AddressMode.WRAP + normalized coordinates
+#   to get free hardware bilinear interpolation on a toroidal world.
+# - How to compose Array/TextureObject/SurfaceObject with GraphicsResource so
+#   the entire simulation never leaves the GPU.
+#
+# How it works
+# ============
+# Gray-Scott is a two-species (U, V) reaction-diffusion system. At each cell
+# the rule is roughly:
+#
+#     du/dt = Du * laplacian(u) - u*v*v + F*(1 - u)
+#     dv/dt = Dv * laplacian(v) + u*v*v - (F + k)*v
+#
+# Different choices of F and k yield strikingly different patterns: coral,
+# mitosis, spots, and many more. We pack (U, V) into the two channels of a
+# `float2` Array.
+#
+#   PING-PONG (two arrays, swap each step)
+#   ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#   +--------------+   tex2D<float2>   +--------------+
+#   |   arr_a      | ----------------> |              |
+#   | (U, V) state |                   |  gray_scott  |
+#   +--------------+                   |    kernel    |
+#                                      |              |
+#   +--------------+   surf2Dwrite     |              |
+#   |   arr_b      | <---------------- |              |
+#   | (U, V) state |                   +--------------+
+#   +--------------+
+#       (swap)
+#
+# Each frame we do N_STEPS iterations of the kernel above, then run a separate
+# `colorize` kernel that samples V from the final state and writes RGBA bytes
+# straight into the OpenGL PBO via GraphicsResource. No data ever travels
+# across the PCIe bus during the frame.
+#
+# Why LINEAR + WRAP + normalized coords?
+# --------------------------------------
+# Addressing modes WRAP and MIRROR are only supported with normalized
+# coordinates (see the CUDA Programming Guide and the SDK's
+# simplePitchLinearTexture sample). We use WRAP so that neighbor lookups at
+# the image edge automatically wrap around -- i.e. a torus. LINEAR filtering
+# is essentially free on the hardware, but it does not smooth anything here:
+# we sample at texel centers `(x + 0.5) / W` and offset by exactly one texel,
+# so every tap lands on a texel center and returns that texel unblended.
+#
+# Channel byte width in surf2Dwrite
+# ---------------------------------
+# `surf2Dwrite` takes the x coordinate in BYTES, not in elements. For a
+# `float2` surface that means `x * sizeof(float2)` = `x * 8`. Getting this
+# wrong silently writes to the wrong texels.
+#
+# What you should see
+# ===================
+# A window showing animated, organic-looking patterns growing and dividing
+# (think coral, spots, or mitosing cells). Press 1/2/3 to switch presets,
+# R to reseed, and Escape to exit. The window title shows the current FPS
+# and active preset.
+#
+
+# /// script
+# dependencies = ["cuda_bindings", "cuda_core>0.6.0", "pyglet"]
+# ///
+
+import ctypes
+import sys
+import time
+
+import numpy as np
+
+from cuda.core import (
+    AddressMode,
+    Array,
+    ArrayFormat,
+    Device,
+    FilterMode,
+    GraphicsResource,
+    LaunchConfig,
+    Program,
+    ProgramOptions,
+    ReadMode,
+    ResourceDescriptor,
+    SurfaceObject,
+    TextureDescriptor,
+    TextureObject,
+    launch,
+)
+
+# ---------------------------------------------------------------------------
+# Simulation parameters (feel free to change these)
+# ---------------------------------------------------------------------------
+WIDTH = 512
+HEIGHT = 512
+N_STEPS = 8  # Gray-Scott iterations per displayed frame
+DU = 0.16  # diffusion rate for U
+DV = 0.08  # diffusion rate for V
+DT = 1.0  # time step (Gray-Scott is stable at 1.0 with these D's)
+
+# Named presets: (F, k, label) tuples. F is the feed rate, k is the kill rate.
+# These are classic Gray-Scott regimes documented all over the literature.
+PRESETS = {
+    "1": (0.0545, 0.062, "coral"),
+    "2": (0.0367, 0.0649, "mitosis"),
+    "3": (0.030, 0.062, "spots"),
+}
+DEFAULT_PRESET = "1"
+
+
+# ============================= Helper functions =============================
+#
+# The functions below set up CUDA and OpenGL. If you're here to learn about
+# Array/TextureObject/SurfaceObject, skip ahead to main() -- the interesting
+# part is there. These helpers exist so that main() reads like a short story
+# instead of a wall of boilerplate.
+# ============================================================================
+
+
+def setup_cuda():
+    """Compile the CUDA kernels and return (device, stream, kernels, configs).
+
+    Returns a dict of kernels keyed by name and matching LaunchConfigs.
+    """
+    dev = Device(0)
+    dev.set_current()
+
+    # SurfaceObject requires surface load/store, which has existed since SM 2.0,
+    # but bindless surface objects (cuSurfObjectCreate) require SM 3.0+.
+    cc = dev.compute_capability
+    if cc.major < 3:
+        print(
+            "This example requires a GPU with compute capability >= 3.0 for "
+            f"bindless surface objects. Found sm_{cc.major}{cc.minor}.",
+            file=sys.stderr,
+        )
+        sys.exit(1)
+
+    stream = dev.create_stream()
+
+    # Compile as C++ so the templated tex2D<float2> overload resolves.
+    program_options = ProgramOptions(std="c++17", arch=f"sm_{dev.arch}")
+    prog = Program(KERNEL_SOURCE, code_type="c++", options=program_options)
+    mod = prog.compile(
+        "cubin",
+        name_expressions=("gray_scott_step", "colorize", "seed_initial"),
+    )
+
+    kernels = {
+        "step": mod.get_kernel("gray_scott_step"),
+        "colorize": mod.get_kernel("colorize"),
+        "seed": mod.get_kernel("seed_initial"),
+    }
+
+    block = (16, 16, 1)
+    grid = (
+        (WIDTH + block[0] - 1) // block[0],
+        (HEIGHT + block[1] - 1) // block[1],
+        1,
+    )
+    config = LaunchConfig(grid=grid, block=block)
+    # All three kernels are pixel-parallel over a WIDTH x HEIGHT grid, so they
+    # can share a launch config.
+    configs = {"step": config, "colorize": config, "seed": config}
+
+    return dev, stream, kernels, configs
+
+
+def create_window():
+    """Open a pyglet window and return (window, gl_module, pyglet)."""
+    try:
+        import pyglet
+        from pyglet.gl import gl as _gl
+    except ImportError:
+        print(
+            "This example requires pyglet >= 2.0.\nInstall it with:  pip install pyglet",
+            file=sys.stderr,
+        )
+        sys.exit(1)
+
+    window = pyglet.window.Window(
+        WIDTH,
+        HEIGHT,
+        caption="cuda.core Array/Texture/Surface - Gray-Scott Reaction Diffusion",
+        vsync=False,
+    )
+    return window, _gl, pyglet
+
+
+def create_display_resources(gl, width, height):
+    """Create the GL objects needed to show a texture on screen.
+
+    This sets up a shader program, a fullscreen quad, and an empty texture.
+    None of this is CUDA-specific -- it's standard OpenGL boilerplate for
+    rendering a textured quad.
+
+    Returns (shader_program, vertex_array_id, texture_id). The shader_program
+    is a pyglet ShaderProgram object (must be kept alive).
+    """
+    from pyglet.graphics.shader import Shader, ShaderProgram
+
+    # Shader program -- just passes texture coordinates through
+    vert = Shader(VERTEX_SHADER_SOURCE, "vertex")
+    frag = Shader(FRAGMENT_SHADER_SOURCE, "fragment")
+    shader_prog = ShaderProgram(vert, frag)
+
+    # Fullscreen quad (two triangles covering the entire window)
+    quad_verts = np.array(
+        [
+            # x,  y,    s, t      (position + texture coordinate)
+            -1,
+            -1,
+            0,
+            0,
+            1,
+            -1,
+            1,
+            0,
+            1,
+            1,
+            1,
+            1,
+            -1,
+            -1,
+            0,
+            0,
+            1,
+            1,
+            1,
+            1,
+            -1,
+            1,
+            0,
+            1,
+        ],
+        dtype=np.float32,
+    )
+
+    vao = ctypes.c_uint(0)
+    gl.glGenVertexArrays(1, ctypes.byref(vao))
+    gl.glBindVertexArray(vao.value)
+
+    vbo = ctypes.c_uint(0)
+    gl.glGenBuffers(1, ctypes.byref(vbo))
+    gl.glBindBuffer(gl.GL_ARRAY_BUFFER, vbo.value)
+    gl.glBufferData(
+        gl.GL_ARRAY_BUFFER,
+        quad_verts.nbytes,
+        quad_verts.ctypes.data_as(ctypes.c_void_p),
+        gl.GL_STATIC_DRAW,
+    )
+
+    stride = 4 * 4  # 4 floats * 4 bytes each = 16 bytes per vertex
+    pos_loc = gl.glGetAttribLocation(shader_prog.id, b"position")
+    gl.glEnableVertexAttribArray(pos_loc)
+    gl.glVertexAttribPointer(pos_loc, 2, gl.GL_FLOAT, gl.GL_FALSE, stride, ctypes.c_void_p(0))
+
+    tc_loc = gl.glGetAttribLocation(shader_prog.id, b"texcoord")
+    gl.glEnableVertexAttribArray(tc_loc)
+    gl.glVertexAttribPointer(tc_loc, 2, gl.GL_FLOAT, gl.GL_FALSE, stride, ctypes.c_void_p(8))
+
+    gl.glBindVertexArray(0)
+
+    # Empty texture (will be filled each frame from the PBO)
+    tex = ctypes.c_uint(0)
+    gl.glGenTextures(1, ctypes.byref(tex))
+    gl.glBindTexture(gl.GL_TEXTURE_2D, tex.value)
+    gl.glTexParameteri(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_MIN_FILTER, gl.GL_LINEAR)
+    gl.glTexParameteri(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_MAG_FILTER, gl.GL_LINEAR)
+    gl.glTexImage2D(
+        gl.GL_TEXTURE_2D,
+        0,
+        gl.GL_RGBA8,
+        width,
+        height,
+        0,
+        gl.GL_RGBA,
+        gl.GL_UNSIGNED_BYTE,
+        None,
+    )
+
+    return shader_prog, vao.value, tex.value
+
+
+def create_pixel_buffer(gl, width, height):
+    """Create a Pixel Buffer Object (PBO) -- the bridge between CUDA and OpenGL.
+
+    A PBO is a GPU-side buffer that OpenGL can read from when uploading pixels
+    to a texture. By registering this same buffer with CUDA, the CUDA kernel
+    can write directly into it.
+
+    Returns (pbo_gl_name, size_in_bytes).
+    """
+    pbo = ctypes.c_uint(0)
+    gl.glGenBuffers(1, ctypes.byref(pbo))
+    gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, pbo.value)
+    nbytes = width * height * 4  # RGBA, 1 byte per channel
+    gl.glBufferData(gl.GL_PIXEL_UNPACK_BUFFER, nbytes, None, gl.GL_DYNAMIC_DRAW)
+    gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, 0)
+    return pbo.value, nbytes
+
+
+def copy_pbo_to_texture(gl, pbo_id, tex_id, width, height):
+    """Copy pixel data from the PBO into the GL texture (GPU-to-GPU)."""
+    gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, pbo_id)
+    gl.glBindTexture(gl.GL_TEXTURE_2D, tex_id)
+    gl.glTexSubImage2D(
+        gl.GL_TEXTURE_2D,
+        0,
+        0,
+        0,
+        width,
+        height,
+        gl.GL_RGBA,
+        gl.GL_UNSIGNED_BYTE,
+        None,  # None = read from the currently bound PBO, not from CPU
+    )
+    gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, 0)
+
+
+def draw_fullscreen_quad(gl, shader_prog, vao_id, tex_id):
+    """Draw the texture to the screen using the fullscreen quad."""
+    gl.glUseProgram(shader_prog.id)
+    gl.glBindTexture(gl.GL_TEXTURE_2D, tex_id)
+    gl.glBindVertexArray(vao_id)
+    gl.glDrawArrays(gl.GL_TRIANGLES, 0, 6)
+    gl.glBindVertexArray(0)
+    gl.glUseProgram(0)
+
+
+def make_state_arrays():
+    """Allocate the two `float2` ping-pong arrays that hold the (U, V) state."""
+    arr_a = Array.from_descriptor(
+        shape=(WIDTH, HEIGHT),
+        format=ArrayFormat.FLOAT32,
+        num_channels=2,
+        surface_load_store=True,
+    )
+    arr_b = Array.from_descriptor(
+        shape=(WIDTH, HEIGHT),
+        format=ArrayFormat.FLOAT32,
+        num_channels=2,
+        surface_load_store=True,
+    )
+    return arr_a, arr_b
+
+
+def make_texture(arr):
+    """Bind `arr` as a TextureObject configured for LINEAR + WRAP + normalized."""
+    res_desc = ResourceDescriptor.from_array(arr)
+    tex_desc = TextureDescriptor(
+        address_mode=AddressMode.WRAP,
+        filter_mode=FilterMode.LINEAR,
+        read_mode=ReadMode.ELEMENT_TYPE,
+        # WRAP/MIRROR addressing modes require normalized coordinates.
+        normalized_coords=True,
+    )
+    return TextureObject.from_descriptor(resource=res_desc, texture_descriptor=tex_desc)
+
+
+def seed_state(stream, kernels, configs, write_surf, seed_value):
+    """Re-initialize the array behind `write_surf` with the Gray-Scott starting state.
+
+    Takes a long-lived SurfaceObject (not a fresh one): `launch` is async, so
+    creating a SurfaceObject inside a `with` block that closes immediately
+    after `launch` returns would destroy the surface handle before the kernel
+    actually runs against it.
+    """
+    launch(
+        stream,
+        configs["seed"],
+        kernels["seed"],
+        np.uint64(write_surf.handle),
+        np.int32(WIDTH),
+        np.int32(HEIGHT),
+        np.uint32(seed_value),
+    )
+
+
+# ================================== main() ==================================
+
+
+def main():
+    # --- Step 1: Set up CUDA (compile kernels, create stream) ---
+    dev, stream, kernels, configs = setup_cuda()
+
+    # --- Step 2: Open a window ---
+    window, gl, pyglet = create_window()
+
+    # --- Step 3: Create GL resources for drawing a texture to screen ---
+    #     (Standard OpenGL boilerplate -- not CUDA-specific.)
+    shader_prog, quad_vao, tex_id = create_display_resources(gl, WIDTH, HEIGHT)
+
+    # --- Step 4: Create the Pixel Buffer Object (PBO) ---
+    #     The PBO is GPU memory owned by OpenGL. It's the bridge between the
+    #     two worlds: CUDA writes into it, OpenGL reads from it.
+    pbo_id, _ = create_pixel_buffer(gl, WIDTH, HEIGHT)
+
+    # --- Step 5: Register the PBO with CUDA ---
+    resource = GraphicsResource.from_gl_buffer(pbo_id, flags="write_discard")
+
+    # --- Step 6: Allocate the two ping-pong state Arrays ---
+    #     Both are `float2` (channel 0 = U, channel 1 = V) with
+    #     surface_load_store=True so they can be bound as SurfaceObjects.
+    arr_a, arr_b = make_state_arrays()
+
+    # --- Step 7: Pre-create the four bindless handles ---
+    #     Doing this once is much cheaper than recreating them
+    #     every step. We keep both texture and surface handles for each
+    #     array; the simulation loop just picks which pair to use.
+    tex_a = make_texture(arr_a)
+    tex_b = make_texture(arr_b)
+    surf_a = SurfaceObject.from_array(arr_a)
+    surf_b = SurfaceObject.from_array(arr_b)
+
+    # --- Step 8: Seed the initial state into arr_a (writes via surf_a) ---
+    seed_state(stream, kernels, configs, surf_a, seed_value=0)
+    # After seeding, `arr_a` is the "current" state.
+    state = {"current": "a", "preset": DEFAULT_PRESET, "seed": 0}
+
+    # --- Step 9: Render loop ---
+    start_time = time.monotonic()
+    frame_count = 0
+    fps_time = start_time
+
+    def current_read_write():
+        if state["current"] == "a":
+            return tex_a, surf_b, "b"  # read a, write b, next current = b
+        return tex_b, surf_a, "a"
+
+    @window.event
+    def on_key_press(symbol, _modifiers):
+        key = pyglet.window.key
+        if symbol == key.ESCAPE:
+            window.close()
+            return
+        if symbol == key.R:
+            state["seed"] += 1
+            seed_state(stream, kernels, configs, surf_a, seed_value=state["seed"])
+            state["current"] = "a"
+            return
+        for digit_key, name in (
+            (key._1, "1"),
+            (key._2, "2"),
+            (key._3, "3"),
+        ):
+            if symbol == digit_key:
+                state["preset"] = name
+                return
+
+    @window.event
+    def on_draw():
+        nonlocal frame_count, fps_time
+
+        window.clear()
+        F, k, _label = PRESETS[state["preset"]]
+
+        # (a) Run N_STEPS Gray-Scott iterations. Each step reads from one
+        #     array via a TextureObject (LINEAR + WRAP gives wrapping +
+        #     bilinear sampling) and writes to the other via a SurfaceObject.
+        for _ in range(N_STEPS):
+            tex_read, surf_write, next_current = current_read_write()
+            launch(
+                stream,
+                configs["step"],
+                kernels["step"],
+                np.uint64(tex_read.handle),
+                np.uint64(surf_write.handle),
+                np.int32(WIDTH),
+                np.int32(HEIGHT),
+                np.float32(DU),
+                np.float32(DV),
+                np.float32(F),
+                np.float32(k),
+                np.float32(DT),
+            )
+            state["current"] = next_current
+
+        # (b) Colorize the latest state into the OpenGL PBO.
+        tex_read = tex_a if state["current"] == "a" else tex_b
+        with resource.map(stream=stream) as buf:
+            launch(
+                stream,
+                configs["colorize"],
+                kernels["colorize"],
+                np.uint64(tex_read.handle),
+                buf.handle,
+                np.int32(WIDTH),
+                np.int32(HEIGHT),
+            )
+        # Unmap happens automatically when the `with` block exits.
+
+        # (c) Tell OpenGL to copy the PBO contents into our texture.
+        copy_pbo_to_texture(gl, pbo_id, tex_id, WIDTH, HEIGHT)
+
+        # (d) Draw the texture to the screen.
+        draw_fullscreen_quad(gl, shader_prog, quad_vao, tex_id)
+
+        # FPS counter (shown in window title)
+        frame_count += 1
+        now = time.monotonic()
+        if now - fps_time >= 1.0:
+            fps = frame_count / (now - fps_time)
+            label = PRESETS[state["preset"]][2]
+            window.set_caption(
+                "cuda.core Array/Texture/Surface - Gray-Scott"
+                f" [{label}] ({WIDTH}x{HEIGHT}, {fps:.0f} FPS,"
+                f" {N_STEPS} steps/frame)"
+            )
+            frame_count = 0
+            fps_time = now
+
+    @window.event
+    def on_close():
+        # `launch` is async: drain the stream before destroying handles that
+        # queued kernels may still use, then release everything explicitly.
+        stream.sync()
+        resource.close()
+        tex_a.close()
+        tex_b.close()
+        surf_a.close()
+        surf_b.close()
+        arr_a.close()
+        arr_b.close()
+        stream.close()
+
+    pyglet.app.run(interval=0)
+
+
+# ======================== GPU code (CUDA + GLSL) ============================
+#
+# These source strings are kept at the bottom of the file so they don't
+# distract from the Python logic above. The important things to know:
+#
+#   - KERNEL_SOURCE contains three CUDA C++ kernels:
+#       * seed_initial   -- sets initial (U, V) state via SurfaceObject writes
+#       * gray_scott_step -- reads previous state via TextureObject (with
+#                            LINEAR + WRAP bilinear filtering) and writes the
+#                            next state via SurfaceObject. Coordinates are
+#                            normalized to [0, 1] because WRAP requires it.
+#       * colorize       -- reads the V channel via TextureObject and writes
+#                            RGBA bytes into the OpenGL PBO using a simple
+#                            three-stop "magma-ish" gradient.
+#
+#   - VERTEX_SHADER_SOURCE / FRAGMENT_SHADER_SOURCE are GLSL. They draw a
+#     texture onto a rectangle covering the entire window. Nothing interesting.
+#
+# ============================================================================
+
+KERNEL_SOURCE = r"""
+// The host passes the texture dimensions as ints; kernels that sample the
+// texture derive the inverse width/height themselves to convert integer
+// pixel coordinates to normalized texture coordinates.
+
+extern "C"
+__global__
+void seed_initial(cudaSurfaceObject_t surf,
+                  int width, int height,
+                  unsigned int seed) {
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    if (x >= width || y >= height) return;
+
+    // U = 1, V = 0 outside a 40x40 centered square; U = 0.5, V = 1 inside it.
+    // V also gets a small deterministic jitter that changes with each reseed.
+    float u = 1.0f;
+    float v = 0.0f;
+
+    int half_w = width / 2;
+    int half_h = height / 2;
+    if (x >= half_w - 20 && x < half_w + 20 &&
+        y >= half_h - 20 && y < half_h + 20) {
+        v = 1.0f;
+        // Knock U down a bit inside the seed square so V can grow.
+        u = 0.5f;
+    }
+
+    // Cheap deterministic pseudo-random noise (integer hash of x, y and seed).
+    unsigned int h = (unsigned int)x * 374761393u +
+                     (unsigned int)y * 668265263u + seed * 2246822519u;
+    h = (h ^ (h >> 13)) * 1274126177u;
+    h = h ^ (h >> 16);
+    float noise = (h & 0xffffu) / 65535.0f;   // in [0, 1]
+    v += 0.02f * (noise - 0.5f);              // small +/- jitter
+    if (v < 0.0f) v = 0.0f;
+    if (v > 1.0f) v = 1.0f;
+
+    // float2 is 8 bytes; surf2Dwrite takes the x offset in BYTES.
+    surf2Dwrite(make_float2(u, v), surf, x * (int)sizeof(float2), y);
+}
+
+extern "C"
+__global__
+void gray_scott_step(cudaTextureObject_t tex,
+                     cudaSurfaceObject_t surf,
+                     int width, int height,
+                     float Du, float Dv,
+                     float F, float k_kill,
+                     float dt) {
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    if (x >= width || y >= height) return;
+
+    // Normalized coordinates: WRAP addressing only works in normalized mode.
+    // Each texel center sits at ((i + 0.5) / W, (j + 0.5) / H).
+    float inv_w = 1.0f / (float)width;
+    float inv_h = 1.0f / (float)height;
+    float cx = (x + 0.5f) * inv_w;
+    float cy = (y + 0.5f) * inv_h;
+
+    // 5-point Laplacian stencil. LINEAR filtering does nothing extra here
+    // because the offsets land exactly on neighboring texel centers, but the
+    // toroidal WRAP at the boundary is essential for a periodic world.
+    float2 c = tex2D<float2>(tex, cx, cy);
+    float2 l = tex2D<float2>(tex, cx - inv_w, cy);
+    float2 r = tex2D<float2>(tex, cx + inv_w, cy);
+    float2 u_n = tex2D<float2>(tex, cx, cy - inv_h);
+    float2 d_n = tex2D<float2>(tex, cx, cy + inv_h);
+
+    float lap_u = (l.x + r.x + u_n.x + d_n.x) - 4.0f * c.x;
+    float lap_v = (l.y + r.y + u_n.y + d_n.y) - 4.0f * c.y;
+
+    float u = c.x;
+    float v = c.y;
+    float uvv = u * v * v;
+
+    float du = Du * lap_u - uvv + F * (1.0f - u);
+    float dv = Dv * lap_v + uvv - (F + k_kill) * v;
+
+    float new_u = u + dt * du;
+    float new_v = v + dt * dv;
+
+    // Clamp to keep things numerically sane after long runs.
+    if (new_u < 0.0f) new_u = 0.0f;
+    if (new_u > 1.0f) new_u = 1.0f;
+    if (new_v < 0.0f) new_v = 0.0f;
+    if (new_v > 1.0f) new_v = 1.0f;
+
+    surf2Dwrite(make_float2(new_u, new_v), surf,
+                x * (int)sizeof(float2), y);
+}
+
+extern "C"
+__global__
+void colorize(cudaTextureObject_t tex,
+              unsigned char* output,
+              int width, int height) {
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    if (x >= width || y >= height) return;
+
+    float inv_w = 1.0f / (float)width;
+    float inv_h = 1.0f / (float)height;
+    float cx = (x + 0.5f) * inv_w;
+    float cy = (y + 0.5f) * inv_h;
+
+    float2 c = tex2D<float2>(tex, cx, cy);
+    float v = c.y;
+    if (v < 0.0f) v = 0.0f;
+    if (v > 1.0f) v = 1.0f;
+
+    // Three-stop "magma-ish" gradient: dark purple -> orange -> pale yellow.
+    // Implemented as two linear interpolations stitched together at v = 0.5
+    // so the result is reasonably perceptually smooth without a lookup table.
+    float r, g, b;
+    if (v < 0.5f) {
+        float t = v * 2.0f;                  // [0, 1] over the low half
+        r = 0.05f + t * (0.85f - 0.05f);
+        g = 0.02f + t * (0.30f - 0.02f);
+        b = 0.20f + t * (0.10f - 0.20f);
+    } else {
+        float t = (v - 0.5f) * 2.0f;         // [0, 1] over the high half
+        r = 0.85f + t * (1.00f - 0.85f);
+        g = 0.30f + t * (0.95f - 0.30f);
+        b = 0.10f + t * (0.70f - 0.10f);
+    }
+
+    int idx = (y * width + x) * 4;
+    output[idx + 0] = (unsigned char)(r * 255.0f);
+    output[idx + 1] = (unsigned char)(g * 255.0f);
+    output[idx + 2] = (unsigned char)(b * 255.0f);
+    output[idx + 3] = 255;
+}
+"""
+
+# GLSL shaders -- these just display a texture on a fullscreen rectangle.
+# Nothing CUDA-specific here.
+
+VERTEX_SHADER_SOURCE = """#version 330 core
+in vec2 position;
+in vec2 texcoord;
+out vec2 v_texcoord;
+void main() {
+    gl_Position = vec4(position, 0.0, 1.0);
+    v_texcoord = texcoord;
+}
+"""
+
+FRAGMENT_SHADER_SOURCE = """#version 330 core
+in vec2 v_texcoord;
+out vec4 fragColor;
+uniform sampler2D tex;
+void main() {
+    fragColor = texture(tex, v_texcoord);
+}
+"""
+
+
+if __name__ == "__main__":
+    main()
diff --git a/cuda_core/examples/gl_interop_sdf_volume.py b/cuda_core/examples/gl_interop_sdf_volume.py
new file mode 100644
index 00000000000..05299cc278f
--- /dev/null
+++ b/cuda_core/examples/gl_interop_sdf_volume.py
@@ -0,0 +1,827 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# ################################################################################
+#
+# This example demonstrates cuda.core's 3D Array + trilinear TextureObject by
+# baking a procedural Signed Distance Field (SDF) volume once at startup and
+# then ray-marching it every frame to render an orbitable 3D scene. The
+# SurfaceObject is used during the one-shot bake; the TextureObject (with
+# LINEAR + CLAMP + normalized coords) drives the per-frame ray march. The
+# whole pipeline stays on the GPU through GraphicsResource. Requires pyglet.
+#
+# ################################################################################
+
+# What this example teaches
+# =========================
+# - How to allocate a 3D cuda.core.Array (cuArray3DCreate under the hood) and
+#   bind it as both a SurfaceObject (for one-shot kernel writes) and a
+#   TextureObject (for hardware-accelerated trilinear sampling).
+# - How to ray-march a baked SDF volume from a CUDA kernel, sampling via
+#   tex3D<float> and writing pixels straight into an OpenGL PBO.
+# - How to wire mouse + keyboard input into a pyglet/cuda.core interop loop.
+#
+# How it works
+# ============
+# The signed distance field of a "gyroid intersected with a sphere" is baked
+# once into a 128 x 128 x 128 single-channel float volume:
+#
+#     gyroid(p)   = sin(p.x*tau)cos(p.y*tau)
+#                 + sin(p.y*tau)cos(p.z*tau)
+#                 + sin(p.z*tau)cos(p.x*tau)
+#     sdf_gyroid  = |gyroid(p)| - 0.20         # slab around the gyroid surface
+#     sdf_sphere  = length(p) - 0.9            # bounding sphere
+#     sdf(p)      = max(sdf_gyroid, sdf_sphere) # CSG intersection
+#
+# where p in [-1, 1]^3 is the voxel's world-space position.
+#
+# Each frame, the render kernel emits one ray per pixel from an orbiting
+# camera and marches the volume in fixed half-voxel steps (at most 256). On a
+# hit it computes a normal by central differences of tex3D and applies a
+# simple diffuse + ambient + specular shade. Misses fall back to a vertical
+# sky gradient.
+#
+#   STARTUP (one-shot bake)
+#   ~~~~~~~~~~~~~~~~~~~~~~~
+#   1. Allocate 3D Array (128^3, FLOAT32 x1, surface_load_store=True).
+#   2. Bind it as a SurfaceObject.
+#   3. Launch `bake_sdf`: one thread per voxel writes the SDF via surf3Dwrite.
+#   4. Close the SurfaceObject; the Array stays alive.
+#
+#   EACH FRAME
+#   ~~~~~~~~~~
+#   1. resource.map() -> CUDA device pointer into the OpenGL PBO.
+#   2. Launch `render_sdf` (one thread per pixel). It samples the SDF via the
+#      long-lived TextureObject (LINEAR + CLAMP + normalized coords) using
+#      tex3D<float>. RGBA8 lands directly in the PBO.
+#   3. Unmap, GPU-side copy PBO -> texture, draw fullscreen quad.
+#
+# Controls
+# ========
+#   Left mouse drag    orbit camera (dx -> yaw, dy -> pitch)
+#   Mouse wheel        zoom (camera distance)
+#   R                  reset camera (yaw=0, pitch=0.3, dist=2.5)
+#   Escape / close     quit
+#
+# The window title shows yaw, pitch, distance, FPS, and ms/frame.
+#
+
+# /// script
+# dependencies = ["cuda_bindings", "cuda_core>0.6.0", "pyglet"]
+# ///
+
+import ctypes
+import math
+import sys
+import time
+
+import numpy as np
+
+from cuda.core import (
+    AddressMode,
+    Array,
+    ArrayFormat,
+    Device,
+    FilterMode,
+    GraphicsResource,
+    LaunchConfig,
+    Program,
+    ProgramOptions,
+    ReadMode,
+    ResourceDescriptor,
+    SurfaceObject,
+    TextureDescriptor,
+    TextureObject,
+    launch,
+)
+
+# ---------------------------------------------------------------------------
+# Configuration (feel free to change these)
+# ---------------------------------------------------------------------------
+WIDTH = 800  # window and PBO width in pixels
+HEIGHT = 600  # window and PBO height in pixels
+VOLUME_SIZE = 128   # voxels per axis (one-shot bake); render_sdf's STEP/NEPS assume 128.
+
+# Camera defaults / clamps (yaw and pitch in radians, dist in world units).
+RESET_YAW = 0.0
+RESET_PITCH = 0.3
+RESET_DIST = 2.5
+PITCH_MIN = -1.45    # stay inside (-pi/2, pi/2) so the up-vector stays sane.
+PITCH_MAX = 1.45
+DIST_MIN = 1.2
+DIST_MAX = 8.0
+
+
+# ============================= Helper functions =============================
+#
+# The functions below set up CUDA and OpenGL. If you're here to learn about
+# 3D Array / TextureObject / SurfaceObject, skip ahead to main() -- the
+# interesting part is there. These helpers exist so that main() reads like a
+# short story instead of a wall of boilerplate.
+# ============================================================================
+
+
+def _check_compute_capability(dev):
+    """Exit the process unless `dev` is sm_30+ (3D arrays + bindless objects need it)."""
+    cc = dev.compute_capability
+    if cc.major < 3:
+        print(
+            f"This example requires compute capability >= 3.0, "
+            f"got sm_{cc.major}{cc.minor}.",
+            file=sys.stderr,
+        )
+        sys.exit(1)
+
+
+def setup_cuda():
+    """Make device 0 current, compile both kernels, return (device, stream, kernels dict)."""
+    dev = Device(0)
+    dev.set_current()
+    _check_compute_capability(dev)
+    stream = dev.create_stream()
+
+    # C++ is required so the templated tex3D<float> / surf3Dwrite<float>
+    # overloads resolve. extern "C" on the kernel symbols keeps the function
+    # names unmangled, so they can be looked up by their plain names below.
+    program_options = ProgramOptions(std="c++17", arch=f"sm_{dev.arch}")
+    prog = Program(KERNEL_SOURCE, code_type="c++", options=program_options)
+    mod = prog.compile(
+        "cubin",
+        name_expressions=("bake_sdf", "render_sdf"),
+    )
+    kernels = {
+        "bake": mod.get_kernel("bake_sdf"),
+        "render": mod.get_kernel("render_sdf"),
+    }
+    return dev, stream, kernels
+
+
+def make_volume_array():
+    """Allocate the VOLUME_SIZE^3 SDF volume: one FLOAT32 channel, surface-capable."""
+    return Array.from_descriptor(
+        shape=(VOLUME_SIZE, VOLUME_SIZE, VOLUME_SIZE),
+        format=ArrayFormat.FLOAT32,
+        num_channels=1,
+        surface_load_store=True,
+    )
+
+
+def make_volume_texture(arr):
+    """Return a TextureObject over `arr`: LINEAR filter, CLAMP addressing, normalized coords.
+
+    Normalized coords let the kernel sample as (u, v, w) in [0, 1]; CLAMP at
+    the boundaries matches the rendering logic that bails out as soon as the
+    march leaves the volume's [-1, 1]^3 box, so out-of-range sampling never
+    pollutes a real hit. ELEMENT_TYPE reads return the stored floats as-is.
+    """
+    res_desc = ResourceDescriptor.from_array(arr)
+    tex_desc = TextureDescriptor(
+        address_mode=AddressMode.CLAMP,
+        filter_mode=FilterMode.LINEAR,
+        read_mode=ReadMode.ELEMENT_TYPE,
+        normalized_coords=True,
+    )
+    return TextureObject.from_descriptor(resource=res_desc, texture_descriptor=tex_desc)
+
+
+def bake_volume(stream, kernels, arr):
+    """Run the one-shot bake kernel that fills the volume with the SDF.
+
+    The SurfaceObject lives only for the duration of this call: the bake is
+    launched with the bindless handle as a kernel argument, and the stream
+    is synchronized before the `with` block lets the SurfaceObject close.
+    The Array itself outlives this scope -- it's the long-lived backing store
+    for the render-loop TextureObject.
+    """
+    with SurfaceObject.from_array(arr) as bake_surf:
+        block = (8, 8, 8)
+        grid = (
+            (VOLUME_SIZE + block[0] - 1) // block[0],
+            (VOLUME_SIZE + block[1] - 1) // block[1],
+            (VOLUME_SIZE + block[2] - 1) // block[2],
+        )
+        launch(
+            stream,
+            LaunchConfig(grid=grid, block=block),
+            kernels["bake"],
+            np.uint64(bake_surf.handle),
+            np.int32(VOLUME_SIZE),
+        )
+        # Synchronize before the SurfaceObject context exits so the bindless
+        # handle is still valid while the kernel runs.
+        stream.sync()
+
+
+def create_window():
+    """Open a pyglet window; return (window, gl_module, pyglet). Exits if pyglet is missing."""
+    try:
+        import pyglet
+        from pyglet.gl import gl as _gl
+    except ImportError:
+        print(
+            "This example requires pyglet >= 2.0.\nInstall it with:  pip install pyglet",
+            file=sys.stderr,
+        )
+        sys.exit(1)
+
+    window = pyglet.window.Window(
+        WIDTH,
+        HEIGHT,
+        caption="cuda.core 3D Array - SDF Volume Ray-Marcher",
+        vsync=False,
+    )
+    return window, _gl, pyglet
+
+
+def create_display_resources(gl, width, height):
+    """Build the GL display objects: shader, fullscreen-quad VAO, empty RGBA8 texture.
+
+    Not CUDA-specific; identical to the other gl_interop_* examples.
+    Returns (shader_program, vertex_array_id, texture_id).
+    """
+    from pyglet.graphics.shader import Shader, ShaderProgram
+
+    vert = Shader(VERTEX_SHADER_SOURCE, "vertex")
+    frag = Shader(FRAGMENT_SHADER_SOURCE, "fragment")
+    shader_prog = ShaderProgram(vert, frag)
+
+    quad_verts = np.array(
+        [
+            # x,  y,    s, t      (two triangles; position + texture coordinate)
+            -1, -1, 0, 0,
+             1, -1, 1, 0,
+             1,  1, 1, 1,
+            -1, -1, 0, 0,
+             1,  1, 1, 1,
+            -1,  1, 0, 1,
+        ],
+        dtype=np.float32,
+    )
+
+    vao = ctypes.c_uint(0)
+    gl.glGenVertexArrays(1, ctypes.byref(vao))
+    gl.glBindVertexArray(vao.value)
+
+    vbo = ctypes.c_uint(0)
+    gl.glGenBuffers(1, ctypes.byref(vbo))
+    gl.glBindBuffer(gl.GL_ARRAY_BUFFER, vbo.value)
+    gl.glBufferData(
+        gl.GL_ARRAY_BUFFER,
+        quad_verts.nbytes,
+        quad_verts.ctypes.data_as(ctypes.c_void_p),
+        gl.GL_STATIC_DRAW,
+    )
+
+    stride = 4 * 4  # 4 floats * 4 bytes each; texcoord starts at byte offset 8
+    pos_loc = gl.glGetAttribLocation(shader_prog.id, b"position")
+    gl.glEnableVertexAttribArray(pos_loc)
+    gl.glVertexAttribPointer(pos_loc, 2, gl.GL_FLOAT, gl.GL_FALSE, stride, ctypes.c_void_p(0))
+
+    tc_loc = gl.glGetAttribLocation(shader_prog.id, b"texcoord")
+    gl.glEnableVertexAttribArray(tc_loc)
+    gl.glVertexAttribPointer(tc_loc, 2, gl.GL_FLOAT, gl.GL_FALSE, stride, ctypes.c_void_p(8))
+
+    gl.glBindVertexArray(0)
+
+    tex = ctypes.c_uint(0)
+    gl.glGenTextures(1, ctypes.byref(tex))
+    gl.glBindTexture(gl.GL_TEXTURE_2D, tex.value)
+    gl.glTexParameteri(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_MIN_FILTER, gl.GL_LINEAR)
+    gl.glTexParameteri(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_MAG_FILTER, gl.GL_LINEAR)
+    gl.glTexImage2D(
+        gl.GL_TEXTURE_2D,
+        0,
+        gl.GL_RGBA8,
+        width,
+        height,
+        0,
+        gl.GL_RGBA,
+        gl.GL_UNSIGNED_BYTE,
+        None,
+    )
+
+    return shader_prog, vao.value, tex.value
+
+
+def create_pixel_buffer(gl, width, height):
+    """Create an uninitialized RGBA8 Pixel Buffer Object (PBO) -- the CUDA/GL bridge.
+
+    Returns (pbo_gl_name, size_in_bytes); the unpack binding is reset to 0.
+    """
+    pbo = ctypes.c_uint(0)
+    gl.glGenBuffers(1, ctypes.byref(pbo))
+    gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, pbo.value)
+    nbytes = width * height * 4  # RGBA8
+    gl.glBufferData(gl.GL_PIXEL_UNPACK_BUFFER, nbytes, None, gl.GL_DYNAMIC_DRAW)
+    gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, 0)
+    return pbo.value, nbytes
+
+
+def copy_pbo_to_texture(gl, pbo_id, tex_id, width, height):
+    """Copy pixel data from the PBO into the GL texture (GPU-to-GPU)."""
+    gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, pbo_id)
+    gl.glBindTexture(gl.GL_TEXTURE_2D, tex_id)
+    gl.glTexSubImage2D(
+        gl.GL_TEXTURE_2D,
+        0,
+        0,
+        0,
+        width,
+        height,
+        gl.GL_RGBA,
+        gl.GL_UNSIGNED_BYTE,
+        None,  # with a PBO bound this is byte offset 0 into it, not a client pointer
+    )
+    gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, 0)
+
+
+def draw_fullscreen_quad(gl, shader_prog, vao_id, tex_id):
+    """Draw `tex_id` on the fullscreen quad (6 vertices), then unbind the VAO and program."""
+    gl.glUseProgram(shader_prog.id)
+    gl.glBindTexture(gl.GL_TEXTURE_2D, tex_id)
+    gl.glBindVertexArray(vao_id)
+    gl.glDrawArrays(gl.GL_TRIANGLES, 0, 6)
+    gl.glBindVertexArray(0)
+    gl.glUseProgram(0)
+
+
+# ================================== main() ==================================
+
+
+def main():
+    """Bake the SDF volume once, then run the interactive pyglet render loop."""
+    # --- Step 1: Set up CUDA (compile kernels, create stream) ---
+    dev, stream, kernels = setup_cuda()
+
+    # --- Step 2: Allocate the 3D SDF volume and bake it once ---
+    #     The Array is the long-lived backing store; it must outlive the
+    #     render loop. The SurfaceObject is only needed for the one-shot bake
+    #     and is closed before we ever bind a TextureObject to the same Array.
+    arr = make_volume_array()
+    bake_volume(stream, kernels, arr)
+
+    # --- Step 3: Bind the volume as a trilinear TextureObject ---
+    #     LINEAR + CLAMP + normalized_coords gives us free hardware trilinear
+    #     filtering, which is exactly what we want for both the SDF samples
+    #     in the ray march and the normal-finite-difference samples.
+    volume_tex = make_volume_texture(arr)
+
+    # --- Step 4: Open a window and set up the CUDA/GL bridge ---
+    window, gl, pyglet = create_window()
+    shader_prog, quad_vao, tex_id = create_display_resources(gl, WIDTH, HEIGHT)
+    pbo_id, _ = create_pixel_buffer(gl, WIDTH, HEIGHT)
+    resource = GraphicsResource.from_gl_buffer(pbo_id, flags="write_discard")
+
+    # --- Step 5: Render loop state ---
+    # Orbit camera: yaw/pitch are angles in radians, dist is the orbit radius.
+    # The render kernel derives the eye, basis, and per-pixel rays from them.
+    cam = {
+        "yaw": RESET_YAW,
+        "pitch": RESET_PITCH,
+        "dist": RESET_DIST,
+    }
+    frame_count = [0]  # one-element lists: mutable cells the handlers below update
+    fps_time = [time.monotonic()]
+    last_fps = [0.0]
+    last_frame_ms = [0.0]
+
+    block = (16, 16, 1)
+    grid = (
+        (WIDTH + block[0] - 1) // block[0],
+        (HEIGHT + block[1] - 1) // block[1],
+        1,
+    )
+    config = LaunchConfig(grid=grid, block=block)
+
+    @window.event
+    def on_draw():
+        window.clear()
+
+        # (a) Map the PBO so CUDA can write into it.
+        with resource.map(stream=stream) as buf:
+            # (b) Launch the ray-march kernel. The camera params are passed
+            #     as scalars; the kernel computes the orbit eye position and
+            #     per-pixel ray direction itself.
+            launch(
+                stream,
+                config,
+                kernels["render"],
+                buf.handle,
+                np.int32(WIDTH),
+                np.int32(HEIGHT),
+                np.uint64(volume_tex.handle),
+                np.float32(cam["yaw"]),
+                np.float32(cam["pitch"]),
+                np.float32(cam["dist"]),
+            )
+        # (c) Unmap happens when the `with` block exits; cuGraphicsUnmapResources
+        #     serializes the CUDA work against subsequent OpenGL use.
+
+        copy_pbo_to_texture(gl, pbo_id, tex_id, WIDTH, HEIGHT)
+        draw_fullscreen_quad(gl, shader_prog, quad_vao, tex_id)
+
+        frame_count[0] += 1
+        now = time.monotonic()
+        if now - fps_time[0] >= 0.5:  # refresh the caption about twice a second
+            last_fps[0] = frame_count[0] / (now - fps_time[0])
+            last_frame_ms[0] = 1000.0 / last_fps[0] if last_fps[0] > 0 else 0.0
+            frame_count[0] = 0
+            fps_time[0] = now
+            window.set_caption(
+                "cuda.core 3D Array - SDF Volume Ray-Marcher  "
+                f"yaw={cam['yaw']:+.2f} pitch={cam['pitch']:+.2f} "
+                f"dist={cam['dist']:.2f}  "
+                f"{last_fps[0]:.0f} FPS  {last_frame_ms[0]:.2f} ms/frame"
+            )
+
+    @window.event
+    def on_mouse_drag(x, y, dx, dy, buttons, modifiers):
+        # Left-click drag orbits the camera. dx -> yaw (sign convention chosen
+        # so that dragging right rotates the scene right); dy -> pitch (drag
+        # up tilts the camera up).
+        if not (buttons & pyglet.window.mouse.LEFT):
+            return
+        ORBIT_SCALE = 0.005
+        cam["yaw"] += dx * ORBIT_SCALE
+        cam["pitch"] += dy * ORBIT_SCALE
+        # Clamp pitch so the up-vector never flips (we use world-up (0,1,0)).
+        if cam["pitch"] < PITCH_MIN:
+            cam["pitch"] = PITCH_MIN
+        elif cam["pitch"] > PITCH_MAX:
+            cam["pitch"] = PITCH_MAX
+
+    @window.event
+    def on_mouse_scroll(x, y, scroll_x, scroll_y):
+        # Scroll wheel zoom: geometric so each tick feels uniform regardless
+        # of current distance. Positive scroll_y (wheel up) zooms in.
+        if scroll_y == 0:
+            return
+        cam["dist"] *= 0.9 ** scroll_y
+        if cam["dist"] < DIST_MIN:
+            cam["dist"] = DIST_MIN
+        elif cam["dist"] > DIST_MAX:
+            cam["dist"] = DIST_MAX
+
+    @window.event
+    def on_key_press(symbol, modifiers):
+        key = pyglet.window.key
+        if symbol == key.ESCAPE:
+            window.close()
+        elif symbol == key.R:
+            cam["yaw"] = RESET_YAW
+            cam["pitch"] = RESET_PITCH
+            cam["dist"] = RESET_DIST
+
+    @window.event
+    def on_close():
+        # Release CUDA resources in reverse construction order. The GL
+        # objects clean up via pyglet on window close.
+        resource.close()
+        volume_tex.close()
+        arr.close()
+        stream.close()
+
+    pyglet.app.run(interval=0)
+
+
+# ======================== GPU code (CUDA + GLSL) ============================
+#
+# Two CUDA C++ kernels are concatenated into one program string so they share
+# a single NVRTC compile.
+#
+#   bake_sdf    -- one thread per voxel. Computes the SDF of an
+#                  "abs(gyroid) - 0.20" surface intersected with a bounding
+#                  sphere, then writes the scalar via surf3Dwrite. NOTE:
+#                  surf3Dwrite's x coordinate is in BYTES, y and z in
+#                  elements -- a classic CUDA gotcha.
+#
+#   render_sdf  -- one thread per screen pixel. Builds the orbit-camera ray,
+#                  fixed-step-marches the volume via tex3D<float> on a trilinear-
+#                  filtered, normalized-coord TextureObject, and shades the
+#                  hit with diffuse + ambient + specular. Misses return a
+#                  sky gradient. Writes RGBA8 directly into the OpenGL PBO.
+#
+# GLSL shaders at the very bottom just draw a textured quad. Nothing CUDA-
+# specific there.
+#
+# ============================================================================
+
+KERNEL_SOURCE = r"""
+// --------------------------------------------------------------------------
+// Small inline helpers. Keeping them __device__ __forceinline__ encourages
+// the compiler to drop them inline and avoids any cross-TU linkage worries.
+// --------------------------------------------------------------------------
+__device__ __forceinline__ float clampf(float v, float a, float b) {
+    return fminf(fmaxf(v, a), b);
+}
+
+__device__ __forceinline__ float dot3(float ax, float ay, float az,
+                                      float bx, float by, float bz) {
+    return ax * bx + ay * by + az * bz;
+}
+
+__device__ __forceinline__ float length3(float x, float y, float z) {
+    return sqrtf(x * x + y * y + z * z);
+}
+
+// --------------------------------------------------------------------------
+// bake_sdf: one thread per voxel writes the SDF of a gyroid-intersect-sphere
+//           into a single-channel float 3D Array via a SurfaceObject.
+//
+//   surf is bound to a (size^3, FLOAT32 x 1) Array allocated with
+//   surface_load_store=True.
+//   surf3Dwrite's x coordinate is in BYTES (multiply by sizeof(float));
+//   y and z are in elements. Passing an element index for x instead of a
+//   byte offset addresses the wrong texels, so it's worth flagging explicitly.
+// --------------------------------------------------------------------------
+extern "C" __global__
+void bake_sdf(cudaSurfaceObject_t surf, int size) {
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    int z = blockIdx.z * blockDim.z + threadIdx.z;
+    if (x >= size || y >= size || z >= size) return;
+
+    // Map the voxel index to world-space p in [-1, 1]^3 (texel centers).
+    float fx = ((float)x + 0.5f) / (float)size;
+    float fy = ((float)y + 0.5f) / (float)size;
+    float fz = ((float)z + 0.5f) / (float)size;
+    float px = fx * 2.0f - 1.0f;
+    float py = fy * 2.0f - 1.0f;
+    float pz = fz * 2.0f - 1.0f;
+
+    // Gyroid frequency: 3 cycles per world unit (6 across [-1, 1]) gives a busy
+    // but not noisy surface at 128^3 resolution. TAU = 2 * pi * frequency.
+    const float TAU = 6.2831853071795864f * 3.0f;
+
+    float sx = sinf(px * TAU), cx = cosf(px * TAU);
+    float sy = sinf(py * TAU), cy = cosf(py * TAU);
+    float sz = sinf(pz * TAU), cz = cosf(pz * TAU);
+    float gyroid     = sx * cy + sy * cz + sz * cx;
+    // Slab thickness: the gyroid term is not a true (1-Lipschitz) distance --
+    // its gradient scales with TAU ~= 19 -- so the stored values overstate
+    // how far away the surface really is. A wider slab (0.20 vs the
+    // canonical 0.05) gives the fixed-step ray marcher in render_sdf enough
+    // hit candidates per ray to render real geometry instead of mostly sky.
+    float sdf_gyroid = fabsf(gyroid) - 0.20f;          // slab around iso-zero
+    float sdf_sphere = length3(px, py, pz) - 0.9f;     // bounding sphere
+    float sdf        = fmaxf(sdf_gyroid, sdf_sphere);  // CSG intersection
+
+    // surf3Dwrite: x in BYTES (cast sizeof to int so the multiply stays in
+    // signed int instead of promoting to size_t), y/z in elements.
+    surf3Dwrite<float>(sdf, surf, x * (int)sizeof(float), y, z);
+}
+
+// --------------------------------------------------------------------------
+// SDF sampler: tex3D wants normalized coords in [0, 1]; the volume covers
+// [-1, 1] in world space, so we remap with `(p + 1) * 0.5`. Returns the
+// raw stored SDF (a signed distance in world units).
+// --------------------------------------------------------------------------
+__device__ __forceinline__ float sample_sdf(cudaTextureObject_t tex,
+                                            float px, float py, float pz) {
+    return tex3D<float>(tex,
+                        (px + 1.0f) * 0.5f,
+                        (py + 1.0f) * 0.5f,
+                        (pz + 1.0f) * 0.5f);
+}
+
+// --------------------------------------------------------------------------
+// render_sdf: one thread per screen pixel. Builds the orbit camera, marches
+// a ray through the SDF volume, and writes a shaded RGBA8 pixel to the PBO.
+//
+// Camera math (orbit, look-at origin, world-up (0, 1, 0)):
+//   eye = dist * (cos(pitch)*cos(yaw), sin(pitch), cos(pitch)*sin(yaw))
+//   fwd = normalize(target - eye)         (target = origin)
+//   right = normalize(cross(fwd, up))
+//   up'   = cross(right, fwd)
+//   For a pixel at (u, v) in NDC ([-1, 1] x [-1, 1] with v=1 at the top),
+//   dir = normalize(fwd + tan(fov/2) * (aspect * u * right + v * up'))
+//
+// Ray-march:
+//   Fixed-step march: t += STEP, where STEP = 1/128 world units, i.e. half a
+//   voxel (voxel edge = 2/128). The gyroid SDF is not 1-Lipschitz, which
+//   makes classical sphere tracing (t += sdf(p)) overshoot thin slabs and
+//   miss almost every ray. A uniform sub-voxel step is robust and cheap
+//   because the SDF is just a tex3D lookup. We declare a HIT when sdf < HIT_EPS.
+//
+// Bounds bail: outside the [-1, 1]^3 box, return the sky.
+// Normal: 6-sample central differences with eps = 1.5/128 world units
+//         (0.75 voxel), so each +/- pair is 1.5 voxels apart -- short enough
+//         to capture local surface direction, long enough that trilinear
+//         filtering actually moves the result.
+// --------------------------------------------------------------------------
+extern "C" __global__
+void render_sdf(unsigned char* output,
+                int width,
+                int height,
+                cudaTextureObject_t tex,
+                float yaw,
+                float pitch,
+                float dist) {
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    if (x >= width || y >= height) return;
+
+    // ---- Build the orbit camera basis ----------------------------------
+    float cp = cosf(pitch), sp = sinf(pitch);
+    float cy = cosf(yaw),   sy = sinf(yaw);
+
+    // Eye on a sphere of radius `dist` around the origin.
+    float ex = dist * cp * cy;
+    float ey = dist * sp;
+    float ez = dist * cp * sy;
+
+    // fwd = normalize(target - eye), target = origin -> fwd = -eye / |eye|.
+    float fl = length3(ex, ey, ez);
+    // Guard against the (clamped) dist being zero (not reachable, but cheap).
+    if (fl < 1e-6f) fl = 1e-6f;
+    float fx = -ex / fl, fy = -ey / fl, fz = -ez / fl;
+
+    // right = normalize(cross(fwd, world_up)), world_up = (0, 1, 0).
+    // cross((fx,fy,fz), (0,1,0)) = (fy*0 - fz*1, fz*0 - fx*0, fx*1 - fy*0)
+    //                            = (-fz, 0, fx)
+    float rx = -fz;
+    float ry = 0.0f;
+    float rz = fx;
+    float rl = length3(rx, ry, rz);
+    if (rl < 1e-6f) rl = 1e-6f;
+    rx /= rl; ry /= rl; rz /= rl;
+
+    // up' = cross(right, fwd). With right purely in the xz-plane, this is a
+    // proper orthonormal up; recompute to keep the basis consistent.
+    float ux = ry * fz - rz * fy;
+    float uy = rz * fx - rx * fz;
+    float uz = rx * fy - ry * fx;
+
+    // ---- Per-pixel ray direction ---------------------------------------
+    // NDC with v=1 at the TOP. With our PBO layout (y=0 written first ->
+    // ends up at the bottom of the on-screen texture courtesy of the GL
+    // shader's [0, 1] texcoord), v = 2*v_norm - 1 already maps row 0 of the
+    // PBO to v = -1 (bottom of the image), which matches the camera's
+    // up'-axis convention. No flip needed.
+    float u_ndc = 2.0f * ((float)x + 0.5f) / (float)width  - 1.0f;
+    float v_ndc = 2.0f * ((float)y + 0.5f) / (float)height - 1.0f;
+
+    const float FOV_Y    = 0.7853981633974483f;        // 45 degrees
+    const float TAN_HALF = 0.41421356237309515f;       // tanf(FOV_Y / 2)
+    float aspect = (float)width / (float)height;
+
+    float dx = fx + u_ndc * aspect * TAN_HALF * rx + v_ndc * TAN_HALF * ux;
+    float dy = fy + u_ndc * aspect * TAN_HALF * ry + v_ndc * TAN_HALF * uy;
+    float dz = fz + u_ndc * aspect * TAN_HALF * rz + v_ndc * TAN_HALF * uz;
+    float dl = length3(dx, dy, dz);
+    if (dl < 1e-6f) dl = 1e-6f;
+    dx /= dl; dy /= dl; dz /= dl;
+
+    // ---- Ray vs. the [-1, 1]^3 box (slab method) -----------------------
+    // The camera usually sits outside the volume (the host clamps dist to
+    // >= 1.2, which puts at least one component of the eye outside [-1, 1]
+    // for typical framings), so we must first advance `t` to the AABB entry
+    // before any in-volume sampling is meaningful. tNear is the entry
+    // distance (clamped to >= 0 so we don't march backwards when the eye is
+    // inside the box, e.g. on a diagonal at minimum distance); tFar is the
+    // exit distance. If the interval is empty (tNear > tFar), the ray misses.
+    float inv_dx = 1.0f / (fabsf(dx) > 1e-8f ? dx : (dx >= 0 ? 1e-8f : -1e-8f));
+    float inv_dy = 1.0f / (fabsf(dy) > 1e-8f ? dy : (dy >= 0 ? 1e-8f : -1e-8f));
+    float inv_dz = 1.0f / (fabsf(dz) > 1e-8f ? dz : (dz >= 0 ? 1e-8f : -1e-8f));
+    float t1x = (-1.0f - ex) * inv_dx, t2x = ( 1.0f - ex) * inv_dx;
+    float t1y = (-1.0f - ey) * inv_dy, t2y = ( 1.0f - ey) * inv_dy;
+    float t1z = (-1.0f - ez) * inv_dz, t2z = ( 1.0f - ez) * inv_dz;
+    float tNear = fmaxf(fmaxf(fminf(t1x, t2x), fminf(t1y, t2y)), fminf(t1z, t2z));
+    float tFar  = fminf(fminf(fmaxf(t1x, t2x), fmaxf(t1y, t2y)), fmaxf(t1z, t2z));
+
+    bool  hit = false;
+    float hx = 0.0f, hy = 0.0f, hz = 0.0f;
+
+    if (tFar > fmaxf(tNear, 0.0f)) {
+        // ---- Fixed-step march through the SDF volume from the AABB entry
+        // Sphere tracing relies on a Lipschitz-1 SDF: the magnitude of the
+        // sample tells you a safe distance you can step without crossing
+        // the surface. But the gyroid SDF here, |sx*cy + sy*cz + sz*cx|
+        // - 0.20, has a gradient scaling with TAU ~= 19, so the stored
+        // magnitude vastly over-reports the true distance. Sphere tracing
+        // would routinely overshoot thin slab regions, leaving most rays
+        // missing geometry that's actually there. A fixed-step march is
+        // cheap (the SDF is just a tex3D lookup) and robust: each step
+        // advances by half a voxel, so any positive crossing of the iso-zero
+        // surface lands inside a thin window where HIT_EPS catches it.
+        //
+        // STEP = 1/128 ~= 0.008 world units (voxel edge = 2/128). 256 steps
+        // cover 2.0 units; longer chords (box diagonal ~3.46) end early as sky.
+        const int   MAX_STEPS = 256;
+        const float STEP      = 1.0f / 128.0f;
+        const float HIT_EPS   = 1.0e-3f;
+        // Bias slightly inside the box so the very first sample isn't on
+        // the boundary (CLAMP addressing makes the boundary sample valid,
+        // but starting just inside avoids one wasted iteration).
+        float t = fmaxf(tNear, 0.0f) + 1e-4f;
+        float t_exit = tFar;
+
+        #pragma unroll 1
+        for (int i = 0; i < MAX_STEPS; ++i) {
+            float pxw = ex + t * dx;
+            float pyw = ey + t * dy;
+            float pzw = ez + t * dz;
+
+            float s = sample_sdf(tex, pxw, pyw, pzw);
+            if (s < HIT_EPS) {
+                hit = true;
+                hx = pxw; hy = pyw; hz = pzw;
+                break;
+            }
+            t += STEP;
+            if (t > t_exit) break;
+        }
+    }
+
+    // ---- Shade -----------------------------------------------------------
+    float r, g, b;
+    if (hit) {
+        // Central-difference normal in world space. Each offset is 0.75
+        // voxel (1.5/128 world units): short enough to capture local
+        // geometry, long enough that trilinear filtering moves the result.
+        const float NEPS = 1.5f / 128.0f;
+        float nx = sample_sdf(tex, hx + NEPS, hy, hz) -
+                   sample_sdf(tex, hx - NEPS, hy, hz);
+        float ny = sample_sdf(tex, hx, hy + NEPS, hz) -
+                   sample_sdf(tex, hx, hy - NEPS, hz);
+        float nz = sample_sdf(tex, hx, hy, hz + NEPS) -
+                   sample_sdf(tex, hx, hy, hz - NEPS);
+        float nl = length3(nx, ny, nz);
+        if (nl < 1e-6f) nl = 1e-6f;
+        nx /= nl; ny /= nl; nz /= nl;
+
+        // Fixed key light (normalized world direction).
+        const float LX = 0.5773502691896258f;          // (1,1,-1)/sqrt(3)
+        const float LY = 0.5773502691896258f;
+        const float LZ = -0.5773502691896258f;
+        float diff = fmaxf(0.0f, dot3(nx, ny, nz, LX, LY, LZ));
+
+        // Specular: Blinn-Phong half-vector exponent. View dir = -ray dir.
+        float vx = -dx, vy = -dy, vz = -dz;
+        float hx2 = LX + vx, hy2 = LY + vy, hz2 = LZ + vz;
+        float hl  = length3(hx2, hy2, hz2);
+        if (hl < 1e-6f) hl = 1e-6f;
+        hx2 /= hl; hy2 /= hl; hz2 /= hl;
+        float ndoth = fmaxf(0.0f, dot3(nx, ny, nz, hx2, hy2, hz2));
+        float spec = powf(ndoth, 32.0f);
+
+        // Base albedo varies with the surface normal so the gyroid lattice
+        // reads as a single material with smooth variation, not flat plastic.
+        float base_r = 0.55f + 0.30f * nx;
+        float base_g = 0.50f + 0.30f * ny;
+        float base_b = 0.70f + 0.30f * nz;
+
+        const float AMBIENT = 0.18f;
+        r = base_r * (AMBIENT + 0.82f * diff) + 0.6f * spec;
+        g = base_g * (AMBIENT + 0.82f * diff) + 0.6f * spec;
+        b = base_b * (AMBIENT + 0.82f * diff) + 0.7f * spec;
+    } else {
+        // Sky: dark blue at the top, near-black at the bottom. The PBO's row
+        // 0 is the bottom of the on-screen image (see the v_ndc comment),
+        // so we use the y coordinate of the ray direction (close to v_ndc
+        // in screen space) for the gradient.
+        float sky = 0.5f * (dy + 1.0f);                // [0, 1] roughly
+        sky = clampf(sky, 0.0f, 1.0f);
+        r = 0.02f + 0.06f * sky;
+        g = 0.03f + 0.10f * sky;
+        b = 0.05f + 0.20f * sky;
+    }
+
+    r = clampf(r, 0.0f, 1.0f);
+    g = clampf(g, 0.0f, 1.0f);
+    b = clampf(b, 0.0f, 1.0f);
+
+    int idx = (y * width + x) * 4;
+    output[idx + 0] = (unsigned char)(r * 255.0f);
+    output[idx + 1] = (unsigned char)(g * 255.0f);
+    output[idx + 2] = (unsigned char)(b * 255.0f);
+    output[idx + 3] = 255;
+}
+"""
+
+# GLSL shaders -- these just display a texture on a fullscreen rectangle.
+# Nothing CUDA-specific here.
+
+VERTEX_SHADER_SOURCE = """#version 330 core
+in vec2 position;
+in vec2 texcoord;
+out vec2 v_texcoord;
+void main() {
+    gl_Position = vec4(position, 0.0, 1.0);
+    v_texcoord = texcoord;
+}
+"""
+
+FRAGMENT_SHADER_SOURCE = """#version 330 core
+in vec2 v_texcoord;
+out vec4 fragColor;
+uniform sampler2D tex;
+void main() {
+    fragColor = texture(tex, v_texcoord);
+}
+"""
+
+
+if __name__ == "__main__":
+    main()
diff --git a/cuda_core/examples/gl_interop_texture_filter.py b/cuda_core/examples/gl_interop_texture_filter.py
new file mode 100644
index 00000000000..82c880a8943
--- /dev/null
+++ b/cuda_core/examples/gl_interop_texture_filter.py
@@ -0,0 +1,607 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# ################################################################################
+#
+# This example demonstrates cuda.core.TextureObject hardware filtering by
+# comparing FilterMode.POINT and FilterMode.LINEAR side by side on the same
+# source CUDA Array. Requires pyglet.
+#
+# ################################################################################
+
+# What this example teaches
+# =========================
+# How to back two TextureObjects with the SAME CUDA Array and observe the
+# difference between POINT (nearest-texel) and LINEAR (bilinear) filtering
+# under user-controlled zoom and pan.  Also shows how the address mode
+# (WRAP / CLAMP / MIRROR / BORDER) is baked into the texture descriptor at
+# creation time, so changing it at runtime means rebuilding the textures.
+#
+# How it works
+# ============
+# A single 256x256 RGBA8 Array holds a procedurally-generated test pattern
+# (high-contrast checkerboard, diagonals, gradient stripe).  Two
+# TextureObjects are built on top of that Array:
+#
+#       Array (256x256 RGBA UINT8)
+#       /                       \
+#   tex_point                  tex_linear
+#   FilterMode.POINT           FilterMode.LINEAR
+#   AddressMode.WRAP           AddressMode.WRAP
+#   ReadMode.NORMALIZED_FLOAT  ReadMode.NORMALIZED_FLOAT
+#
+# Each frame, a single CUDA kernel runs over a 1024x512 OpenGL PBO:
+#
+#   - Left half of the screen samples tex_point.
+#   - Right half samples tex_linear.
+#   - Both halves use the same (zoom, pan) -> texture-space mapping, so the
+#     two views show the same content with different filtering.
+#   - A 2-pixel vertical white line marks the divider.
+#
+# Because ReadMode.NORMALIZED_FLOAT is used, tex2D<float4>() returns each
+# channel as a float in [0, 1]; the kernel multiplies by 255 and writes
+# unsigned bytes back into the PBO.
+#
+# The PBO is then copied to a GL texture and drawn on a fullscreen quad,
+# identical to the plasma example.
+#
+# What you should see
+# ===================
+# A 1024x512 window split down the middle.  The left half (POINT) shows
+# blocky / pixelated magnification; the right half (LINEAR) shows smooth
+# bilinear interpolation.  Drag with the left mouse button to pan,
+# scroll to zoom, press M to cycle the texture address mode, press R to
+# reset, Escape or close the window to exit.  The current address mode
+# and FPS are shown in the window title.
+#
+
+# /// script
+# dependencies = ["cuda_bindings", "cuda_core>0.6.0", "pyglet"]
+# ///
+
+import ctypes
+import sys
+import time
+
+import numpy as np
+
+from cuda.core import (
+    AddressMode,
+    Array,
+    ArrayFormat,
+    Device,
+    FilterMode,
+    GraphicsResource,
+    LaunchConfig,
+    Program,
+    ProgramOptions,
+    ReadMode,
+    ResourceDescriptor,
+    TextureDescriptor,
+    TextureObject,
+    launch,
+)
+
+# ---------------------------------------------------------------------------
+# Window and source-image dimensions (feel free to change these)
+# ---------------------------------------------------------------------------
+WIDTH = 1024  # output frame (window and PBO) width in pixels
+HEIGHT = 512  # output frame (window and PBO) height in pixels
+SRC_W = 256  # source test-pattern width in texels
+SRC_H = 256  # source test-pattern height in texels
+
+# Address modes cycled by pressing the M key; index 0 is the initial mode.
+ADDRESS_MODES = (
+    AddressMode.WRAP,
+    AddressMode.CLAMP,
+    AddressMode.MIRROR,
+    AddressMode.BORDER,
+)
+
+
+# ============================= Helper functions =============================
+#
+# The functions below set up CUDA and OpenGL.  If you're here to learn about
+# TextureObject filtering, the most interesting parts are in main() and in
+# make_pattern() / make_textures(); everything else is the same kind of
+# CUDA-GL interop boilerplate used by gl_interop_plasma.py.
+# ============================================================================
+
+
+def make_pattern(width, height):
+    """Return a (height, width, 4) uint8 RGBA image for filter comparison.
+
+    The image is built so POINT and LINEAR sampling look clearly different
+    under magnification.  Layers are painted in this order, so later ones
+    overwrite earlier ones where they overlap:
+      - 8x8-texel black/white checkerboard (high-frequency), alpha 255
+      - two 1-texel-wide red diagonals
+      - a horizontal blue->green gradient strip starting at y = height/4
+      - two thin horizontal bars ("text-like" blocks), yellow and cyan
+
+    ``width`` and ``height`` are the image size in texels.
+    """
+    rows = np.arange(height).reshape(-1, 1)
+    cols = np.arange(width).reshape(1, -1)
+
+    # Checkerboard: parity of the 8x8 cell index picks black (0) or white (255).
+    parity = ((cols // 8) + (rows // 8)) & 1
+    canvas = np.zeros((height, width, 4), dtype=np.uint8)
+    canvas[..., :3] = (parity.astype(np.uint8) * 255)[..., None]
+    canvas[..., 3] = 255
+
+    # Main diagonal and anti-diagonal in opaque red.
+    canvas[(cols == rows) | (cols == (width - 1 - rows))] = (255, 0, 0, 255)
+
+    # Gradient strip: at least 4 rows tall (height/32 for larger images).
+    # The slice clips at the bottom image edge if the strip would overrun it.
+    strip_top = height // 4
+    strip = canvas[strip_top:strip_top + max(4, height // 32)]
+    ramp = np.linspace(0, 255, width, dtype=np.uint8)
+    strip[..., 0] = 0
+    strip[..., 1] = ramp                  # G ramps up left to right
+    strip[..., 2] = 255 - ramp            # B ramps down left to right
+    strip[..., 3] = 255
+
+    # Two thin bars as (first row, x0, x1, colour); each is 4 rows tall.
+    bar_top = (3 * height) // 4
+    bars = (
+        (bar_top, width // 8, (width * 3) // 8, (255, 255, 0, 255)),
+        (bar_top + 8, (width * 5) // 8, (width * 7) // 8, (0, 255, 255, 255)),
+    )
+    for top, left, right, rgba in bars:
+        canvas[top:top + 4, left:right] = rgba
+
+    # Guarantee a C-contiguous result.
+    return np.ascontiguousarray(canvas)
+
+
+def make_textures(array, address_mode):
+    """Create a POINT-filtered and a LINEAR-filtered TextureObject on ``array``.
+
+    Both textures share one ResourceDescriptor and differ only in filter
+    mode; they use ``address_mode``, ReadMode.NORMALIZED_FLOAT and
+    non-normalized (texel-space) coordinates.  The address mode is fixed
+    when the texture object is created, so callers rebuild both textures to
+    change it.
+
+    Returns ``(tex_point, tex_linear)``.  The caller owns both objects and
+    must close() them.
+    """
+    source = ResourceDescriptor.from_array(array)
+
+    def build(filter_mode):
+        # One descriptor per filter mode; everything else is identical.
+        descriptor = TextureDescriptor(
+            address_mode=address_mode,
+            filter_mode=filter_mode,
+            read_mode=ReadMode.NORMALIZED_FLOAT,
+            normalized_coords=False,
+        )
+        return TextureObject.from_descriptor(
+            resource=source, texture_descriptor=descriptor
+        )
+
+    # POINT is created first, then LINEAR, matching the returned order.
+    return build(FilterMode.POINT), build(FilterMode.LINEAR)
+
+
+def setup_cuda(kernel_source):
+    """Compile ``kernel_source`` and return (device, stream, kernel, config).
+
+    Uses device 0 and sizes the launch grid to cover WIDTH x HEIGHT pixels.
+    """
+    device = Device(0)
+    device.set_current()
+    stream = device.create_stream()
+
+    # Compile as C++ so the templated tex2D<float4> overload resolves.
+    options = ProgramOptions(std="c++17", arch=f"sm_{device.arch}")
+    program = Program(kernel_source, code_type="c++", options=options)
+    module = program.compile("cubin", name_expressions=("split_screen_sample",))
+    kernel = module.get_kernel("split_screen_sample")
+
+    # 16x16 thread blocks; ceil-divide so partial edge tiles are covered.
+    block_x, block_y = 16, 16
+    grid = (-(-WIDTH // block_x), -(-HEIGHT // block_y), 1)
+    config = LaunchConfig(grid=grid, block=(block_x, block_y, 1))
+    return device, stream, kernel, config
+
+
+def create_window():
+    """Open the pyglet window and return (window, gl_module, pyglet).
+
+    pyglet is imported here rather than at module level; if the import
+    fails, an install hint is printed to stderr and the process exits
+    with status 1.
+    """
+    try:
+        import pyglet
+        from pyglet.gl import gl as _gl
+    except ImportError:
+        hint = "This example requires pyglet >= 2.0.\nInstall it with:  pip install pyglet"
+        print(hint, file=sys.stderr)
+        sys.exit(1)
+
+    # The window is created at the same WIDTH x HEIGHT as the rendered frame.
+    title = "TextureObject Filter Comparison - POINT vs LINEAR"
+    window = pyglet.window.Window(WIDTH, HEIGHT, caption=title, vsync=False)
+    return window, _gl, pyglet
+
+
+def create_display_resources(gl, width, height):
+    """Create the GL objects needed to show a texture on screen.
+
+    Builds the display shader program, a VAO/VBO holding a fullscreen quad and
+    an empty width x height RGBA8 texture.  Returns (shader_program, vao_id, tex_id).
+    """
+    from pyglet.graphics.shader import Shader, ShaderProgram
+
+    vert = Shader(VERTEX_SHADER_SOURCE, "vertex")
+    frag = Shader(FRAGMENT_SHADER_SOURCE, "fragment")
+    shader_prog = ShaderProgram(vert, frag)
+
+    # Fullscreen quad (two triangles).  Each vertex: x, y, s, t.
+    quad_verts = np.array(
+        [
+            -1, -1, 0, 0,
+             1, -1, 1, 0,
+             1,  1, 1, 1,
+            -1, -1, 0, 0,
+             1,  1, 1, 1,
+            -1,  1, 0, 1,
+        ],
+        dtype=np.float32,
+    )
+
+    vao = ctypes.c_uint(0)
+    gl.glGenVertexArrays(1, ctypes.byref(vao))
+    gl.glBindVertexArray(vao.value)
+
+    vbo = ctypes.c_uint(0)
+    gl.glGenBuffers(1, ctypes.byref(vbo))
+    gl.glBindBuffer(gl.GL_ARRAY_BUFFER, vbo.value)
+    gl.glBufferData(
+        gl.GL_ARRAY_BUFFER,
+        quad_verts.nbytes,
+        quad_verts.ctypes.data_as(ctypes.c_void_p),
+        gl.GL_STATIC_DRAW,
+    )
+
+    stride = 4 * 4  # bytes per vertex: 4 float32 values (x, y, s, t)
+    pos_loc = gl.glGetAttribLocation(shader_prog.id, b"position")
+    gl.glEnableVertexAttribArray(pos_loc)
+    gl.glVertexAttribPointer(pos_loc, 2, gl.GL_FLOAT, gl.GL_FALSE, stride, ctypes.c_void_p(0))
+    tc_loc = gl.glGetAttribLocation(shader_prog.id, b"texcoord")
+    gl.glEnableVertexAttribArray(tc_loc)
+    gl.glVertexAttribPointer(tc_loc, 2, gl.GL_FLOAT, gl.GL_FALSE, stride, ctypes.c_void_p(8))
+    gl.glBindVertexArray(0)  # unbind the VAO
+
+    # Empty GL texture; filled each frame from the PBO.
+    tex = ctypes.c_uint(0)
+    gl.glGenTextures(1, ctypes.byref(tex))
+    gl.glBindTexture(gl.GL_TEXTURE_2D, tex.value)
+    # Use nearest filtering on the display texture so the example's own
+    # POINT/LINEAR comparison is not muddied by GL's sampler.
+    gl.glTexParameteri(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_MIN_FILTER, gl.GL_NEAREST)
+    gl.glTexParameteri(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_MAG_FILTER, gl.GL_NEAREST)
+    gl.glTexImage2D(
+        gl.GL_TEXTURE_2D,
+        0,
+        gl.GL_RGBA8,
+        width,
+        height,
+        0,
+        gl.GL_RGBA,
+        gl.GL_UNSIGNED_BYTE,
+        None,  # no initial pixel data is supplied
+    )
+    return shader_prog, vao.value, tex.value
+
+
+def create_pixel_buffer(gl, width, height):
+    """Allocate a GL pixel-unpack buffer for one RGBA8 frame; return (id, nbytes)."""
+    buffer_id = ctypes.c_uint(0)
+    gl.glGenBuffers(1, ctypes.byref(buffer_id))
+    size_bytes = width * height * 4  # 4 bytes per RGBA8 pixel
+    gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, buffer_id.value)
+    gl.glBufferData(gl.GL_PIXEL_UNPACK_BUFFER, size_bytes, None, gl.GL_DYNAMIC_DRAW)
+    gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, 0)
+    return buffer_id.value, size_bytes
+
+
+def copy_pbo_to_texture(gl, pbo_id, tex_id, width, height):
+    """Copy a width x height RGBA8 frame from the PBO into the GL texture (GPU-to-GPU)."""
+    gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, pbo_id)  # source of the upload
+    gl.glBindTexture(gl.GL_TEXTURE_2D, tex_id)
+    gl.glTexSubImage2D(
+        gl.GL_TEXTURE_2D, 0, 0, 0, width, height,
+        gl.GL_RGBA, gl.GL_UNSIGNED_BYTE, None,  # None = offset 0 into the bound PBO
+    )
+    gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, 0)  # unbind the PBO
+
+
+def draw_fullscreen_quad(gl, shader_prog, vao_id, tex_id):
+    """Draw texture ``tex_id`` with ``shader_prog`` using the quad stored in ``vao_id``."""
+    gl.glUseProgram(shader_prog.id)
+    gl.glBindTexture(gl.GL_TEXTURE_2D, tex_id)
+    gl.glBindVertexArray(vao_id)
+    gl.glDrawArrays(gl.GL_TRIANGLES, 0, 6)  # 6 vertices = two triangles
+    gl.glBindVertexArray(0)  # unbind the VAO
+    gl.glUseProgram(0)  # unbind the shader program
+
+
+# ================================== main() ==================================
+
+
+def main():
+    """Run the interactive POINT vs LINEAR texture-filter demo.
+
+    Blocks in the pyglet event loop until the window is closed.
+    """
+    # --- Step 1: Set up CUDA (compile kernel, create stream) ---
+    dev, stream, kernel, config = setup_cuda(KERNEL_SOURCE)
+
+    # The hardware-texture path needs at least compute capability 3.x
+    # (it's available essentially everywhere modern, but check anyway so the
+    # failure is friendly).
+    if dev.compute_capability.major < 3:
+        print(
+            f"This example requires compute capability >= 3.0, "
+            f"got {dev.compute_capability.major}.{dev.compute_capability.minor}.",
+            file=sys.stderr,
+        )
+        sys.exit(1)
+
+    # --- Step 2: Open a window ---
+    window, gl, pyglet = create_window()
+
+    # --- Step 3: Create GL resources (shader, quad, display texture) ---
+    shader_prog, quad_vao, tex_id = create_display_resources(gl, WIDTH, HEIGHT)
+
+    # --- Step 4: Create the Pixel Buffer Object (PBO) ---
+    pbo_id, _nbytes = create_pixel_buffer(gl, WIDTH, HEIGHT)
+
+    # --- Step 5: Register the PBO with CUDA ---
+    resource = GraphicsResource.from_gl_buffer(pbo_id, flags="write_discard")
+
+    # --- Step 6: Allocate the source Array and upload the test pattern ---
+    #     The Array lives for the entire program, so we use a `with` block.
+    #     Inside it we create / re-create two TextureObjects whenever the
+    #     user cycles the address mode.
+    with Array.from_descriptor(
+        shape=(SRC_W, SRC_H),
+        format=ArrayFormat.UINT8,
+        num_channels=4,
+    ) as arr:
+        pattern = make_pattern(SRC_W, SRC_H)
+        # Sanity: 256 * 256 * 4 bytes = 262144.
+        assert pattern.nbytes == arr.size_bytes, (
+            f"pattern bytes ({pattern.nbytes}) != array bytes ({arr.size_bytes})"
+        )
+        arr.copy_from(pattern, stream=stream)
+        stream.sync()  # upload must finish before kernel reads
+
+        # --- Step 7: Build initial POINT + LINEAR textures (WRAP mode). ---
+        # We can't use a `with` block here because the address mode is baked
+        # into the descriptor at creation time: cycling modes means closing
+        # and recreating these objects.  We instead hold them in mutable
+        # closure state and release them in on_close().
+        tex_state = {
+            "mode_idx": 0,
+            "tex_point": None,
+            "tex_linear": None,
+        }
+
+        def close_textures():
+            # Kernels launched by on_draw() may still be sampling these
+            # textures, so drain the stream before destroying them.
+            stream.sync()
+            for key in ("tex_linear", "tex_point"):
+                if tex_state[key] is not None:
+                    tex_state[key].close()
+                    tex_state[key] = None
+
+        def rebuild_textures():
+            close_textures()
+            mode = ADDRESS_MODES[tex_state["mode_idx"]]
+            tex_state["tex_point"], tex_state["tex_linear"] = make_textures(arr, mode)
+
+        rebuild_textures()
+
+        # --- Step 8: View state (zoom + pan), tight initial framing. ---
+        # zoom = pixels_per_texel.  zoom=3 -> roughly 3x magnification, which
+        # makes POINT vs LINEAR obvious without any user input.
+        view = {
+            "zoom": 3.0,
+            "pan_x": SRC_W * 0.5,
+            "pan_y": SRC_H * 0.5,
+            "drag": False,
+        }
+
+        def reset_view():
+            view["zoom"] = 3.0
+            view["pan_x"] = SRC_W * 0.5
+            view["pan_y"] = SRC_H * 0.5
+
+        # --- Step 9: Render loop ---
+        frame_count = 0
+        fps_time = time.monotonic()
+
+        def current_mode_name():
+            return ADDRESS_MODES[tex_state["mode_idx"]].name
+
+        @window.event
+        def on_draw():
+            nonlocal frame_count, fps_time
+            window.clear()
+
+            # (a) Map the PBO so CUDA can write to it.
+            with resource.map(stream=stream) as buf:
+                # (b) Launch the split-screen sampling kernel.
+                launch(
+                    stream,
+                    config,
+                    kernel,
+                    np.uint64(tex_state["tex_point"].handle),
+                    np.uint64(tex_state["tex_linear"].handle),
+                    buf.handle,
+                    np.int32(WIDTH),
+                    np.int32(HEIGHT),
+                    np.float32(view["zoom"]),
+                    np.float32(view["pan_x"]),
+                    np.float32(view["pan_y"]),
+                    np.int32(SRC_W),
+                    np.int32(SRC_H),
+                )
+            # (c) Unmap happens automatically when the `with` block exits.
+
+            # (d) PBO -> GL texture (GPU-to-GPU).
+            copy_pbo_to_texture(gl, pbo_id, tex_id, WIDTH, HEIGHT)
+
+            # (e) Draw the texture to the screen.
+            draw_fullscreen_quad(gl, shader_prog, quad_vao, tex_id)
+
+            frame_count += 1
+            now = time.monotonic()
+            if now - fps_time >= 1.0:
+                fps = frame_count / (now - fps_time)
+                window.set_caption(
+                    f"TextureObject Filter - POINT | LINEAR  "
+                    f"[address={current_mode_name()}, zoom={view['zoom']:.2f}x, "
+                    f"{fps:.0f} FPS]"
+                )
+                frame_count = 0
+                fps_time = now
+
+        # --- Mouse: drag to pan, scroll to zoom ------------------------------
+        @window.event
+        def on_mouse_press(x, y, button, modifiers):
+            if button == pyglet.window.mouse.LEFT:
+                view["drag"] = True
+
+        @window.event
+        def on_mouse_release(x, y, button, modifiers):
+            if button == pyglet.window.mouse.LEFT:
+                view["drag"] = False
+
+        @window.event
+        def on_mouse_drag(x, y, dx, dy, buttons, modifiers):
+            if not (buttons & pyglet.window.mouse.LEFT):
+                return
+            # Pyglet dy is screen-up-positive; texture y is texel-down-positive.
+            # One screen pixel = 1/zoom texels in source space.
+            view["pan_x"] -= dx / view["zoom"]
+            view["pan_y"] += dy / view["zoom"]
+
+        @window.event
+        def on_mouse_scroll(x, y, scroll_x, scroll_y):
+            # Geometric zoom; clamp to a sensible range.
+            factor = 1.1 ** scroll_y
+            new_zoom = view["zoom"] * factor
+            view["zoom"] = max(0.1, min(32.0, new_zoom))
+
+        # --- Keyboard: M cycles address mode, R resets view ------------------
+        @window.event
+        def on_key_press(symbol, modifiers):
+            key = pyglet.window.key
+            if symbol == key.M:
+                tex_state["mode_idx"] = (tex_state["mode_idx"] + 1) % len(ADDRESS_MODES)
+                rebuild_textures()
+            elif symbol == key.R:
+                reset_view()
+            elif symbol == key.ESCAPE:
+                window.close()
+
+        @window.event
+        def on_close():
+            # Release CUDA resources in reverse order of creation.
+            close_textures()
+            resource.close()
+
+        pyglet.app.run(interval=0)
+
+
+# ======================== GPU code (CUDA + GLSL) ============================
+#
+# KERNEL_SOURCE samples the same source Array through two TextureObjects
+# (POINT vs LINEAR) and writes RGBA8 pixels into the PBO.  ReadMode.
+# NORMALIZED_FLOAT means tex2D<float4>() returns each channel in [0, 1];
+# the kernel scales by 255 and writes unsigned bytes back out.
+#
+# VERTEX_SHADER_SOURCE / FRAGMENT_SHADER_SOURCE are plain GLSL that draws
+# a texture on a fullscreen quad -- nothing CUDA-specific.
+# ============================================================================
+
+KERNEL_SOURCE = r"""
+extern "C" __global__
+void split_screen_sample(cudaTextureObject_t point_tex,
+                         cudaTextureObject_t linear_tex,
+                         unsigned char* out,
+                         int w, int h,
+                         float zoom,
+                         float pan_x, float pan_y,
+                         int src_w, int src_h) {
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    if (x >= w || y >= h) return;
+
+    int half_w = w / 2;
+
+    // 2-pixel-wide white separator down the middle.
+    if (x == half_w || x == half_w - 1) {
+        int idx = (y * w + x) * 4;
+        out[idx + 0] = 255;
+        out[idx + 1] = 255;
+        out[idx + 2] = 255;
+        out[idx + 3] = 255;
+        return;
+    }
+
+    // Each half of the screen samples the same (src_x, src_y) so the two
+    // sides line up visually for an apples-to-apples filter comparison.
+    float local_x = (x < half_w) ? (float)x : (float)(x - half_w);
+
+    // Non-normalized coords: (i + 0.5, j + 0.5) selects texel (i, j).  PBO row
+    // 0 is the bottom of the window, so y is negated to keep texel rows upright.
+    float src_x = pan_x + (local_x - (float)half_w * 0.5f) / zoom;
+    float src_y = pan_y + ((float)h * 0.5f - (float)y) / zoom;
+
+    float4 sample;
+    if (x < half_w) {
+        sample = tex2D<float4>(point_tex,  src_x, src_y);
+    } else {
+        sample = tex2D<float4>(linear_tex, src_x, src_y);
+    }
+
+    int idx = (y * w + x) * 4;
+    out[idx + 0] = (unsigned char)(sample.x * 255.0f + 0.5f);  // round to nearest
+    out[idx + 1] = (unsigned char)(sample.y * 255.0f + 0.5f);
+    out[idx + 2] = (unsigned char)(sample.z * 255.0f + 0.5f);
+    out[idx + 3] = (unsigned char)(sample.w * 255.0f + 0.5f);
+}
+"""
+
+VERTEX_SHADER_SOURCE = """#version 330 core
+in vec2 position;
+in vec2 texcoord;
+out vec2 v_texcoord;
+void main() {
+    gl_Position = vec4(position, 0.0, 1.0);
+    v_texcoord = texcoord;
+}
+"""
+
+FRAGMENT_SHADER_SOURCE = """#version 330 core
+in vec2 v_texcoord;
+out vec4 fragColor;
+uniform sampler2D tex;
+void main() {
+    fragColor = texture(tex, v_texcoord);
+}
+"""
+
+
+if __name__ == "__main__":
+    main()
diff --git a/cuda_core/tests/example_tests/test_basic_examples.py b/cuda_core/tests/example_tests/test_basic_examples.py
index 31b9f86e0a1..e1666114cc9 100644
--- a/cuda_core/tests/example_tests/test_basic_examples.py
+++ b/cuda_core/tests/example_tests/test_basic_examples.py
@@ -82,6 +82,15 @@ def has_recent_memory_pool_support() -> bool:
 SYSTEM_REQUIREMENTS = {
     "memory_pool_resources.py": has_recent_memory_pool_support,
     "gl_interop_plasma.py": has_display,
+    "gl_interop_fire.py": has_display,
+    "gl_interop_image_show.py": has_display,
+    "gl_interop_lenia.py": has_display,
+    "gl_interop_mandelbrot.py": has_display,
+    "gl_interop_mipmap_lod.py": has_display,
+    "gl_interop_ocean.py": has_display,
+    "gl_interop_reaction_diffusion.py": has_display,
+    "gl_interop_sdf_volume.py": has_display,
+    "gl_interop_texture_filter.py": has_display,
     "pytorch_example.py": lambda: (
         has_compute_capability_9_or_higher() and is_x86_64()
     ),  # PyTorch only provides CUDA support for x86_64

From 1432c0a39e4184087b238a2f0263a270ed2c063c Mon Sep 17 00:00:00 2001
From: Rob Parolin <rparolin@nvidia.com>
Date: Tue, 9 Jun 2026 18:01:11 -0700
Subject: [PATCH 11/17] cuda.core: rename Array->CUDAArray,
 surface_load_store->is_surface_load_store (#467)

Applies design decisions resolved in #2188:
- #1: rename public `Array` class to `CUDAArray` (PEP 8 CapWords; aligns with
  CuPy's `cupy.cuda.texture.CUDAarray`). `ArrayFormat` left unchanged (open detail).
- #6: rename the bool property `surface_load_store` -> `is_surface_load_store`
  to follow the repo's `is_<x>` convention. Constructor keyword `surface_load_store=`
  kept as-is (open detail). Private field `_surface_load_store` unchanged.

GL interop examples retained (decision #7 reversed) and updated to the new names.
Verified: cuda.core builds in the cu12 env and the renamed public API imports
(`CUDAArray`, `is_surface_load_store` present; old `Array`/`surface_load_store` gone).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 cuda_core/cuda/core/__init__.py               |  2 +-
 cuda_core/cuda/core/_array.pxd                |  6 +-
 cuda_core/cuda/core/_array.pyx                | 22 ++---
 cuda_core/cuda/core/_mipmapped_array.pyx      | 24 ++---
 cuda_core/cuda/core/_surface.pxd              |  2 +-
 cuda_core/cuda/core/_surface.pyx              | 18 ++--
 cuda_core/cuda/core/_texture.pxd              |  2 +-
 cuda_core/cuda/core/_texture.pyx              | 20 ++---
 cuda_core/docs/source/api.rst                 |  6 +-
 cuda_core/examples/gl_interop_fire.py         | 32 +++----
 cuda_core/examples/gl_interop_image_show.py   | 20 ++---
 cuda_core/examples/gl_interop_lenia.py        | 20 ++---
 cuda_core/examples/gl_interop_mandelbrot.py   | 24 ++---
 cuda_core/examples/gl_interop_mipmap_lod.py   |  6 +-
 cuda_core/examples/gl_interop_ocean.py        | 24 ++---
 .../examples/gl_interop_reaction_diffusion.py | 20 ++---
 cuda_core/examples/gl_interop_sdf_volume.py   | 28 +++---
 .../examples/gl_interop_texture_filter.py     | 22 ++---
 cuda_core/examples/texture_sample.py          | 10 +--
 cuda_core/tests/test_texture_surface.py       | 88 +++++++++----------
 20 files changed, 198 insertions(+), 198 deletions(-)

diff --git a/cuda_core/cuda/core/__init__.py b/cuda_core/cuda/core/__init__.py
index 9769a39977f..c3bb105bc17 100644
--- a/cuda_core/cuda/core/__init__.py
+++ b/cuda_core/cuda/core/__init__.py
@@ -78,7 +78,7 @@ class _PatchedProperty(metaclass=_PatchedPropMeta):
     WorkqueueResource,
     WorkqueueResourceOptions,
 )
-from cuda.core._array import Array, ArrayFormat
+from cuda.core._array import CUDAArray, ArrayFormat
 from cuda.core._mipmapped_array import MipmappedArray
 from cuda.core._texture import (
     AddressMode,
diff --git a/cuda_core/cuda/core/_array.pxd b/cuda_core/cuda/core/_array.pxd
index 73529cac48e..25069a81eb9 100644
--- a/cuda_core/cuda/core/_array.pxd
+++ b/cuda_core/cuda/core/_array.pxd
@@ -6,7 +6,7 @@ from libc.stdint cimport intptr_t
 from cuda.bindings cimport cydriver
 
 
-cdef class Array:
+cdef class CUDAArray:
 
     cdef:
         cydriver.CUarray _handle
@@ -18,8 +18,8 @@ cdef class Array:
         bint _owning
         bint _surface_load_store
         # Optional strong reference to a parent owner (e.g. a MipmappedArray
-        # whose level this Array views). When set, the parent must outlive
-        # this Array because the underlying CUarray belongs to the parent.
+        # whose level this CUDAArray views). When set, the parent must outlive
+        # this CUDAArray because the underlying CUarray belongs to the parent.
         object _parent_ref
 
     cpdef close(self)
diff --git a/cuda_core/cuda/core/_array.pyx b/cuda_core/cuda/core/_array.pyx
index 7d02dcd5d21..36920d61156 100644
--- a/cuda_core/cuda/core/_array.pyx
+++ b/cuda_core/cuda/core/_array.pyx
@@ -21,7 +21,7 @@ import enum
 
 
 class ArrayFormat(enum.IntEnum):
-    """Element format for a :class:`Array` allocation.
+    """Element format for a :class:`CUDAArray` allocation.
 
     Mirrors ``CUarray_format`` from the CUDA driver API.
     """
@@ -49,7 +49,7 @@ _FORMAT_ELEM_SIZE = {
 
 
 cdef void _fill_array_endpoint(
-    cydriver.CUDA_MEMCPY3D* p, Array arr, bint is_src
+    cydriver.CUDA_MEMCPY3D* p, CUDAArray arr, bint is_src
 ) noexcept:
     """Populate the src or dst array fields of a CUDA_MEMCPY3D struct."""
     if is_src:
@@ -156,7 +156,7 @@ cdef int _fill_linear_endpoint(
     )
 
 
-cdef _copy3d(Array arr, object other, object stream, bint to_array):
+cdef _copy3d(CUDAArray arr, object other, object stream, bint to_array):
     """Issue a full-array async 3D memcpy between ``arr`` and ``other``.
 
     Direction is determined by ``to_array``: True copies *into* arr, False
@@ -198,7 +198,7 @@ cdef _copy3d(Array arr, object other, object stream, bint to_array):
             cpython.PyBuffer_Release(&pybuf)
 
 
-cdef class Array:
+cdef class CUDAArray:
     """An opaque, hardware-laid-out GPU allocation for texture/surface access.
 
     Distinct from :class:`Buffer`: a ``CUarray`` has no exposed device pointer
@@ -213,7 +213,7 @@ cdef class Array:
 
     def __init__(self, *args, **kwargs):
         raise RuntimeError(
-            "Array cannot be instantiated directly. Use Array.from_descriptor()."
+            "CUDAArray cannot be instantiated directly. Use CUDAArray.from_descriptor()."
         )
 
     @classmethod
@@ -236,7 +236,7 @@ cdef class Array:
 
         Returns
         -------
-        Array
+        CUDAArray
         """
         if not isinstance(format, ArrayFormat):
             raise TypeError(f"format must be an ArrayFormat, got {type(format).__name__}")
@@ -253,7 +253,7 @@ cdef class Array:
             if dim < 1:
                 raise ValueError(f"shape[{i}] must be >= 1, got {dim}")
 
-        cdef Array self = cls.__new__(cls)
+        cdef CUDAArray self = cls.__new__(cls)
         self._owning = True
         self._shape = shape_t
         self._format = <cydriver.CUarray_format><int>format
@@ -303,7 +303,7 @@ cdef class Array:
         :meth:`close` and ``__dealloc__`` will not free the handle. Shape,
         format, and channel count are queried from the driver.
         """
-        cdef Array self = cls.__new__(cls)
+        cdef CUDAArray self = cls.__new__(cls)
         self._handle = <cydriver.CUarray><void*>handle
         self._owning = owning
         self._context = _get_current_context_ptr()
@@ -357,7 +357,7 @@ cdef class Array:
         return Device(self._device_id)
 
     @property
-    def surface_load_store(self):
+    def is_surface_load_store(self):
         """True if this array was created with ``CUDA_ARRAY3D_SURFACE_LDST``
         and can be bound as a :class:`SurfaceObject`."""
         return self._surface_load_store
@@ -412,7 +412,7 @@ cdef class Array:
         cdef cydriver.CUarray h = self._handle
         cdef bint owning = self._owning
         self._handle = NULL
-        # Drop the parent reference (if any) so a non-owning level Array
+        # Drop the parent reference (if any) so a non-owning level CUDAArray
         # stops pinning its MipmappedArray after close().
         self._parent_ref = None
         if h != NULL and owning:
@@ -433,7 +433,7 @@ cdef class Array:
 
     def __repr__(self):
         return (
-            f"Array(shape={self._shape}, "
+            f"CUDAArray(shape={self._shape}, "
             f"format={ArrayFormat(self._format).name}, "
             f"num_channels={self._num_channels})"
         )
diff --git a/cuda_core/cuda/core/_mipmapped_array.pyx b/cuda_core/cuda/core/_mipmapped_array.pyx
index c149d907f62..a8a308933f0 100644
--- a/cuda_core/cuda/core/_mipmapped_array.pyx
+++ b/cuda_core/cuda/core/_mipmapped_array.pyx
@@ -8,7 +8,7 @@ from libc.stdint cimport intptr_t
 from libc.string cimport memset
 
 from cuda.bindings cimport cydriver
-from cuda.core._array cimport Array
+from cuda.core._array cimport CUDAArray
 from cuda.core._array import ArrayFormat
 from cuda.core._utils.cuda_utils cimport (
     HANDLE_RETURN,
@@ -22,9 +22,9 @@ cdef class MipmappedArray:
 
     Wraps ``CUmipmappedArray``. Each mip level is a distinct, hardware-laid-out
     allocation accessible only via a :class:`TextureObject` (or by retrieving
-    the level's :class:`Array` and binding it as a :class:`SurfaceObject`).
+    the level's :class:`CUDAArray` and binding it as a :class:`SurfaceObject`).
     Destroying the :class:`MipmappedArray` destroys all level arrays
-    implicitly, so the :class:`Array` instances returned by :meth:`get_level`
+    implicitly, so the :class:`CUDAArray` instances returned by :meth:`get_level`
     are non-owning and hold a strong reference back to their parent.
 
     Construct via :meth:`from_descriptor`.
@@ -118,7 +118,7 @@ cdef class MipmappedArray:
         return self
 
     def get_level(self, level):
-        """Return a non-owning :class:`Array` view of the given mip level.
+        """Return a non-owning :class:`CUDAArray` view of the given mip level.
 
         Parameters
         ----------
@@ -127,10 +127,10 @@ cdef class MipmappedArray:
 
         Returns
         -------
-        Array
-            A non-owning :class:`Array` wrapping the level's ``CUarray``.
+        CUDAArray
+            A non-owning :class:`CUDAArray` wrapping the level's ``CUarray``.
             The :class:`MipmappedArray` is kept alive for the lifetime of the
-            returned :class:`Array`; the underlying storage is released only
+            returned :class:`CUDAArray`; the underlying storage is released only
             when this :class:`MipmappedArray` is destroyed.
         """
         lvl = int(level)
@@ -148,13 +148,13 @@ cdef class MipmappedArray:
                 cydriver.cuMipmappedArrayGetLevel(&level_handle, self._handle, c_level)
             )
 
-        # Wrap as a non-owning Array; the level's underlying CUarray belongs
+        # Wrap as a non-owning CUDAArray; the level's underlying CUarray belongs
         # to this MipmappedArray and must not be destroyed independently.
-        arr = Array._from_handle(
+        arr = CUDAArray._from_handle(
             <intptr_t>level_handle, False, device_id=self._device_id
         )
         # Strong ref back to the parent so the mipmap outlives the level view.
-        (<Array>arr)._parent_ref = self
+        (<CUDAArray>arr)._parent_ref = self
         return arr
 
     @property
@@ -183,7 +183,7 @@ cdef class MipmappedArray:
         return int(self._num_levels)
 
     @property
-    def surface_load_store(self):
+    def is_surface_load_store(self):
         """True if this mipmap (and each of its levels) was created with
         ``CUDA_ARRAY3D_SURFACE_LDST`` and can back a :class:`SurfaceObject`."""
         return self._surface_load_store
@@ -197,7 +197,7 @@ cdef class MipmappedArray:
     cpdef close(self):
         """Destroy the underlying ``CUmipmappedArray`` if owned.
 
-        After ``close()`` any level :class:`Array` returned by :meth:`get_level`
+        After ``close()`` any level :class:`CUDAArray` returned by :meth:`get_level`
         becomes invalid; callers must not access them.
         """
         cdef cydriver.CUmipmappedArray h = self._handle
diff --git a/cuda_core/cuda/core/_surface.pxd b/cuda_core/cuda/core/_surface.pxd
index ba7791d5172..dd8548e0a36 100644
--- a/cuda_core/cuda/core/_surface.pxd
+++ b/cuda_core/cuda/core/_surface.pxd
@@ -10,7 +10,7 @@ cdef class SurfaceObject:
 
     cdef:
         cydriver.CUsurfObject _handle
-        object _source_ref      # keep backing Array alive
+        object _source_ref      # keep backing CUDAArray alive
         int _device_id
         intptr_t _context
 
diff --git a/cuda_core/cuda/core/_surface.pyx b/cuda_core/cuda/core/_surface.pyx
index 62cdecc9a01..2fdd43efd74 100644
--- a/cuda_core/cuda/core/_surface.pyx
+++ b/cuda_core/cuda/core/_surface.pyx
@@ -8,7 +8,7 @@ from libc.stdint cimport intptr_t
 from libc.string cimport memset
 
 from cuda.bindings cimport cydriver
-from cuda.core._array cimport Array
+from cuda.core._array cimport CUDAArray
 from cuda.core._texture import ResourceDescriptor
 from cuda.core._utils.cuda_utils cimport (
     HANDLE_RETURN,
@@ -24,7 +24,7 @@ cdef class SurfaceObject:
     has no sampling state (no filtering, no addressing modes, no normalization);
     kernels read and write through it using integer pixel coordinates.
 
-    The backing :class:`Array` must have been created with
+    The backing :class:`CUDAArray` must have been created with
     ``surface_load_store=True`` and is kept alive for the lifetime of this
     object to prevent dangling handles.
 
@@ -40,12 +40,12 @@ cdef class SurfaceObject:
 
     @classmethod
     def from_array(cls, array):
-        """Create a surface object directly from an :class:`Array`.
+        """Create a surface object directly from a :class:`CUDAArray`.
 
         The array must have been created with ``surface_load_store=True``.
         """
-        if not isinstance(array, Array):
-            raise TypeError(f"array must be an Array, got {type(array).__name__}")
+        if not isinstance(array, CUDAArray):
+            raise TypeError(f"array must be an CUDAArray, got {type(array).__name__}")
         return cls.from_descriptor(resource=ResourceDescriptor.from_array(array))
 
     @classmethod
@@ -55,7 +55,7 @@ cdef class SurfaceObject:
         Parameters
         ----------
         resource : ResourceDescriptor
-            Must wrap an :class:`Array` allocated with
+            Must wrap a :class:`CUDAArray` allocated with
             ``surface_load_store=True``. Linear/pitch2d resources are not
             valid surface backings.
         """
@@ -70,10 +70,10 @@ cdef class SurfaceObject:
                 f"got kind={resource.kind!r}"
             )
 
-        cdef Array arr = <Array>resource.source
-        if not arr.surface_load_store:
+        cdef CUDAArray arr = <CUDAArray>resource.source
+        if not arr.is_surface_load_store:
             raise ValueError(
-                "Array must be created with surface_load_store=True to be "
+                "CUDAArray must be created with surface_load_store=True to be "
                 "bound as a SurfaceObject"
             )
 
diff --git a/cuda_core/cuda/core/_texture.pxd b/cuda_core/cuda/core/_texture.pxd
index 4d2d5004069..40725cfe40d 100644
--- a/cuda_core/cuda/core/_texture.pxd
+++ b/cuda_core/cuda/core/_texture.pxd
@@ -10,7 +10,7 @@ cdef class TextureObject:
 
     cdef:
         cydriver.CUtexObject _handle
-        object _source_ref      # keep backing Array (or other resource) alive
+        object _source_ref      # keep backing CUDAArray (or other resource) alive
         object _texture_desc    # original TextureDescriptor for introspection
         int _device_id
         intptr_t _context
diff --git a/cuda_core/cuda/core/_texture.pyx b/cuda_core/cuda/core/_texture.pyx
index 6ccffcadbb1..aeaa2ace4bb 100644
--- a/cuda_core/cuda/core/_texture.pyx
+++ b/cuda_core/cuda/core/_texture.pyx
@@ -8,7 +8,7 @@ from libc.stdint cimport intptr_t
 from libc.string cimport memset
 
 from cuda.bindings cimport cydriver
-from cuda.core._array cimport Array
+from cuda.core._array cimport CUDAArray
 from cuda.core._array import ArrayFormat, _FORMAT_ELEM_SIZE
 from cuda.core._memory._buffer cimport Buffer
 from cuda.core._mipmapped_array cimport MipmappedArray
@@ -63,7 +63,7 @@ class ResourceDescriptor:
 
     Construct via the ``from_*`` classmethods:
 
-    - :meth:`from_array` wraps a :class:`Array` (works for both
+    - :meth:`from_array` wraps a :class:`CUDAArray` (works for both
       :class:`TextureObject` and :class:`SurfaceObject`).
     - :meth:`from_linear` wraps a :class:`Buffer` as a typed 1D fetch. Texture
       objects built from a linear resource do not support filtering,
@@ -72,7 +72,7 @@ class ResourceDescriptor:
       Supports filtering and 2D addressing, but only 2D access.
 
     Linear and pitch2D resources cannot back a :class:`SurfaceObject` — those
-    require an :class:`Array` allocated with ``surface_load_store=True``.
+    require a :class:`CUDAArray` allocated with
     """
 
     __slots__ = (
@@ -90,9 +90,9 @@ class ResourceDescriptor:
 
     @classmethod
     def from_array(cls, array):
-        """Build a resource descriptor backed by a :class:`Array`."""
-        if not isinstance(array, Array):
-            raise TypeError(f"array must be an Array, got {type(array).__name__}")
+        """Build a resource descriptor backed by a :class:`CUDAArray`."""
+        if not isinstance(array, CUDAArray):
+            raise TypeError(f"array must be an CUDAArray, got {type(array).__name__}")
         self = cls.__new__(cls)
         self._kind = "array"
         self._source = array
@@ -110,7 +110,7 @@ class ResourceDescriptor:
 
         Suitable for binding to a :class:`TextureObject` for mipmapped
         sampling. Not valid as a :class:`SurfaceObject` backing: surfaces
-        require a single :class:`Array` level (obtain via
+        require a single :class:`CUDAArray` level (obtain via
         :meth:`MipmappedArray.get_level`).
         """
         if not isinstance(mipmapped_array, _PyMipmappedArray):
@@ -380,7 +380,7 @@ cdef class TextureObject:
     """A bindless texture handle for kernel-side sampled reads.
 
     Wraps ``cuTexObjectCreate``. The underlying memory resource (e.g. the
-    :class:`Array` referenced by the descriptor) is kept alive for the
+    :class:`CUDAArray` referenced by the descriptor) is kept alive for the
     lifetime of this object to prevent dangling handles.
 
     Construct via :meth:`from_descriptor`. Passes to kernels as a 64-bit
@@ -419,12 +419,12 @@ cdef class TextureObject:
         memset(&tex_desc, 0, sizeof(tex_desc))
 
         # --- Resource descriptor ---
-        cdef Array arr
+        cdef CUDAArray arr
         cdef MipmappedArray mip
         cdef Buffer buf
         cdef intptr_t devptr
         if resource.kind == "array":
-            arr = <Array>resource.source
+            arr = <CUDAArray>resource.source
             res_desc.resType = cydriver.CU_RESOURCE_TYPE_ARRAY
             res_desc.res.array.hArray = arr._handle
         elif resource.kind == "mipmapped_array":
diff --git a/cuda_core/docs/source/api.rst b/cuda_core/docs/source/api.rst
index 7c1d33e3393..5d81cdbbdfa 100644
--- a/cuda_core/docs/source/api.rst
+++ b/cuda_core/docs/source/api.rst
@@ -163,8 +163,8 @@ Textures and surfaces
 ---------------------
 
 CUDA arrays back bindless texture and surface objects for kernel-side sampled
-reads and typed load/store. :class:`Array` is allocated through
-:meth:`Array.from_descriptor` and bound through a :class:`ResourceDescriptor`
+reads and typed load/store. :class:`CUDAArray` is allocated through
+:meth:`CUDAArray.from_descriptor` and bound through a :class:`ResourceDescriptor`
 factory; linear (1D) and row-pitched 2D :class:`Buffer` views as well as
 mipmapped allocations (:class:`MipmappedArray`) are also supported as texture
 backings.
@@ -174,7 +174,7 @@ backings.
 
    :template: autosummary/cyclass.rst
 
-   Array
+   CUDAArray
    MipmappedArray
    ResourceDescriptor
    TextureObject
diff --git a/cuda_core/examples/gl_interop_fire.py b/cuda_core/examples/gl_interop_fire.py
index c8f2c9165b6..14e241f561a 100644
--- a/cuda_core/examples/gl_interop_fire.py
+++ b/cuda_core/examples/gl_interop_fire.py
@@ -4,10 +4,10 @@
 
 # ################################################################################
 #
-# This example demonstrates cuda.core.Array, TextureObject, and SurfaceObject
+# This example demonstrates cuda.core.CUDAArray, TextureObject, and SurfaceObject
 # in combination with GraphicsResource for CUDA/OpenGL interop: a classic
 # "Doom-style" procedural fire effect. A scalar heat field lives on a
-# ping-ponged float CUDA Array; each frame the field is advected upward with a
+# ping-ponged float CUDAArray; each frame the field is advected upward with a
 # horizontal jitter and a small decay, then colorized through a 1D fire-palette
 # TextureObject straight into an OpenGL PBO. Requires pyglet.
 #
@@ -15,9 +15,9 @@
 
 # What this example teaches
 # =========================
-# - How to combine a 2D float Array (the heat field) and a 1D RGBA8 Array (the
+# - How to combine a 2D float CUDAArray (the heat field) and a 1D RGBA8 CUDAArray (the
 #   color palette) under the same texture/surface API.
-# - How to ping-pong a scalar field via Array + SurfaceObject writes and
+# - How to ping-pong a scalar field via CUDAArray + SurfaceObject writes and
 #   TextureObject reads, similar to the reaction-diffusion example but with a
 #   single channel.
 # - How to use TextureObject(NORMALIZED_FLOAT) on a UINT8 palette so a
@@ -89,7 +89,7 @@
 
 from cuda.core import (
     AddressMode,
-    Array,
+    CUDAArray,
     ArrayFormat,
     Device,
     FilterMode,
@@ -132,7 +132,7 @@
 # ============================= Helper functions =============================
 #
 # The functions below set up CUDA and OpenGL. If you're here to learn about
-# Array/TextureObject/SurfaceObject, skip ahead to main() -- the interesting
+# CUDAArray/TextureObject/SurfaceObject, skip ahead to main() -- the interesting
 # part is there. These helpers exist so that main() reads like a short story
 # instead of a wall of boilerplate.
 # ============================================================================
@@ -198,7 +198,7 @@ def create_window():
     window = pyglet.window.Window(
         WINDOW_WIDTH,
         WINDOW_HEIGHT,
-        caption="cuda.core Array/Texture/Surface - Doom Fire",
+        caption="cuda.core CUDAArray/Texture/Surface - Doom Fire",
         vsync=False,
     )
     return window, _gl, pyglet
@@ -334,13 +334,13 @@ def make_heat_arrays():
     Intensity is an integer in [0, 36] indexing the canonical Doom palette.
     UINT8 is exactly one byte per texel -- surf2Dwrite x-coord = x * 1.
     """
-    arr_a = Array.from_descriptor(
+    arr_a = CUDAArray.from_descriptor(
         shape=(WIDTH, HEIGHT),
         format=ArrayFormat.UINT8,
         num_channels=1,
         surface_load_store=True,
     )
-    arr_b = Array.from_descriptor(
+    arr_b = CUDAArray.from_descriptor(
         shape=(WIDTH, HEIGHT),
         format=ArrayFormat.UINT8,
         num_channels=1,
@@ -400,18 +400,18 @@ def build_fire_palette():
 
 
 def make_palette_array_and_texture(stream):
-    """Allocate the 1D RGBA8 palette Array, upload, and bind as a texture.
+    """Allocate the 1D RGBA8 palette CUDAArray, upload, and bind as a texture.
 
     Returns (palette_array, palette_texture). Both must be closed by the
     caller (or used inside `with` blocks).
     """
     palette = build_fire_palette()  # shape (PALETTE_SIZE, 4), uint8
-    arr = Array.from_descriptor(
+    arr = CUDAArray.from_descriptor(
         shape=(PALETTE_SIZE,),
         format=ArrayFormat.UINT8,
         num_channels=4,
     )
-    # 1D Array bytes match a flat (PALETTE_SIZE * 4) uint8 buffer.
+    # 1D CUDAArray bytes match a flat (PALETTE_SIZE * 4) uint8 buffer.
     arr.copy_from(np.ascontiguousarray(palette), stream=stream)
 
     res_desc = ResourceDescriptor.from_array(arr)
@@ -453,7 +453,7 @@ def main():
     # --- Step 5: Register the PBO with CUDA ---
     resource = GraphicsResource.from_gl_buffer(pbo_id, flags="write_discard")
 
-    # --- Step 6: Allocate heat-field Arrays, palette Array, and the four
+    # --- Step 6: Allocate heat-field CUDAArrays, palette CUDAArray, and the four
     #             bindless handles (textures + surfaces). We hold them open
     #             for the lifetime of the window and release in on_close(),
     #             matching the reaction-diffusion example. (Using `with`
@@ -466,7 +466,7 @@ def main():
     surf_a = SurfaceObject.from_array(arr_a)
     surf_b = SurfaceObject.from_array(arr_b)
 
-    # The heat field is born zeroed by Array.from_descriptor. No seed pass.
+    # The heat field is born zeroed by CUDAArray.from_descriptor. No seed pass.
     state = {
         "current": "a",            # which array holds the latest heat field
         "frame_index": 0,           # passed into the step kernel as `t`
@@ -484,7 +484,7 @@ def current_read_write():
     def clear_field():
         """Zero both heat arrays and seed the bottom row at full intensity.
 
-        Array.copy_from is the simplest reset path -- a dedicated clear
+        CUDAArray.copy_from is the simplest reset path -- a dedicated clear
         kernel would be faster but is unnecessary for an interactive demo.
         The bottom row is set to MAX_INTENSITY so the very first frame
         already has a fire source to advect from.
@@ -591,7 +591,7 @@ def on_draw():
             fps = frame_count / (now - fps_time)
             ambient_label = "on" if state["ambient"] else "off"
             window.set_caption(
-                "cuda.core Array/Texture/Surface - Doom Fire"
+                "cuda.core CUDAArray/Texture/Surface - Doom Fire"
                 f" ({WIDTH}x{HEIGHT}, {fps:.0f} FPS,"
                 f" ambient {ambient_label})"
             )
diff --git a/cuda_core/examples/gl_interop_image_show.py b/cuda_core/examples/gl_interop_image_show.py
index 4bdd55e1569..53dc3807e28 100644
--- a/cuda_core/examples/gl_interop_image_show.py
+++ b/cuda_core/examples/gl_interop_image_show.py
@@ -6,7 +6,7 @@
 #
 # Minimal "Hello World" for the cuda.core texture/surface stack.
 #
-# Allocates a small `Array`, fills it with a procedural image once, binds it
+# Allocates a small `CUDAArray`, fills it with a procedural image once, binds it
 # as a `TextureObject`, and uses a single CUDA kernel to sample that texture
 # at every screen pixel (with a scale + rotation transform) and write the
 # result into an OpenGL PBO for display.
@@ -18,7 +18,7 @@
 #
 # What this example teaches
 # =========================
-# - Allocate an `Array` and upload data into it with `Array.copy_from`.
+# - Allocate a `CUDAArray` and upload data into it with `CUDAArray.copy_from`.
 # - Build a `TextureObject` from a `ResourceDescriptor` + `TextureDescriptor`.
 # - The visual difference between `FilterMode.POINT` and `FilterMode.LINEAR`
 #   (press F to toggle live).
@@ -29,7 +29,7 @@
 # ============
 #   Startup (once):
 #     +-------------------+   copy_from   +----------+
-#     | host numpy image  | ------------> |  Array   |  (UINT8 RGBA, 64x64)
+#     | host numpy image  | ------------> | CUDAArray|  (UINT8 RGBA, 64x64)
 #     +-------------------+               +----+-----+
 #                                              |
 #                                              v
@@ -63,7 +63,7 @@
 
 from cuda.core import (
     AddressMode,
-    Array,
+    CUDAArray,
     ArrayFormat,
     Device,
     FilterMode,
@@ -83,7 +83,7 @@
 # ---------------------------------------------------------------------------
 WIDTH = 640
 HEIGHT = 480
-IMAGE_SIZE = 64  # the source Array is IMAGE_SIZE x IMAGE_SIZE RGBA8
+IMAGE_SIZE = 64  # the source CUDAArray is IMAGE_SIZE x IMAGE_SIZE RGBA8
 
 
 # ============================= Helper functions =============================
@@ -154,7 +154,7 @@ def create_window():
     window = pyglet.window.Window(
         WIDTH,
         HEIGHT,
-        caption="cuda.core Array + TextureObject - Image Show",
+        caption="cuda.core CUDAArray + TextureObject - Image Show",
         vsync=False,
     )
     return window, _gl, pyglet
@@ -280,8 +280,8 @@ def main():
     pbo_id = create_pixel_buffer(gl, WIDTH, HEIGHT)
     resource = GraphicsResource.from_gl_buffer(pbo_id, flags="write_discard")
 
-    # --- Step 5: Allocate the source `Array` and upload the test pattern ---
-    arr = Array.from_descriptor(
+    # --- Step 5: Allocate the source `CUDAArray` and upload the test pattern ---
+    arr = CUDAArray.from_descriptor(
         shape=(IMAGE_SIZE, IMAGE_SIZE),
         format=ArrayFormat.UINT8,
         num_channels=4,
@@ -290,7 +290,7 @@ def main():
     arr.copy_from(np.ascontiguousarray(host_image), stream=stream)
     stream.sync()
 
-    # --- Step 6: Bind the Array as a TextureObject (initially POINT) ---
+    # --- Step 6: Bind the CUDAArray as a TextureObject (initially POINT) ---
     state = {"filter": FilterMode.POINT, "rotate": False, "angle": 0.0}
     tex = make_texture(arr, state["filter"])
 
@@ -345,7 +345,7 @@ def on_draw():
         if now - fps_time >= 1.0:
             fps = frame_count / (now - fps_time)
             window.set_caption(
-                f"cuda.core Array + TextureObject - Image Show "
+                f"cuda.core CUDAArray + TextureObject - Image Show "
                 f"(filter={state['filter'].name}, "
                 f"rotate={'on' if state['rotate'] else 'off'}, "
                 f"{fps:.0f} FPS)"
diff --git a/cuda_core/examples/gl_interop_lenia.py b/cuda_core/examples/gl_interop_lenia.py
index c1772514a70..4a16689987f 100644
--- a/cuda_core/examples/gl_interop_lenia.py
+++ b/cuda_core/examples/gl_interop_lenia.py
@@ -4,7 +4,7 @@
 
 # ################################################################################
 #
-# This example demonstrates cuda.core.Array, TextureObject, and SurfaceObject
+# This example demonstrates cuda.core.CUDAArray, TextureObject, and SurfaceObject
 # in combination with GraphicsResource for CUDA/OpenGL interop. A Lenia
 # continuous cellular automaton is ping-ponged between two CUDA arrays each
 # frame: a TextureObject provides smooth (LINEAR + WRAP) sampled reads through
@@ -17,10 +17,10 @@
 # What this example teaches
 # =========================
 # - How to drive a wide-radius convolution from a TextureObject configured for
-#   LINEAR + WRAP + normalized coordinates. The same Array is then bound as a
+#   LINEAR + WRAP + normalized coordinates. The same CUDAArray is then bound as a
 #   SurfaceObject for the typed write back, requiring `surface_load_store=True`
 #   at allocation time.
-# - How a single-channel `float` Array differs from the multi-channel layout
+# - How a single-channel `float` CUDAArray differs from the multi-channel layout
 #   used in the Gray-Scott example: `num_channels=1`, `tex2D<float>` reads, and
 #   a 4-byte x-stride in `surf2Dwrite`.
 # - How to host-precompute a normalization constant for a stencil with a
@@ -112,7 +112,7 @@
 
 from cuda.core import (
     AddressMode,
-    Array,
+    CUDAArray,
     ArrayFormat,
     Device,
     FilterMode,
@@ -162,7 +162,7 @@
 # ============================= Helper functions =============================
 #
 # The functions below set up CUDA and OpenGL. If you're here to learn about
-# Array/TextureObject/SurfaceObject, skip ahead to main() -- the interesting
+# CUDAArray/TextureObject/SurfaceObject, skip ahead to main() -- the interesting
 # part is there. These helpers exist so that main() reads like a short story
 # instead of a wall of boilerplate.
 # ============================================================================
@@ -255,7 +255,7 @@ def create_window():
     window = pyglet.window.Window(
         WIDTH,
         HEIGHT,
-        caption="cuda.core Array/Texture/Surface - Lenia",
+        caption="cuda.core CUDAArray/Texture/Surface - Lenia",
         vsync=False,
     )
     return window, _gl, pyglet
@@ -405,16 +405,16 @@ def draw_fullscreen_quad(gl, shader_prog, vao_id, tex_id):
 def make_state_arrays():
     """Allocate the two single-channel `float` ping-pong arrays.
 
-    `surface_load_store=True` is what lets the same Array be bound as both a
+    `surface_load_store=True` is what lets the same CUDAArray be bound as both a
     TextureObject (sampled reads) and a SurfaceObject (typed writes).
     """
-    arr_a = Array.from_descriptor(
+    arr_a = CUDAArray.from_descriptor(
         shape=(WIDTH, HEIGHT),
         format=ArrayFormat.FLOAT32,
         num_channels=1,
         surface_load_store=True,
     )
-    arr_b = Array.from_descriptor(
+    arr_b = CUDAArray.from_descriptor(
         shape=(WIDTH, HEIGHT),
         format=ArrayFormat.FLOAT32,
         num_channels=1,
@@ -596,7 +596,7 @@ def on_draw():
         if now - fps_time >= 1.0:
             fps = frame_count / (now - fps_time)
             window.set_caption(
-                "cuda.core Array/Texture/Surface - Lenia"
+                "cuda.core CUDAArray/Texture/Surface - Lenia"
                 f" ({WIDTH}x{HEIGHT}, R={R}, {fps:.0f} FPS)"
             )
             frame_count = 0
diff --git a/cuda_core/examples/gl_interop_mandelbrot.py b/cuda_core/examples/gl_interop_mandelbrot.py
index 11abca54c22..7b333980c42 100644
--- a/cuda_core/examples/gl_interop_mandelbrot.py
+++ b/cuda_core/examples/gl_interop_mandelbrot.py
@@ -4,7 +4,7 @@
 
 # ################################################################################
 #
-# This example demonstrates cuda.core.Array and TextureObject used as a *color
+# This example demonstrates cuda.core.CUDAArray and TextureObject used as a *color
 # lookup table* (palette LUT) for a real-time Mandelbrot deep-zoom explorer.
 # A CUDA kernel computes smooth iteration counts and uses tex1D<float4> with
 # LINEAR + CLAMP + NORMALIZED_FLOAT sampling to read a 256-entry RGBA palette,
@@ -15,7 +15,7 @@
 
 # What this example teaches
 # =========================
-# - How to use a 1D cuda.core.Array as a palette and bind it via a
+# - How to use a 1D cuda.core.CUDAArray as a palette and bind it via a
 #   TextureObject for hardware-filtered color lookups inside a kernel.
 # - How LINEAR + AddressMode.CLAMP + ReadMode.NORMALIZED_FLOAT + normalized
 #   coordinates give you a free `texture(palette, t)` style sampler that
@@ -29,7 +29,7 @@
 # z = 0; pixels are colored by how quickly z escapes the disk of radius 2.
 #
 #     +---------+   ResourceDescriptor.from_array
-#     |  Array  | --------------------------------+
+#     |CUDAArray| --------------------------------+
 #     | float4  |                                 v
 #     | size 256|                       +-------------------+
 #     +---------+                       |   TextureObject   |
@@ -105,7 +105,7 @@
 
 from cuda.core import (
     AddressMode,
-    Array,
+    CUDAArray,
     ArrayFormat,
     Device,
     FilterMode,
@@ -142,7 +142,7 @@
 # ============================= Helper functions =============================
 #
 # The functions below set up CUDA and OpenGL. If you're here to learn about
-# Array/TextureObject as a palette LUT, skip ahead to main() -- the interesting
+# CUDAArray/TextureObject as a palette LUT, skip ahead to main() -- the interesting
 # part is there. These helpers exist so that main() reads like a short story
 # instead of a wall of boilerplate.
 # ============================================================================
@@ -198,7 +198,7 @@ def create_window():
     window = pyglet.window.Window(
         WIDTH,
         HEIGHT,
-        caption="cuda.core Array/Texture - Mandelbrot Deep Zoom",
+        caption="cuda.core CUDAArray/Texture - Mandelbrot Deep Zoom",
         vsync=False,
     )
     return window, _gl, pyglet
@@ -349,7 +349,7 @@ def build_palette():
     """Build a 256-entry RGBA float32 palette by lerping through color stops.
 
     Returns a flat numpy array of shape (PALETTE_SIZE * 4,) dtype=float32
-    suitable for Array.copy_from(). Each color channel is in [0, 1].
+    suitable for CUDAArray.copy_from(). Each color channel is in [0, 1].
     """
     # Hand-picked stops: deep blue -> cyan -> yellow -> orange -> red ->
     # magenta -> black (the final stop is used by points that hit max_iter
@@ -381,7 +381,7 @@ def build_palette():
         pal[i] = colors[j] + seg * (colors[j + 1] - colors[j])
 
     # Flatten to (PALETTE_SIZE * 4,) so the byte layout matches a
-    # float4 x PALETTE_SIZE 1D Array.
+    # float4 x PALETTE_SIZE 1D CUDAArray.
     return np.ascontiguousarray(pal.reshape(-1), dtype=np.float32)
 
 
@@ -423,12 +423,12 @@ def main():
     resource = GraphicsResource.from_gl_buffer(pbo_id, flags="write_discard")
 
     # --- Step 6: Build and upload the palette LUT ---
-    #     One 1D Array, 256 entries of float4 RGBA. The host-side palette is
+    #     One 1D CUDAArray, 256 entries of float4 RGBA. The host-side palette is
     #     a flat numpy float32 array; copy_from() does an async H2D copy, so
     #     we sync the stream once afterwards to make sure the data has landed
     #     before we start sampling from it in the render loop.
     host_palette = build_palette()
-    palette_arr = Array.from_descriptor(
+    palette_arr = CUDAArray.from_descriptor(
         shape=(PALETTE_SIZE,),
         format=ArrayFormat.FLOAT32,
         num_channels=4,
@@ -436,7 +436,7 @@ def main():
     palette_arr.copy_from(host_palette, stream=stream)
     stream.sync()
 
-    # --- Step 7: Bind the palette Array as a TextureObject (LUT) ---
+    # --- Step 7: Bind the palette CUDAArray as a TextureObject (LUT) ---
     palette_tex = make_palette_texture(palette_arr)
 
     # --- Step 8: Render loop ---
@@ -550,7 +550,7 @@ def on_draw():
             fps = frame_count / (now - fps_time)
             zoom = 1.0 / view["scale"] if view["scale"] > 0 else 0.0
             window.set_caption(
-                "cuda.core Array/Texture - Mandelbrot"
+                "cuda.core CUDAArray/Texture - Mandelbrot"
                 f" | zoom {zoom:.3e}x"
                 f" | center ({view['cx']:.6f}, {view['cy']:.6f})"
                 f" | iter {view['max_iter']}"
diff --git a/cuda_core/examples/gl_interop_mipmap_lod.py b/cuda_core/examples/gl_interop_mipmap_lod.py
index 38b09513464..a5c6f55cf7c 100644
--- a/cuda_core/examples/gl_interop_mipmap_lod.py
+++ b/cuda_core/examples/gl_interop_mipmap_lod.py
@@ -45,7 +45,7 @@
 #         |      +---- one SurfaceObject per level, used at BUILD time only
 #         |            to let a kernel write pixels into that level.
 #         |
-#         +----------- get_level(L) returns a NON-OWNING Array view of level L;
+#         +----------- get_level(L) returns a NON-OWNING CUDAArray view of level L;
 #                      the storage belongs to the parent MipmappedArray.
 #
 #   STARTUP -- one-time mipmap build
@@ -94,7 +94,7 @@
 
 from cuda.core import (
     AddressMode,
-    Array,
+    CUDAArray,
     ArrayFormat,
     Device,
     FilterMode,
@@ -562,7 +562,7 @@ def on_close():
 // --------------------------------------------------------------------------
 // seed_base: write a procedural high-frequency pattern to level 0.
 //
-// surf is a SurfaceObject bound to the level-0 Array (float4 RGBA). The
+// surf is a SurfaceObject bound to the level-0 CUDAArray (float4 RGBA). The
 // pattern is a colorful blend of concentric rings, a diagonal grid, and a
 // radial sweep, designed to have plenty of fine detail so the difference
 // between mip levels is visually obvious.
diff --git a/cuda_core/examples/gl_interop_ocean.py b/cuda_core/examples/gl_interop_ocean.py
index 177e7b8d320..aaea9cd88aa 100644
--- a/cuda_core/examples/gl_interop_ocean.py
+++ b/cuda_core/examples/gl_interop_ocean.py
@@ -4,9 +4,9 @@
 
 # ################################################################################
 #
-# This example demonstrates cuda.core.Array, TextureObject, and SurfaceObject
+# This example demonstrates cuda.core.CUDAArray, TextureObject, and SurfaceObject
 # in combination with GraphicsResource for CUDA/OpenGL interop. A real-time
-# Gerstner-wave ocean is rebuilt every frame: a heightmap Array is rewritten
+# Gerstner-wave ocean is rebuilt every frame: a heightmap CUDAArray is rewritten
 # through a SurfaceObject, sampled through a TextureObject with LINEAR + WRAP
 # filtering for normal estimation, and shaded with Phong + Fresnel sky
 # reflection straight into an OpenGL PBO. Requires pyglet.
@@ -15,13 +15,13 @@
 
 # What this example teaches
 # =========================
-# - How to use a CUDA Array as a typed heightmap that is simultaneously
+# - How to use a CUDAArray as a typed heightmap that is simultaneously
 #   written by one kernel (via SurfaceObject) and sampled by another (via
 #   TextureObject) within the same frame.
 # - How LINEAR filtering + WRAP addressing + normalized coordinates gives
 #   essentially-free bilinear neighbor lookups for finite-difference normal
 #   estimation on a tiling heightmap.
-# - How to compose Array/TextureObject/SurfaceObject with GraphicsResource so
+# - How to compose CUDAArray/TextureObject/SurfaceObject with GraphicsResource so
 #   the entire render path never leaves the GPU.
 #
 # How it works
@@ -41,7 +41,7 @@
 #   ~~~~~~~~~~~~~~~~~~~~~~
 #   +-----------------+   surf2Dwrite   +--------------+
 #   |   update_height | --------------> |  heightmap   |
-#   |     kernel      |                 |    Array     |
+#   |     kernel      |                 |  CUDAArray   |
 #   +-----------------+                 |  (FLOAT32)   |
 #                                       +--------------+
 #                                              |
@@ -88,7 +88,7 @@
 
 from cuda.core import (
     AddressMode,
-    Array,
+    CUDAArray,
     ArrayFormat,
     Device,
     FilterMode,
@@ -134,7 +134,7 @@
 # ============================= Helper functions =============================
 #
 # The functions below set up CUDA and OpenGL. If you're here to learn about
-# Array/TextureObject/SurfaceObject, skip ahead to main() -- the interesting
+# CUDAArray/TextureObject/SurfaceObject, skip ahead to main() -- the interesting
 # part is there. These helpers exist so that main() reads like a short story
 # instead of a wall of boilerplate.
 # ============================================================================
@@ -209,7 +209,7 @@ def create_window():
     window = pyglet.window.Window(
         WIDTH,
         HEIGHT,
-        caption="cuda.core Array/Texture/Surface - Gerstner Ocean",
+        caption="cuda.core CUDAArray/Texture/Surface - Gerstner Ocean",
         vsync=False,
     )
     return window, _gl, pyglet
@@ -309,8 +309,8 @@ def draw_fullscreen_quad(gl, shader_prog, vao_id, tex_id):
 
 
 def make_heightmap_array():
-    """Allocate the single-channel float heightmap Array."""
-    return Array.from_descriptor(
+    """Allocate the single-channel float heightmap CUDAArray."""
+    return CUDAArray.from_descriptor(
         shape=(GRID, GRID),
         format=ArrayFormat.FLOAT32,
         num_channels=1,
@@ -367,7 +367,7 @@ def main():
     # --- Step 5: Register the PBO with CUDA ---
     resource = GraphicsResource.from_gl_buffer(pbo_id, flags="write_discard")
 
-    # --- Step 6: Allocate the heightmap Array and build its texture/surface ---
+    # --- Step 6: Allocate the heightmap CUDAArray and build its texture/surface ---
     #     We pre-create both the TextureObject (read path) and the
     #     SurfaceObject (write path) once and reuse them every frame. Creating
     #     them inside the per-frame loop would work but adds per-frame overhead
@@ -457,7 +457,7 @@ def on_draw():
             label = PRESETS[state["preset"]][2]
             paused = " [paused]" if state["paused"] else ""
             window.set_caption(
-                "cuda.core Array/Texture/Surface - Gerstner Ocean"
+                "cuda.core CUDAArray/Texture/Surface - Gerstner Ocean"
                 f" [{label}]{paused} ({WIDTH}x{HEIGHT}, {fps:.0f} FPS)"
             )
             frame_count = 0
diff --git a/cuda_core/examples/gl_interop_reaction_diffusion.py b/cuda_core/examples/gl_interop_reaction_diffusion.py
index b30603721a1..12a59b9be03 100644
--- a/cuda_core/examples/gl_interop_reaction_diffusion.py
+++ b/cuda_core/examples/gl_interop_reaction_diffusion.py
@@ -4,7 +4,7 @@
 
 # ################################################################################
 #
-# This example demonstrates cuda.core.Array, TextureObject, and SurfaceObject
+# This example demonstrates cuda.core.CUDAArray, TextureObject, and SurfaceObject
 # in combination with GraphicsResource for CUDA/OpenGL interop. A Gray-Scott
 # reaction-diffusion simulation is ping-ponged between two CUDA arrays each
 # frame: a TextureObject provides smooth (LINEAR + WRAP) sampled reads, and a
@@ -15,12 +15,12 @@
 
 # What this example teaches
 # =========================
-# - How to allocate a CUDA Array with `surface_load_store=True` so the same
+# - How to allocate a CUDAArray with `surface_load_store=True` so the same
 #   memory can be bound as both a TextureObject (for sampled reads) and a
 #   SurfaceObject (for typed writes).
 # - How to use FilterMode.LINEAR + AddressMode.WRAP + normalized coordinates
 #   to get free hardware bilinear interpolation on a toroidal world.
-# - How to compose Array/TextureObject/SurfaceObject with GraphicsResource so
+# - How to compose CUDAArray/TextureObject/SurfaceObject with GraphicsResource so
 #   the entire simulation never leaves the GPU.
 #
 # How it works
@@ -33,7 +33,7 @@
 #
 # Different choices of F and k yield strikingly different patterns: coral,
 # mitosis, spots, and many more. We pack (U, V) into the two channels of a
-# `float2` Array.
+# `float2` CUDAArray.
 #
 #   PING-PONG (two arrays, swap each step)
 #   ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -89,7 +89,7 @@
 
 from cuda.core import (
     AddressMode,
-    Array,
+    CUDAArray,
     ArrayFormat,
     Device,
     FilterMode,
@@ -128,7 +128,7 @@
 # ============================= Helper functions =============================
 #
 # The functions below set up CUDA and OpenGL. If you're here to learn about
-# Array/TextureObject/SurfaceObject, skip ahead to main() -- the interesting
+# CUDAArray/TextureObject/SurfaceObject, skip ahead to main() -- the interesting
 # part is there. These helpers exist so that main() reads like a short story
 # instead of a wall of boilerplate.
 # ============================================================================
@@ -198,7 +198,7 @@ def create_window():
     window = pyglet.window.Window(
         WIDTH,
         HEIGHT,
-        caption="cuda.core Array/Texture/Surface - Gray-Scott Reaction Diffusion",
+        caption="cuda.core CUDAArray/Texture/Surface - Gray-Scott Reaction Diffusion",
         vsync=False,
     )
     return window, _gl, pyglet
@@ -347,13 +347,13 @@ def draw_fullscreen_quad(gl, shader_prog, vao_id, tex_id):
 
 def make_state_arrays():
     """Allocate the two `float2` ping-pong arrays that hold the (U, V) state."""
-    arr_a = Array.from_descriptor(
+    arr_a = CUDAArray.from_descriptor(
         shape=(WIDTH, HEIGHT),
         format=ArrayFormat.FLOAT32,
         num_channels=2,
         surface_load_store=True,
     )
-    arr_b = Array.from_descriptor(
+    arr_b = CUDAArray.from_descriptor(
         shape=(WIDTH, HEIGHT),
         format=ArrayFormat.FLOAT32,
         num_channels=2,
@@ -520,7 +520,7 @@ def on_draw():
             fps = frame_count / (now - fps_time)
             label = PRESETS[state["preset"]][2]
             window.set_caption(
-                "cuda.core Array/Texture/Surface - Gray-Scott"
+                "cuda.core CUDAArray/Texture/Surface - Gray-Scott"
                 f" [{label}] ({WIDTH}x{HEIGHT}, {fps:.0f} FPS,"
                 f" {N_STEPS} steps/frame)"
             )
diff --git a/cuda_core/examples/gl_interop_sdf_volume.py b/cuda_core/examples/gl_interop_sdf_volume.py
index 05299cc278f..75c3b6518f2 100644
--- a/cuda_core/examples/gl_interop_sdf_volume.py
+++ b/cuda_core/examples/gl_interop_sdf_volume.py
@@ -4,7 +4,7 @@
 
 # ################################################################################
 #
-# This example demonstrates cuda.core's 3D Array + trilinear TextureObject by
+# This example demonstrates cuda.core's 3D CUDAArray + trilinear TextureObject by
 # baking a procedural Signed Distance Field (SDF) volume once at startup and
 # then ray-marching it every frame to render an orbitable 3D scene. The
 # SurfaceObject is used during the one-shot bake; the TextureObject (with
@@ -15,7 +15,7 @@
 
 # What this example teaches
 # =========================
-# - How to allocate a 3D cuda.core.Array (cuArray3DCreate under the hood) and
+# - How to allocate a 3D cuda.core.CUDAArray (cuArray3DCreate under the hood) and
 #   bind it as both a SurfaceObject (for one-shot kernel writes) and a
 #   TextureObject (for hardware-accelerated trilinear sampling).
 # - How to ray-march a baked SDF volume from a CUDA kernel, sampling via
@@ -44,10 +44,10 @@
 #
 #   STARTUP (one-shot bake)
 #   ~~~~~~~~~~~~~~~~~~~~~~~
-#   1. Allocate 3D Array (128^3, FLOAT32 x1, surface_load_store=True).
+#   1. Allocate 3D CUDAArray (128^3, FLOAT32 x1, surface_load_store=True).
 #   2. Bind it as a SurfaceObject.
 #   3. Launch `bake_sdf`: one thread per voxel writes the SDF via surf3Dwrite.
-#   4. Close the SurfaceObject; the Array stays alive.
+#   4. Close the SurfaceObject; the CUDAArray stays alive.
 #
 #   EACH FRAME
 #   ~~~~~~~~~~
@@ -80,7 +80,7 @@
 
 from cuda.core import (
     AddressMode,
-    Array,
+    CUDAArray,
     ArrayFormat,
     Device,
     FilterMode,
@@ -116,7 +116,7 @@
 # ============================= Helper functions =============================
 #
 # The functions below set up CUDA and OpenGL. If you're here to learn about
-# 3D Array / TextureObject / SurfaceObject, skip ahead to main() -- the
+# 3D CUDAArray / TextureObject / SurfaceObject, skip ahead to main() -- the
 # interesting part is there. These helpers exist so that main() reads like a
 # short story instead of a wall of boilerplate.
 # ============================================================================
@@ -159,7 +159,7 @@ def setup_cuda():
 
 def make_volume_array():
     """Allocate the 3D SDF volume. Single-channel float, surface-capable."""
-    return Array.from_descriptor(
+    return CUDAArray.from_descriptor(
         shape=(VOLUME_SIZE, VOLUME_SIZE, VOLUME_SIZE),
         format=ArrayFormat.FLOAT32,
         num_channels=1,
@@ -191,7 +191,7 @@ def bake_volume(stream, kernels, arr):
     The SurfaceObject lives only for the duration of this call; once the bake
     is enqueued and the kernel has captured the bindless handle into its
     arguments, we sync the stream before letting the SurfaceObject close.
-    The Array itself outlives this scope -- it's the long-lived backing store
+    The CUDAArray itself outlives this scope -- it's the long-lived backing store
     for the render-loop TextureObject.
     """
     with SurfaceObject.from_array(arr) as bake_surf:
@@ -228,7 +228,7 @@ def create_window():
     window = pyglet.window.Window(
         WIDTH,
         HEIGHT,
-        caption="cuda.core 3D Array - SDF Volume Ray-Marcher",
+        caption="cuda.core 3D CUDAArray - SDF Volume Ray-Marcher",
         vsync=False,
     )
     return window, _gl, pyglet
@@ -354,9 +354,9 @@ def main():
     dev, stream, kernels = setup_cuda()
 
     # --- Step 2: Allocate the 3D SDF volume and bake it once ---
-    #     The Array is the long-lived backing store; it must outlive the
+    #     The CUDAArray is the long-lived backing store; it must outlive the
     #     render loop. The SurfaceObject is only needed for the one-shot bake
-    #     and is closed before we ever bind a TextureObject to the same Array.
+    #     and is closed before we ever bind a TextureObject to the same CUDAArray.
     arr = make_volume_array()
     bake_volume(stream, kernels, arr)
 
@@ -429,7 +429,7 @@ def on_draw():
             frame_count[0] = 0
             fps_time[0] = now
             window.set_caption(
-                "cuda.core 3D Array - SDF Volume Ray-Marcher  "
+                "cuda.core 3D CUDAArray - SDF Volume Ray-Marcher  "
                 f"yaw={cam['yaw']:+.2f} pitch={cam['pitch']:+.2f} "
                 f"dist={cam['dist']:.2f}  "
                 f"{last_fps[0]:.0f} FPS  {last_frame_ms[0]:.2f} ms/frame"
@@ -527,9 +527,9 @@ def on_close():
 
 // --------------------------------------------------------------------------
 // bake_sdf: one thread per voxel writes the SDF of a gyroid-intersect-sphere
-//           into a single-channel float 3D Array via a SurfaceObject.
+//           into a single-channel float 3D CUDAArray via a SurfaceObject.
 //
-//   surf is bound to a (size^3, FLOAT32 x 1) Array allocated with
+//   surf is bound to a (size^3, FLOAT32 x 1) CUDAArray allocated with
 //   surface_load_store=True.
 //   surf3Dwrite's x coordinate is in BYTES (multiply by sizeof(float));
 //   y and z are in elements. Off-by-one on the byte conversion silently
diff --git a/cuda_core/examples/gl_interop_texture_filter.py b/cuda_core/examples/gl_interop_texture_filter.py
index 82c880a8943..aafe0e0d4c1 100644
--- a/cuda_core/examples/gl_interop_texture_filter.py
+++ b/cuda_core/examples/gl_interop_texture_filter.py
@@ -6,13 +6,13 @@
 #
 # This example demonstrates cuda.core.TextureObject hardware filtering by
 # comparing FilterMode.POINT and FilterMode.LINEAR side by side on the same
-# source CUDA Array. Requires pyglet.
+# source CUDAArray. Requires pyglet.
 #
 # ################################################################################
 
 # What this example teaches
 # =========================
-# How to back two TextureObjects with the SAME CUDA Array and observe the
+# How to back two TextureObjects with the SAME CUDAArray and observe the
 # difference between POINT (nearest-texel) and LINEAR (bilinear) filtering
 # under user-controlled zoom and pan.  Also shows how the address mode
 # (WRAP / CLAMP / MIRROR / BORDER) is baked into the texture descriptor at
@@ -20,11 +20,11 @@
 #
 # How it works
 # ============
-# A single 256x256 RGBA8 Array holds a procedurally-generated test pattern
+# A single 256x256 RGBA8 CUDAArray holds a procedurally-generated test pattern
 # (high-contrast checkerboard, diagonals, gradient stripe).  Two
-# TextureObjects are built on top of that Array:
+# TextureObjects are built on top of that CUDAArray:
 #
-#       Array (256x256 RGBA UINT8)
+#       CUDAArray (256x256 RGBA UINT8)
 #       /                       \
 #   tex_point                  tex_linear
 #   FilterMode.POINT           FilterMode.LINEAR
@@ -68,7 +68,7 @@
 
 from cuda.core import (
     AddressMode,
-    Array,
+    CUDAArray,
     ArrayFormat,
     Device,
     FilterMode,
@@ -158,7 +158,7 @@ def fill_rect(y0, y1, x0, x1, rgba):
 
 
 def make_textures(array, address_mode):
-    """Build (tex_point, tex_linear) on the given Array with the given mode.
+    """Build (tex_point, tex_linear) on the given CUDAArray with the given mode.
 
     The address mode is baked into the descriptor at cuTexObjectCreate time, so
     we recreate both textures whenever the user cycles the mode.  Caller owns
@@ -362,11 +362,11 @@ def main():
     # --- Step 5: Register the PBO with CUDA ---
     resource = GraphicsResource.from_gl_buffer(pbo_id, flags="write_discard")
 
-    # --- Step 6: Allocate the source Array and upload the test pattern ---
-    #     The Array lives for the entire program, so we use a `with` block.
+    # --- Step 6: Allocate the source CUDAArray and upload the test pattern ---
+    #     The CUDAArray lives for the entire program, so we use a `with` block.
     #     Inside it we create / re-create two TextureObjects whenever the
     #     user cycles the address mode.
-    with Array.from_descriptor(
+    with CUDAArray.from_descriptor(
         shape=(SRC_W, SRC_H),
         format=ArrayFormat.UINT8,
         num_channels=4,
@@ -525,7 +525,7 @@ def on_close():
 
 # ======================== GPU code (CUDA + GLSL) ============================
 #
-# KERNEL_SOURCE samples the same source Array through two TextureObjects
+# KERNEL_SOURCE samples the same source CUDAArray through two TextureObjects
 # (POINT vs LINEAR) and writes RGBA8 pixels into the PBO.  ReadMode.
 # NORMALIZED_FLOAT means tex2D<float4>() returns each channel in [0, 1];
 # the kernel scales by 255 and writes unsigned bytes back out.
diff --git a/cuda_core/examples/texture_sample.py b/cuda_core/examples/texture_sample.py
index fc5b05f086f..3ed168cf0f7 100644
--- a/cuda_core/examples/texture_sample.py
+++ b/cuda_core/examples/texture_sample.py
@@ -4,7 +4,7 @@
 
 # ################################################################################
 #
-# This example demonstrates building a 2D CUDA Array, binding it as a
+# This example demonstrates building a 2D CUDAArray, binding it as a
 # bindless TextureObject, and sampling it from a kernel with both POINT-exact
 # and LINEAR-interpolated coordinates.
 #
@@ -23,7 +23,7 @@
 
 from cuda.core import (
     AddressMode,
-    Array,
+    CUDAArray,
     ArrayFormat,
     Device,
     FilterMode,
@@ -65,12 +65,12 @@ def main():
     out_buf = None
     pinned_mr = LegacyPinnedMemoryResource()
     try:
-        # Allocate a 2D Array: shape=(W, H), single-channel float32.
-        # Note: Array.from_descriptor takes shape=(width, height), so the host
+        # Allocate a 2D CUDAArray: shape=(W, H), single-channel float32.
+        # Note: CUDAArray.from_descriptor takes shape=(width, height), so the host
         # buffer fed into copy_from must be laid out as H rows of W elements
         # (row-major), i.e. host_pattern.shape == (H, W).
         width, height = 16, 16
-        with Array.from_descriptor(
+        with CUDAArray.from_descriptor(
             shape=(width, height),
             format=ArrayFormat.FLOAT32,
             num_channels=1,
diff --git a/cuda_core/tests/test_texture_surface.py b/cuda_core/tests/test_texture_surface.py
index 00e67ed2398..b9359fdf818 100644
--- a/cuda_core/tests/test_texture_surface.py
+++ b/cuda_core/tests/test_texture_surface.py
@@ -8,7 +8,7 @@
 import cuda.core
 from cuda.core import (
     AddressMode,
-    Array,
+    CUDAArray,
     ArrayFormat,
     Device,
     FilterMode,
@@ -22,8 +22,8 @@
 
 
 def test_array_init_disabled():
-    with pytest.raises(RuntimeError, match=r"^Array cannot be instantiated directly"):
-        cuda.core._array.Array()
+    with pytest.raises(RuntimeError, match=r"^CUDAArray cannot be instantiated directly"):
+        cuda.core._array.CUDAArray()
 
 
 def test_texture_object_init_disabled():
@@ -42,7 +42,7 @@ def test_resource_descriptor_init_disabled():
 
 
 def test_array_2d_create_and_properties(init_cuda):
-    arr = Array.from_descriptor(
+    arr = CUDAArray.from_descriptor(
         shape=(32, 16), format=ArrayFormat.FLOAT32, num_channels=1
     )
     try:
@@ -51,7 +51,7 @@ def test_array_2d_create_and_properties(init_cuda):
         assert arr.num_channels == 1
         assert arr.element_size == 4
         assert arr.size_bytes == 32 * 16 * 4
-        assert arr.surface_load_store is False
+        assert arr.is_surface_load_store is False
         assert arr.handle != 0
         assert isinstance(arr.device, Device)
     finally:
@@ -59,7 +59,7 @@ def test_array_2d_create_and_properties(init_cuda):
 
 
 def test_array_3d_with_surface_flag(init_cuda):
-    arr = Array.from_descriptor(
+    arr = CUDAArray.from_descriptor(
         shape=(8, 8, 4),
         format=ArrayFormat.UINT8,
         num_channels=4,
@@ -67,7 +67,7 @@ def test_array_3d_with_surface_flag(init_cuda):
     )
     try:
         assert arr.shape == (8, 8, 4)
-        assert arr.surface_load_store is True
+        assert arr.is_surface_load_store is True
         assert arr.element_size == 4
     finally:
         arr.close()
@@ -75,12 +75,12 @@ def test_array_3d_with_surface_flag(init_cuda):
 
 def test_array_rejects_bad_channels(init_cuda):
     with pytest.raises(ValueError, match="num_channels"):
-        Array.from_descriptor(shape=(8,), format=ArrayFormat.UINT8, num_channels=3)
+        CUDAArray.from_descriptor(shape=(8,), format=ArrayFormat.UINT8, num_channels=3)
 
 
 def test_array_rejects_bad_rank(init_cuda):
     with pytest.raises(ValueError, match="shape rank"):
-        Array.from_descriptor(
+        CUDAArray.from_descriptor(
             shape=(2, 2, 2, 2), format=ArrayFormat.UINT8, num_channels=1
         )
 
@@ -90,7 +90,7 @@ def test_array_roundtrip_copy(init_cuda):
 
     device = Device()
     stream = device.create_stream()
-    arr = Array.from_descriptor(
+    arr = CUDAArray.from_descriptor(
         shape=(16,), format=ArrayFormat.UINT32, num_channels=1
     )
     try:
@@ -112,7 +112,7 @@ def test_array_copy_rejects_undersized_host_buffer(init_cuda):
 
     device = Device()
     stream = device.create_stream()
-    arr = Array.from_descriptor(
+    arr = CUDAArray.from_descriptor(
         shape=(16,), format=ArrayFormat.UINT32, num_channels=1
     )
     try:
@@ -130,7 +130,7 @@ def test_array_copy_rejects_undersized_host_buffer(init_cuda):
 def test_array_copy_rejects_undersized_device_buffer(init_cuda):
     device = Device()
     stream = device.create_stream()
-    arr = Array.from_descriptor(
+    arr = CUDAArray.from_descriptor(
         shape=(16,), format=ArrayFormat.UINT32, num_channels=1
     )
     # arr is 64 bytes; allocate a 32-byte device buffer.
@@ -147,7 +147,7 @@ def test_array_copy_rejects_undersized_device_buffer(init_cuda):
 
 
 def test_texture_object_create(init_cuda):
-    arr = Array.from_descriptor(
+    arr = CUDAArray.from_descriptor(
         shape=(32, 16), format=ArrayFormat.FLOAT32, num_channels=1
     )
     try:
@@ -170,7 +170,7 @@ def test_texture_object_create(init_cuda):
 
 
 def test_surface_object_create(init_cuda):
-    arr = Array.from_descriptor(
+    arr = CUDAArray.from_descriptor(
         shape=(8, 8),
         format=ArrayFormat.UINT8,
         num_channels=4,
@@ -188,7 +188,7 @@ def test_surface_object_create(init_cuda):
 
 
 def test_surface_requires_ldst_flag(init_cuda):
-    arr = Array.from_descriptor(
+    arr = CUDAArray.from_descriptor(
         shape=(8, 8), format=ArrayFormat.UINT8, num_channels=4
     )
     try:
@@ -214,7 +214,7 @@ def test_address_mode_normalization(init_cuda):
     ) == (AddressMode.WRAP, AddressMode.CLAMP, AddressMode.MIRROR)
 
     # Smoke test: a 2-entry tuple is also accepted end-to-end.
-    arr = Array.from_descriptor(
+    arr = CUDAArray.from_descriptor(
         shape=(8, 8, 4), format=ArrayFormat.FLOAT32, num_channels=1
     )
     try:
@@ -457,7 +457,7 @@ def test_mipmapped_array_from_descriptor_2d(init_cuda):
         assert mip.format == ArrayFormat.FLOAT32
         assert mip.num_channels == 1
         assert mip.num_levels == 4
-        assert mip.surface_load_store is False
+        assert mip.is_surface_load_store is False
         assert mip.handle != 0
         assert isinstance(mip.device, Device)
     finally:
@@ -475,7 +475,7 @@ def test_mipmapped_array_get_level_zero_matches_shape(init_cuda):
     try:
         lvl0 = mip.get_level(0)
         try:
-            assert isinstance(lvl0, Array)
+            assert isinstance(lvl0, CUDAArray)
             # Level 0 must match the base shape and rank.
             assert lvl0.shape == shape
             assert lvl0.format == ArrayFormat.UINT8
@@ -603,11 +603,11 @@ def test_surface_rejects_mipmapped_array(init_cuda):
 
 
 def test_mipmapped_array_level_keeps_parent_alive(init_cuda):
-    """Dropping the local parent reference must not invalidate the level Array;
+    """Dropping the local parent reference must not invalidate the level CUDAArray;
     the level holds an internal strong ref back to the MipmappedArray.
 
     cdef classes don't natively support weakref, so we verify the parent
-    reference by inspecting the level Array's gc referents.
+    reference by inspecting the level CUDAArray's gc referents.
     """
     mip = MipmappedArray.from_descriptor(
         shape=(16, 16),
@@ -618,7 +618,7 @@ def test_mipmapped_array_level_keeps_parent_alive(init_cuda):
     parent_id = id(mip)
     lvl = mip.get_level(1)
     # Drop our local reference and force GC; the parent must survive because
-    # the level Array holds a strong ref via the internal _parent_ref slot.
+    # the level CUDAArray holds a strong ref via the internal _parent_ref slot.
     del mip
     gc.collect()
 
@@ -627,11 +627,11 @@ def test_mipmapped_array_level_keeps_parent_alive(init_cuda):
     referents = gc.get_referents(lvl)
     parents = [r for r in referents if isinstance(r, MipmappedArray)]
     assert len(parents) == 1, (
-        f"level Array should reference exactly one MipmappedArray parent, got "
+        f"level CUDAArray should reference exactly one MipmappedArray parent, got "
         f"{parents!r}"
     )
     assert id(parents[0]) == parent_id, (
-        "level Array's parent ref is not the original MipmappedArray"
+        "level CUDAArray's parent ref is not the original MipmappedArray"
     )
     # Closing the level drops its parent ref. Don't access the parent past
     # this point; cuMipmappedArrayDestroy may then run.
@@ -642,23 +642,23 @@ def test_mipmapped_array_level_keeps_parent_alive(init_cuda):
 
 def test_array_from_descriptor_rejects_bad_format(init_cuda):
     with pytest.raises(TypeError, match="format must be an ArrayFormat"):
-        Array.from_descriptor(shape=(8,), format=0, num_channels=1)
+        CUDAArray.from_descriptor(shape=(8,), format=0, num_channels=1)
 
 
 def test_array_from_descriptor_rejects_non_iterable_shape(init_cuda):
     with pytest.raises(TypeError, match="shape must be a tuple"):
-        Array.from_descriptor(shape=8, format=ArrayFormat.UINT8, num_channels=1)
+        CUDAArray.from_descriptor(shape=8, format=ArrayFormat.UINT8, num_channels=1)
 
 
 def test_array_from_descriptor_rejects_zero_dim(init_cuda):
     with pytest.raises(ValueError, match=r"shape\[1\] must be >= 1"):
-        Array.from_descriptor(
+        CUDAArray.from_descriptor(
             shape=(8, 0), format=ArrayFormat.UINT8, num_channels=1
         )
 
 
 def test_array_copy_rejects_non_stream(init_cuda):
-    arr = Array.from_descriptor(
+    arr = CUDAArray.from_descriptor(
         shape=(8,), format=ArrayFormat.UINT8, num_channels=1
     )
     try:
@@ -773,7 +773,7 @@ def test_texture_object_rejects_non_resource_descriptor(init_cuda):
 
 
 def test_texture_object_rejects_non_texture_descriptor(init_cuda):
-    arr = Array.from_descriptor(
+    arr = CUDAArray.from_descriptor(
         shape=(8, 8), format=ArrayFormat.FLOAT32, num_channels=1
     )
     try:
@@ -787,7 +787,7 @@ def test_texture_object_rejects_non_texture_descriptor(init_cuda):
 
 
 def test_texture_object_rejects_bad_filter_mode(init_cuda):
-    arr = Array.from_descriptor(
+    arr = CUDAArray.from_descriptor(
         shape=(8, 8), format=ArrayFormat.FLOAT32, num_channels=1
     )
     try:
@@ -800,7 +800,7 @@ def test_texture_object_rejects_bad_filter_mode(init_cuda):
 
 
 def test_texture_object_rejects_bad_read_mode(init_cuda):
-    arr = Array.from_descriptor(
+    arr = CUDAArray.from_descriptor(
         shape=(8, 8), format=ArrayFormat.FLOAT32, num_channels=1
     )
     try:
@@ -813,7 +813,7 @@ def test_texture_object_rejects_bad_read_mode(init_cuda):
 
 
 def test_texture_object_rejects_bad_mipmap_filter_mode(init_cuda):
-    arr = Array.from_descriptor(
+    arr = CUDAArray.from_descriptor(
         shape=(8, 8), format=ArrayFormat.FLOAT32, num_channels=1
     )
     try:
@@ -828,7 +828,7 @@ def test_texture_object_rejects_bad_mipmap_filter_mode(init_cuda):
 
 
 def test_texture_object_rejects_negative_anisotropy(init_cuda):
-    arr = Array.from_descriptor(
+    arr = CUDAArray.from_descriptor(
         shape=(8, 8), format=ArrayFormat.FLOAT32, num_channels=1
     )
     try:
@@ -841,7 +841,7 @@ def test_texture_object_rejects_negative_anisotropy(init_cuda):
 
 
 def test_texture_object_rejects_bad_border_color_length(init_cuda):
-    arr = Array.from_descriptor(
+    arr = CUDAArray.from_descriptor(
         shape=(8, 8), format=ArrayFormat.FLOAT32, num_channels=1
     )
     try:
@@ -854,7 +854,7 @@ def test_texture_object_rejects_bad_border_color_length(init_cuda):
 
 
 def test_address_mode_rejects_non_addressmode_scalar(init_cuda):
-    arr = Array.from_descriptor(
+    arr = CUDAArray.from_descriptor(
         shape=(8, 8), format=ArrayFormat.FLOAT32, num_channels=1
     )
     try:
@@ -867,7 +867,7 @@ def test_address_mode_rejects_non_addressmode_scalar(init_cuda):
 
 
 def test_address_mode_rejects_empty_tuple(init_cuda):
-    arr = Array.from_descriptor(
+    arr = CUDAArray.from_descriptor(
         shape=(8, 8), format=ArrayFormat.FLOAT32, num_channels=1
     )
     try:
@@ -880,7 +880,7 @@ def test_address_mode_rejects_empty_tuple(init_cuda):
 
 
 def test_address_mode_rejects_too_long_tuple(init_cuda):
-    arr = Array.from_descriptor(
+    arr = CUDAArray.from_descriptor(
         shape=(8, 8), format=ArrayFormat.FLOAT32, num_channels=1
     )
     try:
@@ -897,7 +897,7 @@ def test_address_mode_rejects_too_long_tuple(init_cuda):
 
 
 def test_address_mode_rejects_non_addressmode_entry(init_cuda):
-    arr = Array.from_descriptor(
+    arr = CUDAArray.from_descriptor(
         shape=(8, 8), format=ArrayFormat.FLOAT32, num_channels=1
     )
     try:
@@ -910,10 +910,10 @@ def test_address_mode_rejects_non_addressmode_entry(init_cuda):
 
 
 def test_texture_object_keeps_backing_array_alive(init_cuda):
-    """Dropping the local references to the backing Array and the
+    """Dropping the local references to the backing CUDAArray and the
     ResourceDescriptor must NOT invalidate an existing TextureObject. The
     TextureObject holds a strong ref through its _source_ref slot."""
-    arr = Array.from_descriptor(
+    arr = CUDAArray.from_descriptor(
         shape=(8, 8), format=ArrayFormat.FLOAT32, num_channels=1
     )
     res = ResourceDescriptor.from_array(arr)
@@ -921,7 +921,7 @@ def test_texture_object_keeps_backing_array_alive(init_cuda):
         resource=res, texture_descriptor=TextureDescriptor()
     )
     # Verify the keepalive chain via gc referents: TextureObject -> _source_ref
-    # -> ResourceDescriptor -> _source -> Array. We can only walk one level
+    # -> ResourceDescriptor -> _source -> CUDAArray. We can only walk one level
     # at a time, so check tex's referents include the ResourceDescriptor.
     arr_id = id(arr)
     res_id = id(res)
@@ -936,7 +936,7 @@ def test_texture_object_keeps_backing_array_alive(init_cuda):
     )
     res_back = res_refs[0]
     arr_refs = [r for r in gc.get_referents(res_back) if id(r) == arr_id]
-    assert len(arr_refs) == 1, "ResourceDescriptor should still reference its Array"
+    assert len(arr_refs) == 1, "ResourceDescriptor should still reference its CUDAArray"
 
     # tex.handle should still be valid (non-zero).
     assert tex.handle != 0
@@ -944,7 +944,7 @@ def test_texture_object_keeps_backing_array_alive(init_cuda):
 
 
 def test_surface_object_keeps_backing_array_alive(init_cuda):
-    arr = Array.from_descriptor(
+    arr = CUDAArray.from_descriptor(
         shape=(8, 8),
         format=ArrayFormat.UINT8,
         num_channels=4,
@@ -955,14 +955,14 @@ def test_surface_object_keeps_backing_array_alive(init_cuda):
     del arr
     gc.collect()
 
-    # The surface keeps the ResourceDescriptor alive, which keeps the Array
+    # The surface keeps the ResourceDescriptor alive, which keeps the CUDAArray
     # alive. We verify the chain end-to-end the same way as the texture case.
     referents = gc.get_referents(surf)
     res_objs = [r for r in referents if isinstance(r, ResourceDescriptor)]
     assert len(res_objs) == 1
     arr_refs = [r for r in gc.get_referents(res_objs[0]) if id(r) == arr_id]
     assert len(arr_refs) == 1, (
-        "SurfaceObject should still reference its backing Array via the ResourceDescriptor"
+        "SurfaceObject should still reference its backing CUDAArray via the ResourceDescriptor"
     )
     assert surf.handle != 0
     surf.close()

From 264f2e6f64786458dc591f50bb3acb6c4df15426 Mon Sep 17 00:00:00 2001
From: Rob Parolin <rparolin@nvidia.com>
Date: Tue, 9 Jun 2026 18:50:42 -0700
Subject: [PATCH 12/17] cuda.core: fix lint/mypy after merging main; add
 generated .pyi stubs

Post-merge cleanup so pre-commit.ci passes on the texture/surface stack:

- Import `IntEnum` directly (`from enum import IntEnum`) and subclass the
  bare name in ReadMode/AddressMode/FilterMode/ArrayFormat so stubgen-pyx
  preserves the IntEnum base in the generated stubs (the qualified
  `enum.IntEnum` base was dropped, so members were inferred as `int` and
  failed mypy assignment checks).
- Annotate TextureDescriptor.border_color as `tuple[float, ...] | None`
  (disallow_any_generics flagged the bare `tuple`).
- Prefix unused pyglet event-handler args with `_` (ARG001) and lowercase
  in-function locals (N806) across the GL interop examples; drop dead
  pre-try buffer inits in texture_sample.main (F841).
- Commit the auto-generated .pyi stubs for the new _array/_texture/_surface/
  _mipmapped_array modules.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 cuda_core/cuda/core/__init__.py               |  22 +-
 cuda_core/cuda/core/_array.pyi                | 148 ++++++++++
 cuda_core/cuda/core/_array.pyx                |   4 +-
 cuda_core/cuda/core/_mipmapped_array.pyi      | 112 ++++++++
 cuda_core/cuda/core/_surface.pyi              |  68 +++++
 cuda_core/cuda/core/_texture.pyi              | 259 ++++++++++++++++++
 cuda_core/cuda/core/_texture.pyx              |  10 +-
 cuda_core/examples/gl_interop_fire.py         |  83 ++++--
 cuda_core/examples/gl_interop_image_show.py   |  60 ++--
 cuda_core/examples/gl_interop_lenia.py        |   7 +-
 cuda_core/examples/gl_interop_mandelbrot.py   |   4 +-
 cuda_core/examples/gl_interop_mipmap_lod.py   |  61 +++--
 cuda_core/examples/gl_interop_ocean.py        |  76 +++--
 .../examples/gl_interop_reaction_diffusion.py |   6 +-
 cuda_core/examples/gl_interop_sdf_volume.py   |  54 ++--
 .../examples/gl_interop_texture_filter.py     |  78 ++++--
 cuda_core/examples/texture_sample.py          |  12 +-
 cuda_core/tests/test_texture_surface.py       | 204 +++++---------
 18 files changed, 957 insertions(+), 311 deletions(-)
 create mode 100644 cuda_core/cuda/core/_array.pyi
 create mode 100644 cuda_core/cuda/core/_mipmapped_array.pyi
 create mode 100644 cuda_core/cuda/core/_surface.pyi
 create mode 100644 cuda_core/cuda/core/_texture.pyi

diff --git a/cuda_core/cuda/core/__init__.py b/cuda_core/cuda/core/__init__.py
index d7d210a4129..0db06b05e14 100644
--- a/cuda_core/cuda/core/__init__.py
+++ b/cuda_core/cuda/core/__init__.py
@@ -69,6 +69,7 @@ class _PatchedProperty(metaclass=_PatchedPropMeta):
 
 
 from cuda.core import checkpoint, system, utils
+from cuda.core._array import ArrayFormat, CUDAArray
 from cuda.core._context import Context, ContextOptions
 from cuda.core._device import Device
 from cuda.core._device_resources import (
@@ -78,17 +79,6 @@ class _PatchedProperty(metaclass=_PatchedPropMeta):
     WorkqueueResource,
     WorkqueueResourceOptions,
 )
-from cuda.core._array import CUDAArray, ArrayFormat
-from cuda.core._mipmapped_array import MipmappedArray
-from cuda.core._texture import (
-    AddressMode,
-    FilterMode,
-    ReadMode,
-    ResourceDescriptor,
-    TextureDescriptor,
-    TextureObject,
-)
-from cuda.core._surface import SurfaceObject
 from cuda.core._event import Event, EventOptions
 from cuda.core._graphics import GraphicsResource
 from cuda.core._host import Host
@@ -110,6 +100,7 @@ class _PatchedProperty(metaclass=_PatchedPropMeta):
     VirtualMemoryResource,
     VirtualMemoryResourceOptions,
 )
+from cuda.core._mipmapped_array import MipmappedArray
 from cuda.core._module import Kernel, ObjectCode
 from cuda.core._program import Program, ProgramOptions
 from cuda.core._stream import (
@@ -118,7 +109,16 @@ class _PatchedProperty(metaclass=_PatchedPropMeta):
     Stream,
     StreamOptions,
 )
+from cuda.core._surface import SurfaceObject
 from cuda.core._tensor_map import TensorMapDescriptor, TensorMapDescriptorOptions
+from cuda.core._texture import (
+    AddressMode,
+    FilterMode,
+    ReadMode,
+    ResourceDescriptor,
+    TextureDescriptor,
+    TextureObject,
+)
 
 # isort: split
 # Must come after the cuda.core._* extension imports above: loading graph
diff --git a/cuda_core/cuda/core/_array.pyi b/cuda_core/cuda/core/_array.pyi
new file mode 100644
index 00000000000..3c765c60c02
--- /dev/null
+++ b/cuda_core/cuda/core/_array.pyi
@@ -0,0 +1,148 @@
+# This file was generated by stubgen-pyx v0.2.6 from cuda_core/cuda/core/_array.pyx
+
+from __future__ import annotations
+
+from enum import IntEnum
+
+from cuda.bindings import cydriver
+
+
+class ArrayFormat(IntEnum):
+    """Element format for a :class:`CUDAArray` allocation.
+
+    Mirrors ``CUarray_format`` from the CUDA driver API.
+    """
+    UINT8 = cydriver.CU_AD_FORMAT_UNSIGNED_INT8
+    UINT16 = cydriver.CU_AD_FORMAT_UNSIGNED_INT16
+    UINT32 = cydriver.CU_AD_FORMAT_UNSIGNED_INT32
+    INT8 = cydriver.CU_AD_FORMAT_SIGNED_INT8
+    INT16 = cydriver.CU_AD_FORMAT_SIGNED_INT16
+    INT32 = cydriver.CU_AD_FORMAT_SIGNED_INT32
+    FLOAT16 = cydriver.CU_AD_FORMAT_HALF
+    FLOAT32 = cydriver.CU_AD_FORMAT_FLOAT
+
+class CUDAArray:
+    """An opaque, hardware-laid-out GPU allocation for texture/surface access.
+
+    Distinct from :class:`Buffer`: a ``CUarray`` has no exposed device pointer
+    and can only be accessed from kernels through a :class:`TextureObject` or
+    :class:`SurfaceObject`. Its memory layout is chosen by the driver for 2D/3D
+    spatial locality.
+
+    Construct via :meth:`from_descriptor`. Only plain 1D/2D/3D allocations are
+    supported in this initial version; layered/cubemap/sparse variants will
+    follow once their shape semantics are settled.
+    """
+
+    def close(self):
+        """Destroy the underlying ``CUarray`` if owned by this object."""
+
+    def __init__(self, *args, **kwargs):
+        ...
+
+    @classmethod
+    def from_descriptor(cls, *, shape, format, num_channels, surface_load_store=False):
+        """Allocate a new CUDA array.
+
+        Parameters
+        ----------
+        shape : tuple of int
+            ``(width,)``, ``(width, height)``, or ``(width, height, depth)``
+            in elements.
+        format : ArrayFormat
+            Element format.
+        num_channels : int
+            Channels per element. Must be 1, 2, or 4.
+        surface_load_store : bool
+            If True, allocate with ``CUDA_ARRAY3D_SURFACE_LDST`` so the array
+            can be bound as a :class:`SurfaceObject` for kernel-side writes.
+            Default False.
+
+        Returns
+        -------
+        CUDAArray
+        """
+
+    @classmethod
+    def _from_handle(cls, handle: int, owning: bool, *, device_id=None):
+        """Wrap an externally-allocated ``CUarray``.
+
+        Intended for graphics interop (``cuGraphicsSubResourceGetMappedArray``)
+        where the array is owned by the graphics API. With ``owning=False``,
+        :meth:`close` and ``__dealloc__`` will not free the handle. Shape,
+        format, and channel count are queried from the driver.
+        """
+
+    @property
+    def handle(self):
+        """The underlying ``CUarray`` as an integer."""
+
+    @property
+    def shape(self):
+        """Allocation shape, in elements."""
+
+    @property
+    def format(self):
+        """The element :class:`ArrayFormat`."""
+
+    @property
+    def num_channels(self):
+        """Channels per element (1, 2, or 4)."""
+
+    @property
+    def element_size(self):
+        """Bytes per element (format size * channels)."""
+
+    @property
+    def device(self):
+        """The :class:`Device` this array was allocated on."""
+
+    @property
+    def is_surface_load_store(self):
+        """True if this array was created with ``CUDA_ARRAY3D_SURFACE_LDST``
+        and can be bound as a :class:`SurfaceObject`."""
+
+    def _extent_bytes(self):
+        """Return (width_bytes, height, depth) for cuMemcpy3D, with height/depth
+        normalized to >=1 for lower-rank arrays."""
+
+    def copy_from(self, src, *, stream):
+        """Copy a full-array's worth of data into this array.
+
+        Parameters
+        ----------
+        src : Buffer or buffer-protocol object
+            Source data. Must contain at least ``self.size_bytes`` bytes
+            of contiguous data.
+        stream : Stream
+            Stream to issue the copy on.
+        """
+
+    def copy_to(self, dst, *, stream):
+        """Copy a full-array's worth of data out of this array.
+
+        Parameters
+        ----------
+        dst : Buffer or writable buffer-protocol object
+            Destination. Must have at least ``self.size_bytes`` bytes of
+            writable, contiguous space.
+        stream : Stream
+            Stream to issue the copy on.
+        """
+
+    @property
+    def size_bytes(self):
+        """Total bytes of array storage (``prod(shape) * element_size``)."""
+
+    def __dealloc__(self):
+        ...
+
+    def __enter__(self):
+        ...
+
+    def __exit__(self, exc_type, exc, tb):
+        ...
+
+    def __repr__(self):
+        ...
+_FORMAT_ELEM_SIZE = {int(ArrayFormat.UINT8): 1, int(ArrayFormat.INT8): 1, int(ArrayFormat.UINT16): 2, int(ArrayFormat.INT16): 2, int(ArrayFormat.FLOAT16): 2, int(ArrayFormat.UINT32): 4, int(ArrayFormat.INT32): 4, int(ArrayFormat.FLOAT32): 4}
\ No newline at end of file
diff --git a/cuda_core/cuda/core/_array.pyx b/cuda_core/cuda/core/_array.pyx
index 36920d61156..d17129e275b 100644
--- a/cuda_core/cuda/core/_array.pyx
+++ b/cuda_core/cuda/core/_array.pyx
@@ -17,10 +17,10 @@ from cuda.core._utils.cuda_utils cimport (
     _get_current_device_id,
 )
 
-import enum
+from enum import IntEnum
 
 
-class ArrayFormat(enum.IntEnum):
+class ArrayFormat(IntEnum):
     """Element format for a :class:`CUDAArray` allocation.
 
     Mirrors ``CUarray_format`` from the CUDA driver API.
diff --git a/cuda_core/cuda/core/_mipmapped_array.pyi b/cuda_core/cuda/core/_mipmapped_array.pyi
new file mode 100644
index 00000000000..06ca4615c40
--- /dev/null
+++ b/cuda_core/cuda/core/_mipmapped_array.pyi
@@ -0,0 +1,112 @@
+# This file was generated by stubgen-pyx v0.2.6 from cuda_core/cuda/core/_mipmapped_array.pyx
+
+from __future__ import annotations
+
+
+class MipmappedArray:
+    """A mipmapped CUDA array for texture/surface access across levels.
+
+    Wraps ``CUmipmappedArray``. Each mip level is a distinct, hardware-laid-out
+    allocation accessible only via a :class:`TextureObject` (or by retrieving
+    the level's :class:`CUDAArray` and binding it as a :class:`SurfaceObject`).
+    Destroying the :class:`MipmappedArray` destroys all level arrays
+    implicitly, so the :class:`CUDAArray` instances returned by :meth:`get_level`
+    are non-owning and hold a strong reference back to their parent.
+
+    Construct via :meth:`from_descriptor`.
+    """
+
+    def close(self):
+        """Destroy the underlying ``CUmipmappedArray`` if owned.
+
+        After ``close()`` any level :class:`CUDAArray` returned by :meth:`get_level`
+        becomes invalid; callers must not access them.
+        """
+
+    def __init__(self, *args, **kwargs):
+        ...
+
+    @classmethod
+    def from_descriptor(cls, *, shape, format, num_channels, num_levels, surface_load_store=False):
+        """Allocate a new mipmapped CUDA array.
+
+        Parameters
+        ----------
+        shape : tuple of int
+            ``(width,)``, ``(width, height)``, or ``(width, height, depth)``
+            in elements, for the base (level 0) mip.
+        format : ArrayFormat
+            Element format.
+        num_channels : int
+            Channels per element. Must be 1, 2, or 4.
+        num_levels : int
+            Number of mip levels to allocate; must be >= 1. The driver caps
+            this at the log2 of the largest dimension; passing a larger value
+            yields a driver error.
+        surface_load_store : bool
+            If True, allocate with ``CUDA_ARRAY3D_SURFACE_LDST`` so individual
+            levels (obtained via :meth:`get_level`) can be bound as
+            :class:`SurfaceObject` for kernel-side writes. Default False.
+
+        Returns
+        -------
+        MipmappedArray
+        """
+
+    def get_level(self, level):
+        """Return a non-owning :class:`CUDAArray` view of the given mip level.
+
+        Parameters
+        ----------
+        level : int
+            Mip level index in ``[0, num_levels)``.
+
+        Returns
+        -------
+        CUDAArray
+            A non-owning :class:`CUDAArray` wrapping the level's ``CUarray``.
+            The :class:`MipmappedArray` is kept alive for the lifetime of the
+            returned :class:`CUDAArray`; the underlying storage is released only
+            when this :class:`MipmappedArray` is destroyed.
+        """
+
+    @property
+    def handle(self):
+        """The underlying ``CUmipmappedArray`` as an integer."""
+
+    @property
+    def shape(self):
+        """Base-level (level 0) allocation shape, in elements."""
+
+    @property
+    def format(self):
+        """The element :class:`ArrayFormat`."""
+
+    @property
+    def num_channels(self):
+        """Channels per element (1, 2, or 4)."""
+
+    @property
+    def num_levels(self):
+        """Number of mip levels."""
+
+    @property
+    def is_surface_load_store(self):
+        """True if this mipmap (and each of its levels) was created with
+        ``CUDA_ARRAY3D_SURFACE_LDST`` and can back a :class:`SurfaceObject`."""
+
+    @property
+    def device(self):
+        """The :class:`Device` this mipmap was allocated on."""
+
+    def __dealloc__(self):
+        ...
+
+    def __enter__(self):
+        ...
+
+    def __exit__(self, exc_type, exc, tb):
+        ...
+
+    def __repr__(self):
+        ...
\ No newline at end of file
diff --git a/cuda_core/cuda/core/_surface.pyi b/cuda_core/cuda/core/_surface.pyi
new file mode 100644
index 00000000000..8961f8ce82a
--- /dev/null
+++ b/cuda_core/cuda/core/_surface.pyi
@@ -0,0 +1,68 @@
+# This file was generated by stubgen-pyx v0.2.6 from cuda_core/cuda/core/_surface.pyx
+
+from __future__ import annotations
+
+
+class SurfaceObject:
+    """A bindless surface handle for kernel-side typed load/store.
+
+    Wraps ``cuSurfObjectCreate``. Unlike a :class:`TextureObject`, a surface
+    has no sampling state (no filtering, no addressing modes, no normalization);
+    kernels read and write through it using integer pixel coordinates.
+
+    The backing :class:`CUDAArray` must have been created with
+    ``surface_load_store=True`` and is kept alive for the lifetime of this
+    object to prevent dangling handles.
+
+    Construct via :meth:`from_array` or :meth:`from_descriptor`. Passes to
+    kernels as a 64-bit handle (via the ``handle`` property).
+    """
+
+    def close(self):
+        """Destroy the underlying ``CUsurfObject``."""
+
+    def __init__(self, *args, **kwargs):
+        ...
+
+    @classmethod
+    def from_array(cls, array):
+        """Create a surface object directly from an :class:`CUDAArray`.
+
+        The array must have been created with ``surface_load_store=True``.
+        """
+
+    @classmethod
+    def from_descriptor(cls, *, resource):
+        """Create a surface object from a :class:`ResourceDescriptor`.
+
+        Parameters
+        ----------
+        resource : ResourceDescriptor
+            Must wrap an :class:`CUDAArray` allocated with
+            ``surface_load_store=True``. Linear/pitch2d resources are not
+            valid surface backings.
+        """
+
+    @property
+    def handle(self):
+        """The underlying ``CUsurfObject`` as an integer (64-bit kernel arg)."""
+
+    @property
+    def resource(self):
+        """The :class:`ResourceDescriptor` this surface was built from."""
+
+    @property
+    def device(self):
+        ...
+
+    def __dealloc__(self):
+        ...
+
+    def __enter__(self):
+        ...
+
+    def __exit__(self, exc_type, exc, tb):
+        ...
+
+    def __repr__(self):
+        ...
\ No newline at end of file
diff --git a/cuda_core/cuda/core/_texture.pyi b/cuda_core/cuda/core/_texture.pyi
new file mode 100644
index 00000000000..1365597e6e4
--- /dev/null
+++ b/cuda_core/cuda/core/_texture.pyi
@@ -0,0 +1,259 @@
+# This file was generated by stubgen-pyx v0.2.6 from cuda_core/cuda/core/_texture.pyx
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from enum import IntEnum
+
+from cuda.bindings import cydriver
+
+
+class AddressMode(IntEnum):
+    """Boundary behavior for out-of-range texture coordinates."""
+    WRAP = cydriver.CU_TR_ADDRESS_MODE_WRAP
+    CLAMP = cydriver.CU_TR_ADDRESS_MODE_CLAMP
+    MIRROR = cydriver.CU_TR_ADDRESS_MODE_MIRROR
+    BORDER = cydriver.CU_TR_ADDRESS_MODE_BORDER
+
+class FilterMode(IntEnum):
+    """Texel sampling mode."""
+    POINT = cydriver.CU_TR_FILTER_MODE_POINT
+    LINEAR = cydriver.CU_TR_FILTER_MODE_LINEAR
+
+class ReadMode(IntEnum):
+    """How sampled values are returned to the kernel.
+
+    - ``ELEMENT_TYPE``: return the raw element value (integer formats stay
+      integer, float stays float).
+    - ``NORMALIZED_FLOAT``: integer formats are promoted to a normalized
+      ``float`` in ``[0, 1]`` (unsigned) or ``[-1, 1]`` (signed).
+      Float formats are unaffected.
+    """
+    ELEMENT_TYPE = 0
+    NORMALIZED_FLOAT = 1
+
+class ResourceDescriptor:
+    """Describes the memory backing a :class:`TextureObject`.
+
+    Construct via the ``from_*`` classmethods:
+
+    - :meth:`from_array` wraps a :class:`CUDAArray` (works for both
+      :class:`TextureObject` and :class:`SurfaceObject`).
+    - :meth:`from_linear` wraps a :class:`Buffer` as a typed 1D fetch. Texture
+      objects built from a linear resource do not support filtering,
+      normalized coordinates, or addressing modes.
+    - :meth:`from_pitch2d` wraps a :class:`Buffer` as a row-pitched 2D image.
+      Supports filtering and 2D addressing, but only 2D access.
+
+    Linear and pitch2D resources cannot back a :class:`SurfaceObject` — those
+    require an :class:`CUDAArray` allocated with ``surface_load_store=True``.
+    """
+    __slots__ = ('_kind', '_source', '_format', '_num_channels', '_size_bytes', '_width', '_height', '_pitch_bytes')
+
+    def __init__(self):
+        ...
+
+    @classmethod
+    def from_array(cls, array):
+        """Build a resource descriptor backed by a :class:`CUDAArray`."""
+
+    @classmethod
+    def from_mipmapped_array(cls, mipmapped_array):
+        """Build a resource descriptor backed by a :class:`MipmappedArray`.
+
+        Suitable for binding to a :class:`TextureObject` for mipmapped
+        sampling. Not valid as a :class:`SurfaceObject` backing: surfaces
+        require a single :class:`CUDAArray` level (obtain via
+        :meth:`MipmappedArray.get_level`).
+        """
+
+    @classmethod
+    def from_linear(cls, buffer, *, format, num_channels, size_bytes=None):
+        """Build a resource descriptor for a linear (typed 1D) texture fetch.
+
+        Parameters
+        ----------
+        buffer : Buffer
+            Device-memory backing. Must remain alive for the lifetime of any
+            :class:`TextureObject` built from this descriptor.
+        format : ArrayFormat
+            Element format.
+        num_channels : int
+            Channels per element. Must be 1, 2, or 4.
+        size_bytes : int, optional
+            Bytes of ``buffer`` to bind. Defaults to ``buffer.size``. Must not
+            exceed it.
+
+        Notes
+        -----
+        Texture objects built from a linear resource ignore the
+        :class:`TextureDescriptor` addressing/filtering fields — kernels read
+        through a typed 1D fetch with bounds checking only.
+        """
+
+    @classmethod
+    def from_pitch2d(cls, buffer, *, format, num_channels, width, height, pitch_bytes):
+        """Build a resource descriptor for a row-pitched 2D image.
+
+        Parameters
+        ----------
+        buffer : Buffer
+            Device-memory backing. Must remain alive for the lifetime of any
+            :class:`TextureObject` built from this descriptor.
+        format : ArrayFormat
+            Element format.
+        num_channels : int
+            Channels per element. Must be 1, 2, or 4.
+        width : int
+            Image width, in elements.
+        height : int
+            Image height, in rows.
+        pitch_bytes : int
+            Distance between consecutive rows, in bytes. Must be at least
+            ``width * format_size * num_channels`` and meet the driver's
+            ``CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT``.
+        """
+
+    @property
+    def kind(self):
+        ...
+
+    @property
+    def source(self):
+        ...
+
+    @property
+    def format(self):
+        """The element :class:`ArrayFormat` (``None`` for array-backed)."""
+
+    @property
+    def num_channels(self):
+        """Channels per element (``None`` for array-backed)."""
+
+    @property
+    def size_bytes(self):
+        """Bytes bound for a linear resource (``None`` for other kinds)."""
+
+    @property
+    def width(self):
+        """Pitch2D image width, in elements (``None`` for other kinds)."""
+
+    @property
+    def height(self):
+        """Pitch2D image height, in rows (``None`` for other kinds)."""
+
+    @property
+    def pitch_bytes(self):
+        """Pitch2D row pitch, in bytes (``None`` for other kinds)."""
+
+    def __repr__(self):
+        ...
+
+@dataclass
+class TextureDescriptor:
+    """Sampling state for a :class:`TextureObject` (mirrors ``CUDA_TEXTURE_DESC``).
+
+    Attributes
+    ----------
+    address_mode : tuple of AddressMode
+        Boundary behavior per axis. May be a single :class:`AddressMode` (applied
+        to all axes) or a tuple of 1-3 entries (one per dimension).
+    filter_mode : FilterMode
+        Texel sampling mode. Default ``POINT``.
+    read_mode : ReadMode
+        How sampled integer values are returned. Default ``ELEMENT_TYPE``.
+    normalized_coords : bool
+        If True, coordinates are in ``[0, 1]`` instead of pixel indices.
+    srgb : bool
+        If True, perform sRGB → linear conversion on read (8-bit formats only).
+    disable_trilinear_optimization : bool
+        If True, request exact trilinear filtering.
+    seamless_cubemap : bool
+        If True, enable seamless cubemap edge filtering.
+    max_anisotropy : int
+        Maximum anisotropy; 0 disables anisotropic filtering.
+    mipmap_filter_mode : FilterMode
+        Filtering between mipmap levels. Default ``POINT``.
+    mipmap_level_bias : float
+    min_mipmap_level_clamp : float
+    max_mipmap_level_clamp : float
+    border_color : tuple of float or None
+        4-tuple used when ``address_mode`` includes ``BORDER``; ``None`` means
+        zero.
+    """
+    address_mode: object = AddressMode.CLAMP
+    filter_mode: FilterMode = FilterMode.POINT
+    read_mode: ReadMode = ReadMode.ELEMENT_TYPE
+    normalized_coords: bool = False
+    srgb: bool = False
+    disable_trilinear_optimization: bool = False
+    seamless_cubemap: bool = False
+    max_anisotropy: int = 0
+    mipmap_filter_mode: FilterMode = FilterMode.POINT
+    mipmap_level_bias: float = 0.0
+    min_mipmap_level_clamp: float = 0.0
+    max_mipmap_level_clamp: float = 0.0
+    border_color: tuple[float, ...] | None = None
+
+class TextureObject:
+    """A bindless texture handle for kernel-side sampled reads.
+
+    Wraps ``cuTexObjectCreate``. The underlying memory resource (e.g. the
+    :class:`CUDAArray` referenced by the descriptor) is kept alive for the
+    lifetime of this object to prevent dangling handles.
+
+    Construct via :meth:`from_descriptor`. Passes to kernels as a 64-bit
+    handle (via the ``handle`` property).
+    """
+
+    def close(self):
+        """Destroy the underlying ``CUtexObject``."""
+
+    def __init__(self, *args, **kwargs):
+        ...
+
+    @classmethod
+    def from_descriptor(cls, *, resource, texture_descriptor):
+        """Create a texture object from a resource + sampling descriptor.
+
+        Parameters
+        ----------
+        resource : ResourceDescriptor
+        texture_descriptor : TextureDescriptor
+        """
+
+    @property
+    def handle(self):
+        """The underlying ``CUtexObject`` as an integer (64-bit kernel arg)."""
+
+    @property
+    def resource(self):
+        """The :class:`ResourceDescriptor` this texture was built from."""
+
+    @property
+    def texture_descriptor(self):
+        """The :class:`TextureDescriptor` this texture was built from."""
+
+    @property
+    def device(self):
+        ...
+
+    def __dealloc__(self):
+        ...
+
+    def __enter__(self):
+        ...
+
+    def __exit__(self, exc_type, exc, tb):
+        ...
+
+    def __repr__(self):
+        ...
+_TRSF_READ_AS_INTEGER = 1
+_TRSF_NORMALIZED_COORDINATES = 2
+_TRSF_SRGB = 16
+_TRSF_DISABLE_TRILINEAR_OPTIMIZATION = 32
+_TRSF_SEAMLESS_CUBEMAP = 64
+
+def _normalize_address_modes(address_mode):
+    """Return a 3-tuple of AddressMode values from a scalar or 1-3 tuple."""
\ No newline at end of file
diff --git a/cuda_core/cuda/core/_texture.pyx b/cuda_core/cuda/core/_texture.pyx
index aeaa2ace4bb..18d828480a3 100644
--- a/cuda_core/cuda/core/_texture.pyx
+++ b/cuda_core/cuda/core/_texture.pyx
@@ -19,8 +19,8 @@ from cuda.core._utils.cuda_utils cimport (
     _get_current_device_id,
 )
 
-import enum
 from dataclasses import dataclass
+from enum import IntEnum
 
 
 # Driver texture-descriptor flag bits (CU_TRSF_*).
@@ -31,7 +31,7 @@ _TRSF_DISABLE_TRILINEAR_OPTIMIZATION = 0x20
 _TRSF_SEAMLESS_CUBEMAP = 0x40
 
 
-class AddressMode(enum.IntEnum):
+class AddressMode(IntEnum):
     """Boundary behavior for out-of-range texture coordinates."""
     WRAP   = cydriver.CU_TR_ADDRESS_MODE_WRAP
     CLAMP  = cydriver.CU_TR_ADDRESS_MODE_CLAMP
@@ -39,13 +39,13 @@ class AddressMode(enum.IntEnum):
     BORDER = cydriver.CU_TR_ADDRESS_MODE_BORDER
 
 
-class FilterMode(enum.IntEnum):
+class FilterMode(IntEnum):
     """Texel sampling mode."""
     POINT  = cydriver.CU_TR_FILTER_MODE_POINT
     LINEAR = cydriver.CU_TR_FILTER_MODE_LINEAR
 
 
-class ReadMode(enum.IntEnum):
+class ReadMode(IntEnum):
     """How sampled values are returned to the kernel.
 
     - ``ELEMENT_TYPE``: return the raw element value (integer formats stay
@@ -349,7 +349,7 @@ class TextureDescriptor:
     mipmap_level_bias: float = 0.0
     min_mipmap_level_clamp: float = 0.0
     max_mipmap_level_clamp: float = 0.0
-    border_color: tuple | None = None
+    border_color: tuple[float, ...] | None = None
 
 
 def _normalize_address_modes(address_mode):
diff --git a/cuda_core/examples/gl_interop_fire.py b/cuda_core/examples/gl_interop_fire.py
index 14e241f561a..21b861c442c 100644
--- a/cuda_core/examples/gl_interop_fire.py
+++ b/cuda_core/examples/gl_interop_fire.py
@@ -89,8 +89,8 @@
 
 from cuda.core import (
     AddressMode,
-    CUDAArray,
     ArrayFormat,
+    CUDAArray,
     Device,
     FilterMode,
     GraphicsResource,
@@ -227,12 +227,30 @@ def create_display_resources(gl, width, height):
     quad_verts = np.array(
         [
             # x,  y,    s, t      (position + texture coordinate)
-            -1, -1, 0, 1,
-             1, -1, 1, 1,
-             1,  1, 1, 0,
-            -1, -1, 0, 1,
-             1,  1, 1, 0,
-            -1,  1, 0, 0,
+            -1,
+            -1,
+            0,
+            1,
+            1,
+            -1,
+            1,
+            1,
+            1,
+            1,
+            1,
+            0,
+            -1,
+            -1,
+            0,
+            1,
+            1,
+            1,
+            1,
+            0,
+            -1,
+            1,
+            0,
+            0,
         ],
         dtype=np.float32,
     )
@@ -377,15 +395,42 @@ def build_fire_palette():
     https://github.com/tiagomenegaz/doom-fire.
     """
     rgb = [
-        (  7,   7,   7), ( 31,   7,   7), ( 47,  15,   7), ( 71,  15,   7),
-        ( 87,  23,   7), (103,  31,   7), (119,  31,   7), (143,  39,   7),
-        (159,  47,   7), (175,  63,   7), (191,  71,   7), (199,  71,   7),
-        (223,  79,   7), (223,  87,   7), (223,  87,   7), (215,  95,   7),
-        (215,  95,   7), (215, 103,  15), (207, 111,  15), (207, 119,  15),
-        (207, 127,  15), (207, 135,  23), (199, 135,  23), (199, 143,  23),
-        (199, 151,  31), (191, 159,  31), (191, 159,  31), (191, 167,  39),
-        (191, 167,  39), (191, 175,  47), (183, 175,  47), (183, 183,  47),
-        (183, 183,  55), (207, 207, 111), (223, 223, 159), (239, 239, 199),
+        (7, 7, 7),
+        (31, 7, 7),
+        (47, 15, 7),
+        (71, 15, 7),
+        (87, 23, 7),
+        (103, 31, 7),
+        (119, 31, 7),
+        (143, 39, 7),
+        (159, 47, 7),
+        (175, 63, 7),
+        (191, 71, 7),
+        (199, 71, 7),
+        (223, 79, 7),
+        (223, 87, 7),
+        (223, 87, 7),
+        (215, 95, 7),
+        (215, 95, 7),
+        (215, 103, 15),
+        (207, 111, 15),
+        (207, 119, 15),
+        (207, 127, 15),
+        (207, 135, 23),
+        (199, 135, 23),
+        (199, 143, 23),
+        (199, 151, 31),
+        (191, 159, 31),
+        (191, 159, 31),
+        (191, 167, 39),
+        (191, 167, 39),
+        (191, 175, 47),
+        (183, 175, 47),
+        (183, 183, 47),
+        (183, 183, 55),
+        (207, 207, 111),
+        (223, 223, 159),
+        (239, 239, 199),
         (255, 255, 255),
     ]
     # Index 0 (the "no fire" color) is rendered as pure black so dead pixels
@@ -468,9 +513,9 @@ def main():
 
     # The heat field is born zeroed by CUDAArray.from_descriptor. No seed pass.
     state = {
-        "current": "a",            # which array holds the latest heat field
-        "frame_index": 0,           # passed into the step kernel as `t`
-        "ambient": True,            # SPACE toggles bottom-row injection
+        "current": "a",  # which array holds the latest heat field
+        "frame_index": 0,  # passed into the step kernel as `t`
+        "ambient": True,  # SPACE toggles bottom-row injection
         "mouse_down": False,
         "mouse_x": 0,
         "mouse_y": 0,
diff --git a/cuda_core/examples/gl_interop_image_show.py b/cuda_core/examples/gl_interop_image_show.py
index 53dc3807e28..7678d457b10 100644
--- a/cuda_core/examples/gl_interop_image_show.py
+++ b/cuda_core/examples/gl_interop_image_show.py
@@ -55,7 +55,6 @@
 # ///
 
 import ctypes
-import math
 import sys
 import time
 
@@ -63,8 +62,8 @@
 
 from cuda.core import (
     AddressMode,
-    CUDAArray,
     ArrayFormat,
+    CUDAArray,
     Device,
     FilterMode,
     GraphicsResource,
@@ -170,12 +169,30 @@ def create_display_resources(gl, width, height):
 
     quad_verts = np.array(
         [
-            -1, -1, 0, 0,
-             1, -1, 1, 0,
-             1,  1, 1, 1,
-            -1, -1, 0, 0,
-             1,  1, 1, 1,
-            -1,  1, 0, 1,
+            -1,
+            -1,
+            0,
+            0,
+            1,
+            -1,
+            1,
+            0,
+            1,
+            1,
+            1,
+            1,
+            -1,
+            -1,
+            0,
+            0,
+            1,
+            1,
+            1,
+            1,
+            -1,
+            1,
+            0,
+            1,
         ],
         dtype=np.float32,
     )
@@ -209,8 +226,15 @@ def create_display_resources(gl, width, height):
     gl.glTexParameteri(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_MIN_FILTER, gl.GL_NEAREST)
     gl.glTexParameteri(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_MAG_FILTER, gl.GL_NEAREST)
     gl.glTexImage2D(
-        gl.GL_TEXTURE_2D, 0, gl.GL_RGBA8, width, height, 0,
-        gl.GL_RGBA, gl.GL_UNSIGNED_BYTE, None,
+        gl.GL_TEXTURE_2D,
+        0,
+        gl.GL_RGBA8,
+        width,
+        height,
+        0,
+        gl.GL_RGBA,
+        gl.GL_UNSIGNED_BYTE,
+        None,
     )
     return shader_prog, vao.value, tex.value
 
@@ -230,8 +254,15 @@ def copy_pbo_to_texture(gl, pbo_id, tex_id, width, height):
     gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, pbo_id)
     gl.glBindTexture(gl.GL_TEXTURE_2D, tex_id)
     gl.glTexSubImage2D(
-        gl.GL_TEXTURE_2D, 0, 0, 0, width, height,
-        gl.GL_RGBA, gl.GL_UNSIGNED_BYTE, None,
+        gl.GL_TEXTURE_2D,
+        0,
+        0,
+        0,
+        width,
+        height,
+        gl.GL_RGBA,
+        gl.GL_UNSIGNED_BYTE,
+        None,
     )
     gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, 0)
 
@@ -303,10 +334,7 @@ def on_key_press(symbol, _modifiers):
         elif symbol == key.F:
             # Filter mode is baked at TextureObject creation time. Swapping
             # it means closing the old one and building a new one.
-            state["filter"] = (
-                FilterMode.LINEAR if state["filter"] == FilterMode.POINT
-                else FilterMode.POINT
-            )
+            state["filter"] = FilterMode.LINEAR if state["filter"] == FilterMode.POINT else FilterMode.POINT
             tex.close()
             tex = make_texture(arr, state["filter"])
         elif symbol == key.R:
diff --git a/cuda_core/examples/gl_interop_lenia.py b/cuda_core/examples/gl_interop_lenia.py
index 4a16689987f..2d44e470253 100644
--- a/cuda_core/examples/gl_interop_lenia.py
+++ b/cuda_core/examples/gl_interop_lenia.py
@@ -112,8 +112,8 @@
 
 from cuda.core import (
     AddressMode,
-    CUDAArray,
     ArrayFormat,
+    CUDAArray,
     Device,
     FilterMode,
     GraphicsResource,
@@ -595,10 +595,7 @@ def on_draw():
         now = time.monotonic()
         if now - fps_time >= 1.0:
             fps = frame_count / (now - fps_time)
-            window.set_caption(
-                "cuda.core CUDAArray/Texture/Surface - Lenia"
-                f" ({WIDTH}x{HEIGHT}, R={R}, {fps:.0f} FPS)"
-            )
+            window.set_caption(f"cuda.core CUDAArray/Texture/Surface - Lenia ({WIDTH}x{HEIGHT}, R={R}, {fps:.0f} FPS)")
             frame_count = 0
             fps_time = now
 
diff --git a/cuda_core/examples/gl_interop_mandelbrot.py b/cuda_core/examples/gl_interop_mandelbrot.py
index 7b333980c42..73671d77e95 100644
--- a/cuda_core/examples/gl_interop_mandelbrot.py
+++ b/cuda_core/examples/gl_interop_mandelbrot.py
@@ -105,8 +105,8 @@
 
 from cuda.core import (
     AddressMode,
-    CUDAArray,
     ArrayFormat,
+    CUDAArray,
     Device,
     FilterMode,
     GraphicsResource,
@@ -527,7 +527,7 @@ def on_draw():
                 config,
                 kernel,
                 np.uint64(palette_tex.handle),  # bindless texture handle
-                buf.handle,                     # output PBO (RGBA8)
+                buf.handle,  # output PBO (RGBA8)
                 np.int32(WIDTH),
                 np.int32(HEIGHT),
                 np.float64(view["cx"]),
diff --git a/cuda_core/examples/gl_interop_mipmap_lod.py b/cuda_core/examples/gl_interop_mipmap_lod.py
index a5c6f55cf7c..227ce5c5f65 100644
--- a/cuda_core/examples/gl_interop_mipmap_lod.py
+++ b/cuda_core/examples/gl_interop_mipmap_lod.py
@@ -94,7 +94,6 @@
 
 from cuda.core import (
     AddressMode,
-    CUDAArray,
     ArrayFormat,
     Device,
     FilterMode,
@@ -134,8 +133,7 @@ def _check_compute_capability(dev):
     cc = dev.compute_capability
     if cc.major < 3:
         print(
-            f"This example requires compute capability >= 3.0, "
-            f"got sm_{cc.major}{cc.minor}.",
+            f"This example requires compute capability >= 3.0, got sm_{cc.major}{cc.minor}.",
             file=sys.stderr,
         )
         sys.exit(1)
@@ -207,9 +205,9 @@ def build_mipmap_pyramid(mip, num_levels, stream, kernels):
     # at the end of their `with` blocks.
     src_tex_desc = TextureDescriptor(
         address_mode=AddressMode.CLAMP,
-        filter_mode=FilterMode.POINT,        # explicit per-texel reads
+        filter_mode=FilterMode.POINT,  # explicit per-texel reads
         read_mode=ReadMode.ELEMENT_TYPE,
-        normalized_coords=False,             # integer pixel coordinates
+        normalized_coords=False,  # integer pixel coordinates
     )
     for level in range(1, num_levels):
         parent_size = BASE_SIZE >> (level - 1)
@@ -220,9 +218,10 @@ def build_mipmap_pyramid(mip, num_levels, stream, kernels):
         src_arr = mip.get_level(level - 1)
         dst_arr = mip.get_level(level)
         src_res = ResourceDescriptor.from_array(src_arr)
-        with TextureObject.from_descriptor(
-            resource=src_res, texture_descriptor=src_tex_desc
-        ) as src_tex, SurfaceObject.from_array(dst_arr) as dst_surf:
+        with (
+            TextureObject.from_descriptor(resource=src_res, texture_descriptor=src_tex_desc) as src_tex,
+            SurfaceObject.from_array(dst_arr) as dst_surf,
+        ):
             block = (16, 16, 1)
             grid = (
                 (level_size + block[0] - 1) // block[0],
@@ -282,12 +281,30 @@ def create_display_resources(gl, width, height):
     quad_verts = np.array(
         [
             # x,  y,    s, t      (position + texture coordinate)
-            -1, -1, 0, 0,
-             1, -1, 1, 0,
-             1,  1, 1, 1,
-            -1, -1, 0, 0,
-             1,  1, 1, 1,
-            -1,  1, 0, 1,
+            -1,
+            -1,
+            0,
+            0,
+            1,
+            -1,
+            1,
+            0,
+            1,
+            1,
+            1,
+            1,
+            -1,
+            -1,
+            0,
+            0,
+            1,
+            1,
+            1,
+            1,
+            -1,
+            1,
+            0,
+            1,
         ],
         dtype=np.float32,
     )
@@ -409,7 +426,7 @@ def main():
         filter_mode=FilterMode.LINEAR,
         read_mode=ReadMode.ELEMENT_TYPE,
         normalized_coords=True,
-        mipmap_filter_mode=FilterMode.LINEAR,    # trilinear
+        mipmap_filter_mode=FilterMode.LINEAR,  # trilinear
         mipmap_level_bias=0.0,
         min_mipmap_level_clamp=0.0,
         max_mipmap_level_clamp=float(num_levels - 1),
@@ -489,24 +506,20 @@ def on_draw():
             fps_time[0] = now
 
     @window.event
-    def on_mouse_scroll(x, y, scroll_x, scroll_y):
+    def on_mouse_scroll(_x, _y, _scroll_x, scroll_y):
         # One wheel step changes zoom by ~12.5%. Clamped to keep LOD in range.
         if scroll_y == 0:
             return
-        factor = 1.125 ** scroll_y
+        factor = 1.125**scroll_y
         state["zoom"] = max(1.0 / 64.0, min(64.0, state["zoom"] * factor))
 
     @window.event
-    def on_key_press(symbol, modifiers):
+    def on_key_press(symbol, _modifiers):
         key = pyglet.window.key
         if symbol == key.BRACKETLEFT:
-            state["lod_bias"] = max(
-                -float(num_levels), state["lod_bias"] - LOD_BIAS_STEP
-            )
+            state["lod_bias"] = max(-float(num_levels), state["lod_bias"] - LOD_BIAS_STEP)
         elif symbol == key.BRACKETRIGHT:
-            state["lod_bias"] = min(
-                float(num_levels), state["lod_bias"] + LOD_BIAS_STEP
-            )
+            state["lod_bias"] = min(float(num_levels), state["lod_bias"] + LOD_BIAS_STEP)
         elif symbol == key.R:
             state["zoom"] = 1.0
             state["lod_bias"] = 0.0
diff --git a/cuda_core/examples/gl_interop_ocean.py b/cuda_core/examples/gl_interop_ocean.py
index aaea9cd88aa..1ededb081a8 100644
--- a/cuda_core/examples/gl_interop_ocean.py
+++ b/cuda_core/examples/gl_interop_ocean.py
@@ -88,8 +88,8 @@
 
 from cuda.core import (
     AddressMode,
-    CUDAArray,
     ArrayFormat,
+    CUDAArray,
     Device,
     FilterMode,
     GraphicsResource,
@@ -123,10 +123,10 @@
 DEFAULT_PRESET = "2"
 
 # Initial camera (orbit-around-origin) parameters.
-INITIAL_YAW = 0.6        # radians around world-y
-INITIAL_PITCH = 0.35     # radians above the horizon (small positive = looking down)
-INITIAL_DISTANCE = 5.0   # camera distance from origin
-PITCH_LIMIT = 1.4        # clamp |pitch| to keep basis non-degenerate (< pi/2)
+INITIAL_YAW = 0.6  # radians around world-y
+INITIAL_PITCH = 0.35  # radians above the horizon (small positive = looking down)
+INITIAL_DISTANCE = 5.0  # camera distance from origin
+PITCH_LIMIT = 1.4  # clamp |pitch| to keep basis non-degenerate (< pi/2)
 ZOOM_MIN = 1.5
 ZOOM_MAX = 30.0
 
@@ -231,12 +231,30 @@ def create_display_resources(gl, width, height):
     # Fullscreen quad (two triangles covering the entire window).
     quad_verts = np.array(
         [
-            -1, -1, 0, 0,
-             1, -1, 1, 0,
-             1,  1, 1, 1,
-            -1, -1, 0, 0,
-             1,  1, 1, 1,
-            -1,  1, 0, 1,
+            -1,
+            -1,
+            0,
+            0,
+            1,
+            -1,
+            1,
+            0,
+            1,
+            1,
+            1,
+            1,
+            -1,
+            -1,
+            0,
+            0,
+            1,
+            1,
+            1,
+            1,
+            -1,
+            1,
+            0,
+            1,
         ],
         dtype=np.float32,
     )
@@ -270,8 +288,15 @@ def create_display_resources(gl, width, height):
     gl.glTexParameteri(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_MIN_FILTER, gl.GL_LINEAR)
     gl.glTexParameteri(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_MAG_FILTER, gl.GL_LINEAR)
     gl.glTexImage2D(
-        gl.GL_TEXTURE_2D, 0, gl.GL_RGBA8, width, height, 0,
-        gl.GL_RGBA, gl.GL_UNSIGNED_BYTE, None,
+        gl.GL_TEXTURE_2D,
+        0,
+        gl.GL_RGBA8,
+        width,
+        height,
+        0,
+        gl.GL_RGBA,
+        gl.GL_UNSIGNED_BYTE,
+        None,
     )
     return shader_prog, vao.value, tex.value
 
@@ -292,8 +317,15 @@ def copy_pbo_to_texture(gl, pbo_id, tex_id, width, height):
     gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, pbo_id)
     gl.glBindTexture(gl.GL_TEXTURE_2D, tex_id)
     gl.glTexSubImage2D(
-        gl.GL_TEXTURE_2D, 0, 0, 0, width, height,
-        gl.GL_RGBA, gl.GL_UNSIGNED_BYTE, None,
+        gl.GL_TEXTURE_2D,
+        0,
+        0,
+        0,
+        width,
+        height,
+        gl.GL_RGBA,
+        gl.GL_UNSIGNED_BYTE,
+        None,
     )
     gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, 0)
 
@@ -425,9 +457,7 @@ def on_draw():
         # (b) Render the scene: sample the heightmap through the texture,
         #     estimate normals via finite differences, shade with Phong +
         #     Fresnel sky reflection, write RGBA8 into the OpenGL PBO.
-        cam_x, cam_y, cam_z = orbit_camera_position(
-            state["yaw"], state["pitch"], state["distance"]
-        )
+        cam_x, cam_y, cam_z = orbit_camera_position(state["yaw"], state["pitch"], state["distance"])
         with resource.map(stream=stream) as buf:
             launch(
                 stream,
@@ -465,17 +495,17 @@ def on_draw():
 
     # --- Mouse: drag to orbit, scroll to zoom ------------------------------
     @window.event
-    def on_mouse_press(x, y, button, modifiers):
+    def on_mouse_press(_x, _y, button, _modifiers):
         if button == pyglet.window.mouse.LEFT:
             state["drag"] = True
 
     @window.event
-    def on_mouse_release(x, y, button, modifiers):
+    def on_mouse_release(_x, _y, button, _modifiers):
         if button == pyglet.window.mouse.LEFT:
             state["drag"] = False
 
     @window.event
-    def on_mouse_drag(x, y, dx, dy, buttons, modifiers):
+    def on_mouse_drag(_x, _y, dx, dy, buttons, _modifiers):
         if not (buttons & pyglet.window.mouse.LEFT):
             return
         # Rotate yaw on horizontal drag, pitch on vertical drag. The yaw
@@ -490,7 +520,7 @@ def on_mouse_drag(x, y, dx, dy, buttons, modifiers):
             state["pitch"] = -PITCH_LIMIT
 
     @window.event
-    def on_mouse_scroll(x, y, scroll_x, scroll_y):
+    def on_mouse_scroll(_x, _y, _scroll_x, scroll_y):
         # Geometric zoom in camera distance; clamp to a sensible range.
         factor = 1.1 ** (-scroll_y)
         new_d = state["distance"] * factor
@@ -498,7 +528,7 @@ def on_mouse_scroll(x, y, scroll_x, scroll_y):
 
     # --- Keyboard: 1/2/3 weather presets, P pauses, Escape exits ----------
     @window.event
-    def on_key_press(symbol, modifiers):
+    def on_key_press(symbol, _modifiers):
         key = pyglet.window.key
         if symbol == key.ESCAPE:
             window.close()
diff --git a/cuda_core/examples/gl_interop_reaction_diffusion.py b/cuda_core/examples/gl_interop_reaction_diffusion.py
index 12a59b9be03..dc34b213bd1 100644
--- a/cuda_core/examples/gl_interop_reaction_diffusion.py
+++ b/cuda_core/examples/gl_interop_reaction_diffusion.py
@@ -89,8 +89,8 @@
 
 from cuda.core import (
     AddressMode,
-    CUDAArray,
     ArrayFormat,
+    CUDAArray,
     Device,
     FilterMode,
     GraphicsResource,
@@ -470,7 +470,7 @@ def on_draw():
         nonlocal frame_count, fps_time
 
         window.clear()
-        F, k, _label = PRESETS[state["preset"]]
+        f, k, _label = PRESETS[state["preset"]]
 
         # (a) Run N_STEPS Gray-Scott iterations. Each step reads from one
         #     array via a TextureObject (LINEAR + WRAP gives wrapping +
@@ -487,7 +487,7 @@ def on_draw():
                 np.int32(HEIGHT),
                 np.float32(DU),
                 np.float32(DV),
-                np.float32(F),
+                np.float32(f),
                 np.float32(k),
                 np.float32(DT),
             )
diff --git a/cuda_core/examples/gl_interop_sdf_volume.py b/cuda_core/examples/gl_interop_sdf_volume.py
index 75c3b6518f2..7fbf8dcd1fe 100644
--- a/cuda_core/examples/gl_interop_sdf_volume.py
+++ b/cuda_core/examples/gl_interop_sdf_volume.py
@@ -72,7 +72,6 @@
 # ///
 
 import ctypes
-import math
 import sys
 import time
 
@@ -80,8 +79,8 @@
 
 from cuda.core import (
     AddressMode,
-    CUDAArray,
     ArrayFormat,
+    CUDAArray,
     Device,
     FilterMode,
     GraphicsResource,
@@ -101,13 +100,13 @@
 # ---------------------------------------------------------------------------
 WIDTH = 800
 HEIGHT = 600
-VOLUME_SIZE = 128   # 128^3 voxels; bake cost is one-shot.
+VOLUME_SIZE = 128  # 128^3 voxels; bake cost is one-shot.
 
 # Camera defaults / clamps.
 RESET_YAW = 0.0
 RESET_PITCH = 0.3
 RESET_DIST = 2.5
-PITCH_MIN = -1.45    # stay inside (-pi/2, pi/2) so the up-vector stays sane.
+PITCH_MIN = -1.45  # stay inside (-pi/2, pi/2) so the up-vector stays sane.
 PITCH_MAX = 1.45
 DIST_MIN = 1.2
 DIST_MAX = 8.0
@@ -127,8 +126,7 @@ def _check_compute_capability(dev):
     cc = dev.compute_capability
     if cc.major < 3:
         print(
-            f"This example requires compute capability >= 3.0, "
-            f"got sm_{cc.major}{cc.minor}.",
+            f"This example requires compute capability >= 3.0, got sm_{cc.major}{cc.minor}.",
             file=sys.stderr,
         )
         sys.exit(1)
@@ -249,12 +247,30 @@ def create_display_resources(gl, width, height):
     quad_verts = np.array(
         [
             # x,  y,    s, t      (position + texture coordinate)
-            -1, -1, 0, 0,
-             1, -1, 1, 0,
-             1,  1, 1, 1,
-            -1, -1, 0, 0,
-             1,  1, 1, 1,
-            -1,  1, 0, 1,
+            -1,
+            -1,
+            0,
+            0,
+            1,
+            -1,
+            1,
+            0,
+            1,
+            1,
+            1,
+            1,
+            -1,
+            -1,
+            0,
+            0,
+            1,
+            1,
+            1,
+            1,
+            -1,
+            1,
+            0,
+            1,
         ],
         dtype=np.float32,
     )
@@ -436,15 +452,15 @@ def on_draw():
             )
 
     @window.event
-    def on_mouse_drag(x, y, dx, dy, buttons, modifiers):
+    def on_mouse_drag(_x, _y, dx, dy, buttons, _modifiers):
         # Left-click drag orbits the camera. dx -> yaw (sign convention chosen
         # so that dragging right rotates the scene right); dy -> pitch (drag
         # up tilts the camera up).
         if not (buttons & pyglet.window.mouse.LEFT):
             return
-        ORBIT_SCALE = 0.005
-        cam["yaw"] += dx * ORBIT_SCALE
-        cam["pitch"] += dy * ORBIT_SCALE
+        orbit_scale = 0.005
+        cam["yaw"] += dx * orbit_scale
+        cam["pitch"] += dy * orbit_scale
         # Clamp pitch so the up-vector never flips (we use world-up (0,1,0)).
         if cam["pitch"] < PITCH_MIN:
             cam["pitch"] = PITCH_MIN
@@ -452,19 +468,19 @@ def on_mouse_drag(x, y, dx, dy, buttons, modifiers):
             cam["pitch"] = PITCH_MAX
 
     @window.event
-    def on_mouse_scroll(x, y, scroll_x, scroll_y):
+    def on_mouse_scroll(_x, _y, _scroll_x, scroll_y):
         # Scroll wheel zoom: geometric so each tick feels uniform regardless
         # of current distance. Positive scroll_y (wheel up) zooms in.
         if scroll_y == 0:
             return
-        cam["dist"] *= 0.9 ** scroll_y
+        cam["dist"] *= 0.9**scroll_y
         if cam["dist"] < DIST_MIN:
             cam["dist"] = DIST_MIN
         elif cam["dist"] > DIST_MAX:
             cam["dist"] = DIST_MAX
 
     @window.event
-    def on_key_press(symbol, modifiers):
+    def on_key_press(symbol, _modifiers):
         key = pyglet.window.key
         if symbol == key.ESCAPE:
             window.close()
diff --git a/cuda_core/examples/gl_interop_texture_filter.py b/cuda_core/examples/gl_interop_texture_filter.py
index aafe0e0d4c1..27c8bcb99fa 100644
--- a/cuda_core/examples/gl_interop_texture_filter.py
+++ b/cuda_core/examples/gl_interop_texture_filter.py
@@ -68,8 +68,8 @@
 
 from cuda.core import (
     AddressMode,
-    CUDAArray,
     ArrayFormat,
+    CUDAArray,
     Device,
     FilterMode,
     GraphicsResource,
@@ -130,8 +130,8 @@ def make_pattern(width, height):
     img[..., 3] = 255
 
     # Two diagonal red lines.
-    diag1 = (xs == ys)
-    diag2 = (xs == (width - 1 - ys))
+    diag1 = xs == ys
+    diag2 = xs == (width - 1 - ys)
     red_mask = diag1 | diag2
     img[red_mask] = (255, 0, 0, 255)
 
@@ -141,8 +141,8 @@ def make_pattern(width, height):
     grad = np.linspace(0, 255, width, dtype=np.uint8)
     for row in range(g_y, min(g_y + g_h, height)):
         img[row, :, 0] = 0
-        img[row, :, 1] = grad             # G ramps up
-        img[row, :, 2] = 255 - grad       # B ramps down
+        img[row, :, 1] = grad  # G ramps up
+        img[row, :, 2] = 255 - grad  # B ramps down
         img[row, :, 3] = 255
 
     # Two "text-like" thin rectangles, alternating bright/dim.
@@ -151,8 +151,7 @@ def fill_rect(y0, y1, x0, x1, rgba):
 
     bar_y = (3 * height) // 4
     fill_rect(bar_y, bar_y + 4, width // 8, (width * 3) // 8, (255, 255, 0, 255))
-    fill_rect(bar_y + 8, bar_y + 12, (width * 5) // 8, (width * 7) // 8,
-              (0, 255, 255, 255))
+    fill_rect(bar_y + 8, bar_y + 12, (width * 5) // 8, (width * 7) // 8, (0, 255, 255, 255))
 
     return np.ascontiguousarray(img)
 
@@ -178,12 +177,8 @@ def make_textures(array, address_mode):
         read_mode=ReadMode.NORMALIZED_FLOAT,
         normalized_coords=False,
     )
-    tex_point = TextureObject.from_descriptor(
-        resource=res_desc, texture_descriptor=point_desc
-    )
-    tex_linear = TextureObject.from_descriptor(
-        resource=res_desc, texture_descriptor=linear_desc
-    )
+    tex_point = TextureObject.from_descriptor(resource=res_desc, texture_descriptor=point_desc)
+    tex_linear = TextureObject.from_descriptor(resource=res_desc, texture_descriptor=linear_desc)
     return tex_point, tex_linear
 
 
@@ -245,12 +240,30 @@ def create_display_resources(gl, width, height):
     # Fullscreen quad (two triangles).  Each vertex: x, y, s, t.
     quad_verts = np.array(
         [
-            -1, -1, 0, 0,
-             1, -1, 1, 0,
-             1,  1, 1, 1,
-            -1, -1, 0, 0,
-             1,  1, 1, 1,
-            -1,  1, 0, 1,
+            -1,
+            -1,
+            0,
+            0,
+            1,
+            -1,
+            1,
+            0,
+            1,
+            1,
+            1,
+            1,
+            -1,
+            -1,
+            0,
+            0,
+            1,
+            1,
+            1,
+            1,
+            -1,
+            1,
+            0,
+            1,
         ],
         dtype=np.float32,
     )
@@ -316,8 +329,15 @@ def copy_pbo_to_texture(gl, pbo_id, tex_id, width, height):
     gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, pbo_id)
     gl.glBindTexture(gl.GL_TEXTURE_2D, tex_id)
     gl.glTexSubImage2D(
-        gl.GL_TEXTURE_2D, 0, 0, 0, width, height,
-        gl.GL_RGBA, gl.GL_UNSIGNED_BYTE, None,
+        gl.GL_TEXTURE_2D,
+        0,
+        0,
+        0,
+        width,
+        height,
+        gl.GL_RGBA,
+        gl.GL_UNSIGNED_BYTE,
+        None,
     )
     gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, 0)
 
@@ -373,9 +393,7 @@ def main():
     ) as arr:
         pattern = make_pattern(SRC_W, SRC_H)
         # Sanity: 256 * 256 * 4 bytes = 262144.
-        assert pattern.nbytes == arr.size_bytes, (
-            f"pattern bytes ({pattern.nbytes}) != array bytes ({arr.size_bytes})"
-        )
+        assert pattern.nbytes == arr.size_bytes, f"pattern bytes ({pattern.nbytes}) != array bytes ({arr.size_bytes})"
         arr.copy_from(pattern, stream=stream)
         stream.sync()  # upload must finish before kernel reads
 
@@ -472,17 +490,17 @@ def on_draw():
 
         # --- Mouse: drag to pan, scroll to zoom ------------------------------
         @window.event
-        def on_mouse_press(x, y, button, modifiers):
+        def on_mouse_press(_x, _y, button, _modifiers):
             if button == pyglet.window.mouse.LEFT:
                 view["drag"] = True
 
         @window.event
-        def on_mouse_release(x, y, button, modifiers):
+        def on_mouse_release(_x, _y, button, _modifiers):
             if button == pyglet.window.mouse.LEFT:
                 view["drag"] = False
 
         @window.event
-        def on_mouse_drag(x, y, dx, dy, buttons, modifiers):
+        def on_mouse_drag(_x, _y, dx, dy, buttons, _modifiers):
             if not (buttons & pyglet.window.mouse.LEFT):
                 return
             # Pyglet dy is screen-up-positive; texture y is texel-down-positive.
@@ -491,15 +509,15 @@ def on_mouse_drag(x, y, dx, dy, buttons, modifiers):
             view["pan_y"] += dy / view["zoom"]
 
         @window.event
-        def on_mouse_scroll(x, y, scroll_x, scroll_y):
+        def on_mouse_scroll(_x, _y, _scroll_x, scroll_y):
             # Geometric zoom; clamp to a sensible range.
-            factor = 1.1 ** scroll_y
+            factor = 1.1**scroll_y
             new_zoom = view["zoom"] * factor
             view["zoom"] = max(0.1, min(32.0, new_zoom))
 
         # --- Keyboard: M cycles address mode, R resets view ------------------
         @window.event
-        def on_key_press(symbol, modifiers):
+        def on_key_press(symbol, _modifiers):
             key = pyglet.window.key
             if symbol == key.M:
                 tex_state["mode_idx"] = (tex_state["mode_idx"] + 1) % len(ADDRESS_MODES)
diff --git a/cuda_core/examples/texture_sample.py b/cuda_core/examples/texture_sample.py
index 3ed168cf0f7..78e9a463b89 100644
--- a/cuda_core/examples/texture_sample.py
+++ b/cuda_core/examples/texture_sample.py
@@ -23,8 +23,8 @@
 
 from cuda.core import (
     AddressMode,
-    CUDAArray,
     ArrayFormat,
+    CUDAArray,
     Device,
     FilterMode,
     LaunchConfig,
@@ -61,8 +61,6 @@ def main():
     dev.set_current()
     stream = dev.create_stream()
 
-    coords_buf = None
-    out_buf = None
     pinned_mr = LegacyPinnedMemoryResource()
     try:
         # Allocate a 2D CUDAArray: shape=(W, H), single-channel float32.
@@ -94,12 +92,8 @@ def main():
                 read_mode=ReadMode.ELEMENT_TYPE,
                 normalized_coords=False,
             )
-            with TextureObject.from_descriptor(
-                resource=res_desc, texture_descriptor=tex_desc
-            ) as tex:
-                _run_kernel_and_verify(
-                    dev, stream, tex, pattern, width, height, pinned_mr
-                )
+            with TextureObject.from_descriptor(resource=res_desc, texture_descriptor=tex_desc) as tex:
+                _run_kernel_and_verify(dev, stream, tex, pattern, width, height, pinned_mr)
     finally:
         stream.close()
 
diff --git a/cuda_core/tests/test_texture_surface.py b/cuda_core/tests/test_texture_surface.py
index b9359fdf818..46337ebbab3 100644
--- a/cuda_core/tests/test_texture_surface.py
+++ b/cuda_core/tests/test_texture_surface.py
@@ -8,8 +8,8 @@
 import cuda.core
 from cuda.core import (
     AddressMode,
-    CUDAArray,
     ArrayFormat,
+    CUDAArray,
     Device,
     FilterMode,
     MipmappedArray,
@@ -42,9 +42,7 @@ def test_resource_descriptor_init_disabled():
 
 
 def test_array_2d_create_and_properties(init_cuda):
-    arr = CUDAArray.from_descriptor(
-        shape=(32, 16), format=ArrayFormat.FLOAT32, num_channels=1
-    )
+    arr = CUDAArray.from_descriptor(shape=(32, 16), format=ArrayFormat.FLOAT32, num_channels=1)
     try:
         assert arr.shape == (32, 16)
         assert arr.format == ArrayFormat.FLOAT32
@@ -80,9 +78,7 @@ def test_array_rejects_bad_channels(init_cuda):
 
 def test_array_rejects_bad_rank(init_cuda):
     with pytest.raises(ValueError, match="shape rank"):
-        CUDAArray.from_descriptor(
-            shape=(2, 2, 2, 2), format=ArrayFormat.UINT8, num_channels=1
-        )
+        CUDAArray.from_descriptor(shape=(2, 2, 2, 2), format=ArrayFormat.UINT8, num_channels=1)
 
 
 def test_array_roundtrip_copy(init_cuda):
@@ -90,9 +86,7 @@ def test_array_roundtrip_copy(init_cuda):
 
     device = Device()
     stream = device.create_stream()
-    arr = CUDAArray.from_descriptor(
-        shape=(16,), format=ArrayFormat.UINT32, num_channels=1
-    )
+    arr = CUDAArray.from_descriptor(shape=(16,), format=ArrayFormat.UINT32, num_channels=1)
     try:
         src = _array.array("I", list(range(16)))
         dst = _array.array("I", [0] * 16)
@@ -112,9 +106,7 @@ def test_array_copy_rejects_undersized_host_buffer(init_cuda):
 
     device = Device()
     stream = device.create_stream()
-    arr = CUDAArray.from_descriptor(
-        shape=(16,), format=ArrayFormat.UINT32, num_channels=1
-    )
+    arr = CUDAArray.from_descriptor(shape=(16,), format=ArrayFormat.UINT32, num_channels=1)
     try:
         # arr is 16 * 4 = 64 bytes; pass an 8-element (32-byte) host buffer.
         too_small = _array.array("I", [0] * 8)
@@ -130,9 +122,7 @@ def test_array_copy_rejects_undersized_host_buffer(init_cuda):
 def test_array_copy_rejects_undersized_device_buffer(init_cuda):
     device = Device()
     stream = device.create_stream()
-    arr = CUDAArray.from_descriptor(
-        shape=(16,), format=ArrayFormat.UINT32, num_channels=1
-    )
+    arr = CUDAArray.from_descriptor(shape=(16,), format=ArrayFormat.UINT32, num_channels=1)
     # arr is 64 bytes; allocate a 32-byte device buffer.
     small_buf = device.memory_resource.allocate(32, stream=device.default_stream)
     try:
@@ -147,9 +137,7 @@ def test_array_copy_rejects_undersized_device_buffer(init_cuda):
 
 
 def test_texture_object_create(init_cuda):
-    arr = CUDAArray.from_descriptor(
-        shape=(32, 16), format=ArrayFormat.FLOAT32, num_channels=1
-    )
+    arr = CUDAArray.from_descriptor(shape=(32, 16), format=ArrayFormat.FLOAT32, num_channels=1)
     try:
         res = ResourceDescriptor.from_array(arr)
         tex_desc = TextureDescriptor(
@@ -188,9 +176,7 @@ def test_surface_object_create(init_cuda):
 
 
 def test_surface_requires_ldst_flag(init_cuda):
-    arr = CUDAArray.from_descriptor(
-        shape=(8, 8), format=ArrayFormat.UINT8, num_channels=4
-    )
+    arr = CUDAArray.from_descriptor(shape=(8, 8), format=ArrayFormat.UINT8, num_channels=4)
     try:
         with pytest.raises(ValueError, match="surface_load_store=True"):
             SurfaceObject.from_array(arr)
@@ -204,24 +190,26 @@ def test_address_mode_normalization(init_cuda):
     from cuda.core._texture import _normalize_address_modes
 
     assert _normalize_address_modes(AddressMode.WRAP) == (
-        AddressMode.WRAP, AddressMode.WRAP, AddressMode.WRAP,
+        AddressMode.WRAP,
+        AddressMode.WRAP,
+        AddressMode.WRAP,
     )
     assert _normalize_address_modes((AddressMode.WRAP, AddressMode.CLAMP)) == (
-        AddressMode.WRAP, AddressMode.CLAMP, AddressMode.CLAMP,
+        AddressMode.WRAP,
+        AddressMode.CLAMP,
+        AddressMode.CLAMP,
+    )
+    assert _normalize_address_modes((AddressMode.WRAP, AddressMode.CLAMP, AddressMode.MIRROR)) == (
+        AddressMode.WRAP,
+        AddressMode.CLAMP,
+        AddressMode.MIRROR,
     )
-    assert _normalize_address_modes(
-        (AddressMode.WRAP, AddressMode.CLAMP, AddressMode.MIRROR)
-    ) == (AddressMode.WRAP, AddressMode.CLAMP, AddressMode.MIRROR)
 
     # Smoke test: a 2-entry tuple is also accepted end-to-end.
-    arr = CUDAArray.from_descriptor(
-        shape=(8, 8, 4), format=ArrayFormat.FLOAT32, num_channels=1
-    )
+    arr = CUDAArray.from_descriptor(shape=(8, 8, 4), format=ArrayFormat.FLOAT32, num_channels=1)
     try:
         res = ResourceDescriptor.from_array(arr)
-        tex_desc = TextureDescriptor(
-            address_mode=(AddressMode.WRAP, AddressMode.CLAMP)
-        )
+        tex_desc = TextureDescriptor(address_mode=(AddressMode.WRAP, AddressMode.CLAMP))
         tex = TextureObject.from_descriptor(resource=res, texture_descriptor=tex_desc)
         try:
             assert tex.handle != 0
@@ -233,6 +221,7 @@ def test_address_mode_normalization(init_cuda):
 
 # --- Linear / pitch2D resource descriptors -----------------------------------
 
+
 def _alloc_device_buffer(device, nbytes):
     """Allocate a device Buffer using the device's default memory resource."""
     return device.memory_resource.allocate(nbytes, stream=device.default_stream)
@@ -242,9 +231,7 @@ def test_resource_descriptor_from_linear_defaults_size(init_cuda):
     device = Device()
     buf = _alloc_device_buffer(device, 4096)
     try:
-        res = ResourceDescriptor.from_linear(
-            buf, format=ArrayFormat.FLOAT32, num_channels=1
-        )
+        res = ResourceDescriptor.from_linear(buf, format=ArrayFormat.FLOAT32, num_channels=1)
         assert res.kind == "linear"
         assert res.format == ArrayFormat.FLOAT32
         assert res.num_channels == 1
@@ -259,9 +246,7 @@ def test_resource_descriptor_from_linear_size_override(init_cuda):
     device = Device()
     buf = _alloc_device_buffer(device, 4096)
     try:
-        res = ResourceDescriptor.from_linear(
-            buf, format=ArrayFormat.UINT32, num_channels=1, size_bytes=2048
-        )
+        res = ResourceDescriptor.from_linear(buf, format=ArrayFormat.UINT32, num_channels=1, size_bytes=2048)
         assert res._size_bytes == 2048
     finally:
         buf.close()
@@ -272,9 +257,7 @@ def test_resource_descriptor_from_linear_rejects_oversize(init_cuda):
     buf = _alloc_device_buffer(device, 1024)
     try:
         with pytest.raises(ValueError, match="exceeds buffer.size"):
-            ResourceDescriptor.from_linear(
-                buf, format=ArrayFormat.UINT8, num_channels=1, size_bytes=2048
-            )
+            ResourceDescriptor.from_linear(buf, format=ArrayFormat.UINT8, num_channels=1, size_bytes=2048)
     finally:
         buf.close()
 
@@ -284,18 +267,14 @@ def test_resource_descriptor_from_linear_rejects_bad_channels(init_cuda):
     buf = _alloc_device_buffer(device, 1024)
     try:
         with pytest.raises(ValueError, match="num_channels"):
-            ResourceDescriptor.from_linear(
-                buf, format=ArrayFormat.UINT8, num_channels=3
-            )
+            ResourceDescriptor.from_linear(buf, format=ArrayFormat.UINT8, num_channels=3)
     finally:
         buf.close()
 
 
 def test_resource_descriptor_from_linear_rejects_non_buffer():
     with pytest.raises(TypeError, match="Buffer"):
-        ResourceDescriptor.from_linear(
-            object(), format=ArrayFormat.UINT8, num_channels=1
-        )
+        ResourceDescriptor.from_linear(object(), format=ArrayFormat.UINT8, num_channels=1)
 
 
 def test_resource_descriptor_from_linear_rejects_zero_size(init_cuda):
@@ -303,9 +282,7 @@ def test_resource_descriptor_from_linear_rejects_zero_size(init_cuda):
     buf = _alloc_device_buffer(device, 1024)
     try:
         with pytest.raises(ValueError, match="at least one element"):
-            ResourceDescriptor.from_linear(
-                buf, format=ArrayFormat.UINT32, num_channels=1, size_bytes=0
-            )
+            ResourceDescriptor.from_linear(buf, format=ArrayFormat.UINT32, num_channels=1, size_bytes=0)
     finally:
         buf.close()
 
@@ -316,9 +293,7 @@ def test_resource_descriptor_from_linear_rejects_non_multiple(init_cuda):
     try:
         # UINT32 x 1 channel = 4 bytes/element; 10 bytes is not a multiple.
         with pytest.raises(ValueError, match="multiple of element size"):
-            ResourceDescriptor.from_linear(
-                buf, format=ArrayFormat.UINT32, num_channels=1, size_bytes=10
-            )
+            ResourceDescriptor.from_linear(buf, format=ArrayFormat.UINT32, num_channels=1, size_bytes=10)
     finally:
         buf.close()
 
@@ -330,9 +305,7 @@ def test_texture_object_from_linear(init_cuda):
     # 1024 float elements
     buf = _alloc_device_buffer(device, 1024 * 4)
     try:
-        res = ResourceDescriptor.from_linear(
-            buf, format=ArrayFormat.FLOAT32, num_channels=1
-        )
+        res = ResourceDescriptor.from_linear(buf, format=ArrayFormat.FLOAT32, num_channels=1)
         tex = TextureObject.from_descriptor(resource=res, texture_descriptor=TextureDescriptor())
         try:
             assert tex.handle != 0
@@ -416,9 +389,7 @@ def test_surface_rejects_linear_and_pitch2d(init_cuda):
     device = Device()
     buf = _alloc_device_buffer(device, 4096)
     try:
-        res_lin = ResourceDescriptor.from_linear(
-            buf, format=ArrayFormat.UINT32, num_channels=1
-        )
+        res_lin = ResourceDescriptor.from_linear(buf, format=ArrayFormat.UINT32, num_channels=1)
         with pytest.raises(ValueError, match="array-backed"):
             SurfaceObject.from_descriptor(resource=res_lin)
 
@@ -438,10 +409,9 @@ def test_surface_rejects_linear_and_pitch2d(init_cuda):
 
 # --- MipmappedArray ----------------------------------------------------------
 
+
 def test_mipmapped_array_init_disabled():
-    with pytest.raises(
-        RuntimeError, match=r"^MipmappedArray cannot be instantiated directly"
-    ):
+    with pytest.raises(RuntimeError, match=r"^MipmappedArray cannot be instantiated directly"):
         cuda.core._mipmapped_array.MipmappedArray()
 
 
@@ -502,9 +472,7 @@ def test_mipmapped_array_get_level_halves_dims(init_cuda):
             try:
                 # Each dim halves per level, with a floor of 1; rank is preserved.
                 expected = tuple(max(1, dim >> level) for dim in shape)
-                assert lvl.shape == expected, (
-                    f"level={level}: expected {expected}, got {lvl.shape}"
-                )
+                assert lvl.shape == expected, f"level={level}: expected {expected}, got {lvl.shape}"
             finally:
                 lvl.close()
     finally:
@@ -626,13 +594,8 @@ def test_mipmapped_array_level_keeps_parent_alive(init_cuda):
     assert lvl.handle != 0
     referents = gc.get_referents(lvl)
     parents = [r for r in referents if isinstance(r, MipmappedArray)]
-    assert len(parents) == 1, (
-        f"level CUDAArray should reference exactly one MipmappedArray parent, got "
-        f"{parents!r}"
-    )
-    assert id(parents[0]) == parent_id, (
-        "level CUDAArray's parent ref is not the original MipmappedArray"
-    )
+    assert len(parents) == 1, f"level CUDAArray should reference exactly one MipmappedArray parent, got {parents!r}"
+    assert id(parents[0]) == parent_id, "level CUDAArray's parent ref is not the original MipmappedArray"
     # Closing the level drops its parent ref. Don't access the parent past
     # this point; cuMipmappedArrayDestroy may then run.
     lvl.close()
@@ -640,6 +603,7 @@ def test_mipmapped_array_level_keeps_parent_alive(init_cuda):
 
 # --- Negative-path validation tests ------------------------------------------
 
+
 def test_array_from_descriptor_rejects_bad_format(init_cuda):
     with pytest.raises(TypeError, match="format must be an ArrayFormat"):
         CUDAArray.from_descriptor(shape=(8,), format=0, num_channels=1)
@@ -652,17 +616,14 @@ def test_array_from_descriptor_rejects_non_iterable_shape(init_cuda):
 
 def test_array_from_descriptor_rejects_zero_dim(init_cuda):
     with pytest.raises(ValueError, match=r"shape\[1\] must be >= 1"):
-        CUDAArray.from_descriptor(
-            shape=(8, 0), format=ArrayFormat.UINT8, num_channels=1
-        )
+        CUDAArray.from_descriptor(shape=(8, 0), format=ArrayFormat.UINT8, num_channels=1)
 
 
 def test_array_copy_rejects_non_stream(init_cuda):
-    arr = CUDAArray.from_descriptor(
-        shape=(8,), format=ArrayFormat.UINT8, num_channels=1
-    )
+    arr = CUDAArray.from_descriptor(shape=(8,), format=ArrayFormat.UINT8, num_channels=1)
     try:
         import array as _array
+
         buf = _array.array("B", [0] * 8)
         with pytest.raises(TypeError, match="stream must be a Stream"):
             arr.copy_from(buf, stream="not-a-stream")
@@ -746,50 +707,36 @@ def test_resource_descriptor_from_pitch2d_rejects_zero_dims(init_cuda):
 
 def test_mipmapped_array_rejects_bad_format(init_cuda):
     with pytest.raises(TypeError, match="format must be an ArrayFormat"):
-        MipmappedArray.from_descriptor(
-            shape=(8, 8), format=0, num_channels=1, num_levels=2
-        )
+        MipmappedArray.from_descriptor(shape=(8, 8), format=0, num_channels=1, num_levels=2)
 
 
 def test_mipmapped_array_rejects_bad_channels(init_cuda):
     with pytest.raises(ValueError, match="num_channels"):
-        MipmappedArray.from_descriptor(
-            shape=(8, 8), format=ArrayFormat.UINT8, num_channels=3, num_levels=2
-        )
+        MipmappedArray.from_descriptor(shape=(8, 8), format=ArrayFormat.UINT8, num_channels=3, num_levels=2)
 
 
 def test_mipmapped_array_rejects_zero_dim(init_cuda):
     with pytest.raises(ValueError, match=r"shape\[0\] must be >= 1"):
-        MipmappedArray.from_descriptor(
-            shape=(0, 8), format=ArrayFormat.UINT8, num_channels=1, num_levels=1
-        )
+        MipmappedArray.from_descriptor(shape=(0, 8), format=ArrayFormat.UINT8, num_channels=1, num_levels=1)
 
 
 def test_texture_object_rejects_non_resource_descriptor(init_cuda):
     with pytest.raises(TypeError, match="resource must be a ResourceDescriptor"):
-        TextureObject.from_descriptor(
-            resource=object(), texture_descriptor=TextureDescriptor()
-        )
+        TextureObject.from_descriptor(resource=object(), texture_descriptor=TextureDescriptor())
 
 
 def test_texture_object_rejects_non_texture_descriptor(init_cuda):
-    arr = CUDAArray.from_descriptor(
-        shape=(8, 8), format=ArrayFormat.FLOAT32, num_channels=1
-    )
+    arr = CUDAArray.from_descriptor(shape=(8, 8), format=ArrayFormat.FLOAT32, num_channels=1)
     try:
         res = ResourceDescriptor.from_array(arr)
-        with pytest.raises(
-            TypeError, match="texture_descriptor must be a TextureDescriptor"
-        ):
+        with pytest.raises(TypeError, match="texture_descriptor must be a TextureDescriptor"):
             TextureObject.from_descriptor(resource=res, texture_descriptor="nope")
     finally:
         arr.close()
 
 
 def test_texture_object_rejects_bad_filter_mode(init_cuda):
-    arr = CUDAArray.from_descriptor(
-        shape=(8, 8), format=ArrayFormat.FLOAT32, num_channels=1
-    )
+    arr = CUDAArray.from_descriptor(shape=(8, 8), format=ArrayFormat.FLOAT32, num_channels=1)
     try:
         res = ResourceDescriptor.from_array(arr)
         td = TextureDescriptor(filter_mode=0)  # int, not FilterMode
@@ -800,9 +747,7 @@ def test_texture_object_rejects_bad_filter_mode(init_cuda):
 
 
 def test_texture_object_rejects_bad_read_mode(init_cuda):
-    arr = CUDAArray.from_descriptor(
-        shape=(8, 8), format=ArrayFormat.FLOAT32, num_channels=1
-    )
+    arr = CUDAArray.from_descriptor(shape=(8, 8), format=ArrayFormat.FLOAT32, num_channels=1)
     try:
         res = ResourceDescriptor.from_array(arr)
         td = TextureDescriptor(read_mode=0)  # int, not ReadMode
@@ -813,24 +758,18 @@ def test_texture_object_rejects_bad_read_mode(init_cuda):
 
 
 def test_texture_object_rejects_bad_mipmap_filter_mode(init_cuda):
-    arr = CUDAArray.from_descriptor(
-        shape=(8, 8), format=ArrayFormat.FLOAT32, num_channels=1
-    )
+    arr = CUDAArray.from_descriptor(shape=(8, 8), format=ArrayFormat.FLOAT32, num_channels=1)
     try:
         res = ResourceDescriptor.from_array(arr)
         td = TextureDescriptor(mipmap_filter_mode=0)  # int, not FilterMode
-        with pytest.raises(
-            TypeError, match="mipmap_filter_mode must be a FilterMode"
-        ):
+        with pytest.raises(TypeError, match="mipmap_filter_mode must be a FilterMode"):
             TextureObject.from_descriptor(resource=res, texture_descriptor=td)
     finally:
         arr.close()
 
 
 def test_texture_object_rejects_negative_anisotropy(init_cuda):
-    arr = CUDAArray.from_descriptor(
-        shape=(8, 8), format=ArrayFormat.FLOAT32, num_channels=1
-    )
+    arr = CUDAArray.from_descriptor(shape=(8, 8), format=ArrayFormat.FLOAT32, num_channels=1)
     try:
         res = ResourceDescriptor.from_array(arr)
         td = TextureDescriptor(max_anisotropy=-1)
@@ -841,9 +780,7 @@ def test_texture_object_rejects_negative_anisotropy(init_cuda):
 
 
 def test_texture_object_rejects_bad_border_color_length(init_cuda):
-    arr = CUDAArray.from_descriptor(
-        shape=(8, 8), format=ArrayFormat.FLOAT32, num_channels=1
-    )
+    arr = CUDAArray.from_descriptor(shape=(8, 8), format=ArrayFormat.FLOAT32, num_channels=1)
     try:
         res = ResourceDescriptor.from_array(arr)
         td = TextureDescriptor(border_color=(0.0, 0.0))  # length 2, not 4
@@ -854,9 +791,7 @@ def test_texture_object_rejects_bad_border_color_length(init_cuda):
 
 
 def test_address_mode_rejects_non_addressmode_scalar(init_cuda):
-    arr = CUDAArray.from_descriptor(
-        shape=(8, 8), format=ArrayFormat.FLOAT32, num_channels=1
-    )
+    arr = CUDAArray.from_descriptor(shape=(8, 8), format=ArrayFormat.FLOAT32, num_channels=1)
     try:
         res = ResourceDescriptor.from_array(arr)
         td = TextureDescriptor(address_mode=42)  # int, not AddressMode / iterable
@@ -867,9 +802,7 @@ def test_address_mode_rejects_non_addressmode_scalar(init_cuda):
 
 
 def test_address_mode_rejects_empty_tuple(init_cuda):
-    arr = CUDAArray.from_descriptor(
-        shape=(8, 8), format=ArrayFormat.FLOAT32, num_channels=1
-    )
+    arr = CUDAArray.from_descriptor(shape=(8, 8), format=ArrayFormat.FLOAT32, num_channels=1)
     try:
         res = ResourceDescriptor.from_array(arr)
         td = TextureDescriptor(address_mode=())
@@ -880,16 +813,10 @@ def test_address_mode_rejects_empty_tuple(init_cuda):
 
 
 def test_address_mode_rejects_too_long_tuple(init_cuda):
-    arr = CUDAArray.from_descriptor(
-        shape=(8, 8), format=ArrayFormat.FLOAT32, num_channels=1
-    )
+    arr = CUDAArray.from_descriptor(shape=(8, 8), format=ArrayFormat.FLOAT32, num_channels=1)
     try:
         res = ResourceDescriptor.from_array(arr)
-        td = TextureDescriptor(
-            address_mode=(
-                AddressMode.WRAP, AddressMode.WRAP, AddressMode.WRAP, AddressMode.WRAP
-            )
-        )
+        td = TextureDescriptor(address_mode=(AddressMode.WRAP, AddressMode.WRAP, AddressMode.WRAP, AddressMode.WRAP))
         with pytest.raises(ValueError, match="address_mode tuple must have 1-3"):
             TextureObject.from_descriptor(resource=res, texture_descriptor=td)
     finally:
@@ -897,9 +824,7 @@ def test_address_mode_rejects_too_long_tuple(init_cuda):
 
 
 def test_address_mode_rejects_non_addressmode_entry(init_cuda):
-    arr = CUDAArray.from_descriptor(
-        shape=(8, 8), format=ArrayFormat.FLOAT32, num_channels=1
-    )
+    arr = CUDAArray.from_descriptor(shape=(8, 8), format=ArrayFormat.FLOAT32, num_channels=1)
     try:
         res = ResourceDescriptor.from_array(arr)
         td = TextureDescriptor(address_mode=(AddressMode.WRAP, "bad", AddressMode.CLAMP))
@@ -913,13 +838,9 @@ def test_texture_object_keeps_backing_array_alive(init_cuda):
     """Dropping the local references to the backing CUDAArray and the
     ResourceDescriptor must NOT invalidate an existing TextureObject. The
     TextureObject holds a strong ref through its _source_ref slot."""
-    arr = CUDAArray.from_descriptor(
-        shape=(8, 8), format=ArrayFormat.FLOAT32, num_channels=1
-    )
+    arr = CUDAArray.from_descriptor(shape=(8, 8), format=ArrayFormat.FLOAT32, num_channels=1)
     res = ResourceDescriptor.from_array(arr)
-    tex = TextureObject.from_descriptor(
-        resource=res, texture_descriptor=TextureDescriptor()
-    )
+    tex = TextureObject.from_descriptor(resource=res, texture_descriptor=TextureDescriptor())
     # Verify the keepalive chain via gc referents: TextureObject -> _source_ref
     # -> ResourceDescriptor -> _source -> CUDAArray. We can only walk one level
     # at a time, so check tex's referents include the ResourceDescriptor.
@@ -931,8 +852,7 @@ def test_texture_object_keeps_backing_array_alive(init_cuda):
     referents = gc.get_referents(tex)
     res_refs = [r for r in referents if id(r) == res_id]
     assert len(res_refs) == 1, (
-        f"TextureObject should still reference the ResourceDescriptor; "
-        f"got referents {referents!r}"
+        f"TextureObject should still reference the ResourceDescriptor; got referents {referents!r}"
     )
     res_back = res_refs[0]
     arr_refs = [r for r in gc.get_referents(res_back) if id(r) == arr_id]
@@ -961,8 +881,6 @@ def test_surface_object_keeps_backing_array_alive(init_cuda):
     res_objs = [r for r in referents if isinstance(r, ResourceDescriptor)]
     assert len(res_objs) == 1
     arr_refs = [r for r in gc.get_referents(res_objs[0]) if id(r) == arr_id]
-    assert len(arr_refs) == 1, (
-        "SurfaceObject should still reference its backing CUDAArray via the ResourceDescriptor"
-    )
+    assert len(arr_refs) == 1, "SurfaceObject should still reference its backing CUDAArray via the ResourceDescriptor"
     assert surf.handle != 0
     surf.close()

From 79823083d6489aa2ec0ba31d4c5184596aa77dbb Mon Sep 17 00:00:00 2001
From: Rob Parolin <rparolin@nvidia.com>
Date: Wed, 10 Jun 2026 13:40:06 -0700
Subject: [PATCH 13/17] cuda.core: rename surface_load_store ctor keyword to
 is_surface_load_store

Closes the open detail on design issue #2188 item #6: the read-back property
is already `is_surface_load_store`, so rename the `from_descriptor` keyword on
both `CUDAArray` and `MipmappedArray` to match, giving one symmetric name for
set and read-back (following the existing `StridedMemoryView(is_readonly=...)`
precedent). Updates call sites in tests and GL examples, the SurfaceObject
error message + docstrings, and regenerates the .pyi stubs. The unrelated
GraphicsResource `"surface_load_store"` register-flag string is left untouched.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 cuda_core/cuda/core/_array.pyi                      |  4 ++--
 cuda_core/cuda/core/_array.pyx                      |  8 ++++----
 cuda_core/cuda/core/_mipmapped_array.pyi            |  4 ++--
 cuda_core/cuda/core/_mipmapped_array.pyx            |  8 ++++----
 cuda_core/cuda/core/_surface.pyi                    |  6 +++---
 cuda_core/cuda/core/_surface.pyx                    |  8 ++++----
 cuda_core/cuda/core/_texture.pyi                    |  2 +-
 cuda_core/cuda/core/_texture.pyx                    |  2 +-
 cuda_core/examples/gl_interop_fire.py               |  4 ++--
 cuda_core/examples/gl_interop_lenia.py              | 10 +++++-----
 cuda_core/examples/gl_interop_mipmap_lod.py         |  6 +++---
 cuda_core/examples/gl_interop_ocean.py              |  2 +-
 cuda_core/examples/gl_interop_reaction_diffusion.py |  8 ++++----
 cuda_core/examples/gl_interop_sdf_volume.py         |  6 +++---
 cuda_core/tests/test_texture_surface.py             | 10 +++++-----
 15 files changed, 44 insertions(+), 44 deletions(-)

diff --git a/cuda_core/cuda/core/_array.pyi b/cuda_core/cuda/core/_array.pyi
index 3c765c60c02..7dcf4ad5a0a 100644
--- a/cuda_core/cuda/core/_array.pyi
+++ b/cuda_core/cuda/core/_array.pyi
@@ -41,7 +41,7 @@ class CUDAArray:
         ...
 
     @classmethod
-    def from_descriptor(cls, *, shape, format, num_channels, surface_load_store=False):
+    def from_descriptor(cls, *, shape, format, num_channels, is_surface_load_store=False):
         """Allocate a new CUDA array.
 
         Parameters
@@ -53,7 +53,7 @@ class CUDAArray:
             Element format.
         num_channels : int
             Channels per element. Must be 1, 2, or 4.
-        surface_load_store : bool
+        is_surface_load_store : bool
             If True, allocate with ``CUDA_ARRAY3D_SURFACE_LDST`` so the array
             can be bound as a :class:`SurfaceObject` for kernel-side writes.
             Default False.
diff --git a/cuda_core/cuda/core/_array.pyx b/cuda_core/cuda/core/_array.pyx
index d17129e275b..851f8cb9bf0 100644
--- a/cuda_core/cuda/core/_array.pyx
+++ b/cuda_core/cuda/core/_array.pyx
@@ -217,7 +217,7 @@ cdef class CUDAArray:
         )
 
     @classmethod
-    def from_descriptor(cls, *, shape, format, num_channels, surface_load_store=False):
+    def from_descriptor(cls, *, shape, format, num_channels, is_surface_load_store=False):
         """Allocate a new CUDA array.
 
         Parameters
@@ -229,7 +229,7 @@ cdef class CUDAArray:
             Element format.
         num_channels : int
             Channels per element. Must be 1, 2, or 4.
-        surface_load_store : bool
+        is_surface_load_store : bool
             If True, allocate with ``CUDA_ARRAY3D_SURFACE_LDST`` so the array
             can be bound as a :class:`SurfaceObject` for kernel-side writes.
             Default False.
@@ -258,7 +258,7 @@ cdef class CUDAArray:
         self._shape = shape_t
         self._format = <cydriver.CUarray_format><int>format
         self._num_channels = num_channels
-        self._surface_load_store = bool(surface_load_store)
+        self._surface_load_store = bool(is_surface_load_store)
         self._context = _get_current_context_ptr()
         self._device_id = _get_current_device_id()
         self._parent_ref = None
@@ -268,7 +268,7 @@ cdef class CUDAArray:
         cdef cydriver.CUDA_ARRAY_DESCRIPTOR desc2d
         cdef int rank = len(shape_t)
         cdef unsigned int flags = (
-            cydriver.CUDA_ARRAY3D_SURFACE_LDST if surface_load_store else 0
+            cydriver.CUDA_ARRAY3D_SURFACE_LDST if is_surface_load_store else 0
         )
 
         # cuArrayCreate (2D path) does not accept flags; use the 3D descriptor
diff --git a/cuda_core/cuda/core/_mipmapped_array.pyi b/cuda_core/cuda/core/_mipmapped_array.pyi
index 06ca4615c40..20460037aa6 100644
--- a/cuda_core/cuda/core/_mipmapped_array.pyi
+++ b/cuda_core/cuda/core/_mipmapped_array.pyi
@@ -27,7 +27,7 @@ class MipmappedArray:
         ...
 
     @classmethod
-    def from_descriptor(cls, *, shape, format, num_channels, num_levels, surface_load_store=False):
+    def from_descriptor(cls, *, shape, format, num_channels, num_levels, is_surface_load_store=False):
         """Allocate a new mipmapped CUDA array.
 
         Parameters
@@ -43,7 +43,7 @@ class MipmappedArray:
             Number of mip levels to allocate; must be >= 1. The driver caps
             this at the log2 of the largest dimension; passing a larger value
             yields a driver error.
-        surface_load_store : bool
+        is_surface_load_store : bool
             If True, allocate with ``CUDA_ARRAY3D_SURFACE_LDST`` so individual
             levels (obtained via :meth:`get_level`) can be bound as
             :class:`SurfaceObject` for kernel-side writes. Default False.
diff --git a/cuda_core/cuda/core/_mipmapped_array.pyx b/cuda_core/cuda/core/_mipmapped_array.pyx
index a8a308933f0..cbf6d70732c 100644
--- a/cuda_core/cuda/core/_mipmapped_array.pyx
+++ b/cuda_core/cuda/core/_mipmapped_array.pyx
@@ -38,7 +38,7 @@ cdef class MipmappedArray:
 
     @classmethod
     def from_descriptor(
-        cls, *, shape, format, num_channels, num_levels, surface_load_store=False
+        cls, *, shape, format, num_channels, num_levels, is_surface_load_store=False
     ):
         """Allocate a new mipmapped CUDA array.
 
@@ -55,7 +55,7 @@ cdef class MipmappedArray:
             Number of mip levels to allocate; must be >= 1. The driver caps
             this at the log2 of the largest dimension; passing a larger value
             yields a driver error.
-        surface_load_store : bool
+        is_surface_load_store : bool
             If True, allocate with ``CUDA_ARRAY3D_SURFACE_LDST`` so individual
             levels (obtained via :meth:`get_level`) can be bound as
             :class:`SurfaceObject` for kernel-side writes. Default False.
@@ -89,7 +89,7 @@ cdef class MipmappedArray:
         self._format = <cydriver.CUarray_format><int>format
         self._num_channels = num_channels
         self._num_levels = <unsigned int>levels
-        self._surface_load_store = bool(surface_load_store)
+        self._surface_load_store = bool(is_surface_load_store)
         self._context = _get_current_context_ptr()
         self._device_id = _get_current_device_id()
 
@@ -97,7 +97,7 @@ cdef class MipmappedArray:
         cdef cydriver.CUDA_ARRAY3D_DESCRIPTOR desc3d
         cdef int rank = len(shape_t)
         cdef unsigned int flags = (
-            cydriver.CUDA_ARRAY3D_SURFACE_LDST if surface_load_store else 0
+            cydriver.CUDA_ARRAY3D_SURFACE_LDST if is_surface_load_store else 0
         )
         cdef unsigned int c_levels = <unsigned int>levels
 
diff --git a/cuda_core/cuda/core/_surface.pyi b/cuda_core/cuda/core/_surface.pyi
index 8961f8ce82a..9f86054a49c 100644
--- a/cuda_core/cuda/core/_surface.pyi
+++ b/cuda_core/cuda/core/_surface.pyi
@@ -11,7 +11,7 @@ class SurfaceObject:
     kernels read and write through it using integer pixel coordinates.
 
     The backing :class:`CUDAArray` must have been created with
-    ``surface_load_store=True`` and is kept alive for the lifetime of this
+    ``is_surface_load_store=True`` and is kept alive for the lifetime of this
     object to prevent dangling handles.
 
     Construct via :meth:`from_array` or :meth:`from_descriptor`. Passes to
@@ -28,7 +28,7 @@ class SurfaceObject:
     def from_array(cls, array):
         """Create a surface object directly from an :class:`CUDAArray`.
 
-        The array must have been created with ``surface_load_store=True``.
+        The array must have been created with ``is_surface_load_store=True``.
         """
 
     @classmethod
@@ -39,7 +39,7 @@ class SurfaceObject:
         ----------
         resource : ResourceDescriptor
             Must wrap an :class:`CUDAArray` allocated with
-            ``surface_load_store=True``. Linear/pitch2d resources are not
+            ``is_surface_load_store=True``. Linear/pitch2d resources are not
             valid surface backings.
         """
 
diff --git a/cuda_core/cuda/core/_surface.pyx b/cuda_core/cuda/core/_surface.pyx
index 2fdd43efd74..383f99f0218 100644
--- a/cuda_core/cuda/core/_surface.pyx
+++ b/cuda_core/cuda/core/_surface.pyx
@@ -25,7 +25,7 @@ cdef class SurfaceObject:
     kernels read and write through it using integer pixel coordinates.
 
     The backing :class:`CUDAArray` must have been created with
-    ``surface_load_store=True`` and is kept alive for the lifetime of this
+    ``is_surface_load_store=True`` and is kept alive for the lifetime of this
     object to prevent dangling handles.
 
     Construct via :meth:`from_array` or :meth:`from_descriptor`. Passes to
@@ -42,7 +42,7 @@ cdef class SurfaceObject:
     def from_array(cls, array):
         """Create a surface object directly from an :class:`CUDAArray`.
 
-        The array must have been created with ``surface_load_store=True``.
+        The array must have been created with ``is_surface_load_store=True``.
         """
         if not isinstance(array, CUDAArray):
             raise TypeError(f"array must be an CUDAArray, got {type(array).__name__}")
@@ -56,7 +56,7 @@ cdef class SurfaceObject:
         ----------
         resource : ResourceDescriptor
             Must wrap an :class:`CUDAArray` allocated with
-            ``surface_load_store=True``. Linear/pitch2d resources are not
+            ``is_surface_load_store=True``. Linear/pitch2d resources are not
             valid surface backings.
         """
         if not isinstance(resource, ResourceDescriptor):
@@ -73,7 +73,7 @@ cdef class SurfaceObject:
         cdef CUDAArray arr = <CUDAArray>resource.source
         if not arr.is_surface_load_store:
             raise ValueError(
-                "CUDAArray must be created with surface_load_store=True to be "
+                "CUDAArray must be created with is_surface_load_store=True to be "
                 "bound as a SurfaceObject"
             )
 
diff --git a/cuda_core/cuda/core/_texture.pyi b/cuda_core/cuda/core/_texture.pyi
index 1365597e6e4..4f8543a00d0 100644
--- a/cuda_core/cuda/core/_texture.pyi
+++ b/cuda_core/cuda/core/_texture.pyi
@@ -46,7 +46,7 @@ class ResourceDescriptor:
       Supports filtering and 2D addressing, but only 2D access.
 
     Linear and pitch2D resources cannot back a :class:`SurfaceObject` — those
-    require an :class:`CUDAArray` allocated with ``surface_load_store=True``.
+    require an :class:`CUDAArray` allocated with ``is_surface_load_store=True``.
     """
     __slots__ = ('_kind', '_source', '_format', '_num_channels', '_size_bytes', '_width', '_height', '_pitch_bytes')
 
diff --git a/cuda_core/cuda/core/_texture.pyx b/cuda_core/cuda/core/_texture.pyx
index 18d828480a3..7b1e7301c98 100644
--- a/cuda_core/cuda/core/_texture.pyx
+++ b/cuda_core/cuda/core/_texture.pyx
@@ -72,7 +72,7 @@ class ResourceDescriptor:
       Supports filtering and 2D addressing, but only 2D access.
 
     Linear and pitch2D resources cannot back a :class:`SurfaceObject` — those
-    require an :class:`CUDAArray` allocated with ``surface_load_store=True``.
+    require an :class:`CUDAArray` allocated with ``is_surface_load_store=True``.
     """
 
     __slots__ = (
diff --git a/cuda_core/examples/gl_interop_fire.py b/cuda_core/examples/gl_interop_fire.py
index 21b861c442c..ad9008757eb 100644
--- a/cuda_core/examples/gl_interop_fire.py
+++ b/cuda_core/examples/gl_interop_fire.py
@@ -356,13 +356,13 @@ def make_heat_arrays():
         shape=(WIDTH, HEIGHT),
         format=ArrayFormat.UINT8,
         num_channels=1,
-        surface_load_store=True,
+        is_surface_load_store=True,
     )
     arr_b = CUDAArray.from_descriptor(
         shape=(WIDTH, HEIGHT),
         format=ArrayFormat.UINT8,
         num_channels=1,
-        surface_load_store=True,
+        is_surface_load_store=True,
     )
     return arr_a, arr_b
 
diff --git a/cuda_core/examples/gl_interop_lenia.py b/cuda_core/examples/gl_interop_lenia.py
index 2d44e470253..ea2d8dc36ae 100644
--- a/cuda_core/examples/gl_interop_lenia.py
+++ b/cuda_core/examples/gl_interop_lenia.py
@@ -18,7 +18,7 @@
 # =========================
 # - How to drive a wide-radius convolution from a TextureObject configured for
 #   LINEAR + WRAP + normalized coordinates. The same CUDAArray is then bound as a
-#   SurfaceObject for the typed write back, requiring `surface_load_store=True`
+#   SurfaceObject for the typed write back, requiring `is_surface_load_store=True`
 #   at allocation time.
 # - How a single-channel `float` CUDAArray differs from the multi-channel layout
 #   used in the Gray-Scott example: `num_channels=1`, `tex2D<float>` reads, and
@@ -405,20 +405,20 @@ def draw_fullscreen_quad(gl, shader_prog, vao_id, tex_id):
 def make_state_arrays():
     """Allocate the two single-channel `float` ping-pong arrays.
 
-    `surface_load_store=True` is what lets the same CUDAArray be bound as both a
+    `is_surface_load_store=True` is what lets the same CUDAArray be bound as both a
     TextureObject (sampled reads) and a SurfaceObject (typed writes).
     """
     arr_a = CUDAArray.from_descriptor(
         shape=(WIDTH, HEIGHT),
         format=ArrayFormat.FLOAT32,
         num_channels=1,
-        surface_load_store=True,
+        is_surface_load_store=True,
     )
     arr_b = CUDAArray.from_descriptor(
         shape=(WIDTH, HEIGHT),
         format=ArrayFormat.FLOAT32,
         num_channels=1,
-        surface_load_store=True,
+        is_surface_load_store=True,
     )
     return arr_a, arr_b
 
@@ -485,7 +485,7 @@ def main():
     resource = GraphicsResource.from_gl_buffer(pbo_id, flags="write_discard")
 
     # --- Step 6: Allocate the two ping-pong state Arrays ---
-    #     Both are single-channel `float` with `surface_load_store=True` so
+    #     Both are single-channel `float` with `is_surface_load_store=True` so
     #     they can be bound as SurfaceObjects.
     arr_a, arr_b = make_state_arrays()
 
diff --git a/cuda_core/examples/gl_interop_mipmap_lod.py b/cuda_core/examples/gl_interop_mipmap_lod.py
index 227ce5c5f65..9f71bad7a5c 100644
--- a/cuda_core/examples/gl_interop_mipmap_lod.py
+++ b/cuda_core/examples/gl_interop_mipmap_lod.py
@@ -50,7 +50,7 @@
 #
 #   STARTUP -- one-time mipmap build
 #   ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-#   1. Allocate MipmappedArray (10 levels, float4 RGBA, surface_load_store=True).
+#   1. Allocate MipmappedArray (10 levels, float4 RGBA, is_surface_load_store=True).
 #   2. Level 0: launch `seed_base` kernel -> SurfaceObject -> high-frequency
 #      procedural pattern.
 #   3. For L = 1..num_levels-1: launch `downsample` kernel:
@@ -404,14 +404,14 @@ def main():
     dev, stream, kernels, _arch = setup_cuda()
 
     # --- Step 2: Allocate the mipmap pyramid and build every level ---
-    #     surface_load_store=True is required for kernel-side writes.
+    #     is_surface_load_store=True is required for kernel-side writes.
     num_levels = int(math.log2(BASE_SIZE)) + 1
     mip = MipmappedArray.from_descriptor(
         shape=(BASE_SIZE, BASE_SIZE),
         format=ArrayFormat.FLOAT32,
         num_channels=4,
         num_levels=num_levels,
-        surface_load_store=True,
+        is_surface_load_store=True,
     )
     build_mipmap_pyramid(mip, num_levels, stream, kernels)
 
diff --git a/cuda_core/examples/gl_interop_ocean.py b/cuda_core/examples/gl_interop_ocean.py
index 1ededb081a8..2e01dd9cccf 100644
--- a/cuda_core/examples/gl_interop_ocean.py
+++ b/cuda_core/examples/gl_interop_ocean.py
@@ -346,7 +346,7 @@ def make_heightmap_array():
         shape=(GRID, GRID),
         format=ArrayFormat.FLOAT32,
         num_channels=1,
-        surface_load_store=True,
+        is_surface_load_store=True,
     )
 
 
diff --git a/cuda_core/examples/gl_interop_reaction_diffusion.py b/cuda_core/examples/gl_interop_reaction_diffusion.py
index dc34b213bd1..2c53f39f641 100644
--- a/cuda_core/examples/gl_interop_reaction_diffusion.py
+++ b/cuda_core/examples/gl_interop_reaction_diffusion.py
@@ -15,7 +15,7 @@
 
 # What this example teaches
 # =========================
-# - How to allocate a CUDA CUDAArray with `surface_load_store=True` so the same
+# - How to allocate a CUDA CUDAArray with `is_surface_load_store=True` so the same
 #   memory can be bound as both a TextureObject (for sampled reads) and a
 #   SurfaceObject (for typed writes).
 # - How to use FilterMode.LINEAR + AddressMode.WRAP + normalized coordinates
@@ -351,13 +351,13 @@ def make_state_arrays():
         shape=(WIDTH, HEIGHT),
         format=ArrayFormat.FLOAT32,
         num_channels=2,
-        surface_load_store=True,
+        is_surface_load_store=True,
     )
     arr_b = CUDAArray.from_descriptor(
         shape=(WIDTH, HEIGHT),
         format=ArrayFormat.FLOAT32,
         num_channels=2,
-        surface_load_store=True,
+        is_surface_load_store=True,
     )
     return arr_a, arr_b
 
@@ -418,7 +418,7 @@ def main():
 
     # --- Step 6: Allocate the two ping-pong state Arrays ---
     #     Both are `float2` (channel 0 = U, channel 1 = V) with
-    #     surface_load_store=True so they can be bound as SurfaceObjects.
+    #     is_surface_load_store=True so they can be bound as SurfaceObjects.
     arr_a, arr_b = make_state_arrays()
 
     # --- Step 7: Pre-create the four bindless handles ---
diff --git a/cuda_core/examples/gl_interop_sdf_volume.py b/cuda_core/examples/gl_interop_sdf_volume.py
index 7fbf8dcd1fe..20ecadb2244 100644
--- a/cuda_core/examples/gl_interop_sdf_volume.py
+++ b/cuda_core/examples/gl_interop_sdf_volume.py
@@ -44,7 +44,7 @@
 #
 #   STARTUP (one-shot bake)
 #   ~~~~~~~~~~~~~~~~~~~~~~~
-#   1. Allocate 3D CUDAArray (128^3, FLOAT32 x1, surface_load_store=True).
+#   1. Allocate 3D CUDAArray (128^3, FLOAT32 x1, is_surface_load_store=True).
 #   2. Bind it as a SurfaceObject.
 #   3. Launch `bake_sdf`: one thread per voxel writes the SDF via surf3Dwrite.
 #   4. Close the SurfaceObject; the CUDAArray stays alive.
@@ -161,7 +161,7 @@ def make_volume_array():
         shape=(VOLUME_SIZE, VOLUME_SIZE, VOLUME_SIZE),
         format=ArrayFormat.FLOAT32,
         num_channels=1,
-        surface_load_store=True,
+        is_surface_load_store=True,
     )
 
 
@@ -546,7 +546,7 @@ def on_close():
 //           into a single-channel float 3D CUDAArray via a SurfaceObject.
 //
 //   surf is bound to a (size^3, FLOAT32 x 1) CUDAArray allocated with
-//   surface_load_store=True.
+//   is_surface_load_store=True.
 //   surf3Dwrite's x coordinate is in BYTES (multiply by sizeof(float));
 //   y and z are in elements. Off-by-one on the byte conversion silently
 //   corrupts every other column, so it's worth flagging explicitly.
diff --git a/cuda_core/tests/test_texture_surface.py b/cuda_core/tests/test_texture_surface.py
index 46337ebbab3..d111a477232 100644
--- a/cuda_core/tests/test_texture_surface.py
+++ b/cuda_core/tests/test_texture_surface.py
@@ -61,7 +61,7 @@ def test_array_3d_with_surface_flag(init_cuda):
         shape=(8, 8, 4),
         format=ArrayFormat.UINT8,
         num_channels=4,
-        surface_load_store=True,
+        is_surface_load_store=True,
     )
     try:
         assert arr.shape == (8, 8, 4)
@@ -162,7 +162,7 @@ def test_surface_object_create(init_cuda):
         shape=(8, 8),
         format=ArrayFormat.UINT8,
         num_channels=4,
-        surface_load_store=True,
+        is_surface_load_store=True,
     )
     try:
         surf = SurfaceObject.from_array(arr)
@@ -178,7 +178,7 @@ def test_surface_object_create(init_cuda):
 def test_surface_requires_ldst_flag(init_cuda):
     arr = CUDAArray.from_descriptor(shape=(8, 8), format=ArrayFormat.UINT8, num_channels=4)
     try:
-        with pytest.raises(ValueError, match="surface_load_store=True"):
+        with pytest.raises(ValueError, match="is_surface_load_store=True"):
             SurfaceObject.from_array(arr)
     finally:
         arr.close()
@@ -560,7 +560,7 @@ def test_surface_rejects_mipmapped_array(init_cuda):
         format=ArrayFormat.UINT8,
         num_channels=4,
         num_levels=2,
-        surface_load_store=True,
+        is_surface_load_store=True,
     )
     try:
         res = ResourceDescriptor.from_mipmapped_array(mip)
@@ -868,7 +868,7 @@ def test_surface_object_keeps_backing_array_alive(init_cuda):
         shape=(8, 8),
         format=ArrayFormat.UINT8,
         num_channels=4,
-        surface_load_store=True,
+        is_surface_load_store=True,
     )
     surf = SurfaceObject.from_array(arr)
     arr_id = id(arr)

From 5673ddb4346cb8ebe6b7e5e0fd65f4f4f689bfc9 Mon Sep 17 00:00:00 2001
From: Rob Parolin <rparolin@nvidia.com>
Date: Wed, 10 Jun 2026 15:44:35 -0700
Subject: [PATCH 14/17] cuda.core: add 7 texture/surface graphics examples

Add seven new cuda.core GL-interop examples exercising the new
CUDAArray / MipmappedArray / SurfaceObject / TextureObject /
GraphicsResource APIs, each centered on a distinct feature and verified
on-GPU:

- gl_interop_fluid.py       Stable-Fluids ink: LINEAR advection, float4
                            dye, frame-rate-independent stepping
- gl_interop_physarum.py    slime-mold: Buffer agents, surface deposit /
                            texture sense, direction-hued veins
- gl_interop_clouds.py      3D CUDAArray + tex3D trilinear volumetric
                            raymarch with HG forward-scattering
- gl_interop_particles.py   VBO interop (from_gl_buffer) + baked
                            curl-noise TextureObject, additive points
- gl_interop_bloom.py       MipmappedArray get_level + per-level surface
                            downsample + tex2DLod composite, live LOD
- gl_interop_jfa_voronoi.py POINT-filtered JFA, AddressMode.BORDER +
                            border_color sentinel
- gl_interop_caustics.py    UINT8 background sampled LINEAR + MIRROR +
                            sRGB + max_anisotropy, chromatic dispersion

Each example documents which cuda.core APIs it uses via a code->API
comment map and a live config string in the window caption. All seven
are registered (display-gated) in test_basic_examples.py.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 cuda_core/examples/gl_interop_bloom.py        |  793 +++++++++++
 cuda_core/examples/gl_interop_caustics.py     |  745 ++++++++++
 cuda_core/examples/gl_interop_clouds.py       |  991 +++++++++++++
 cuda_core/examples/gl_interop_fluid.py        | 1251 +++++++++++++++++
 cuda_core/examples/gl_interop_jfa_voronoi.py  |  940 +++++++++++++
 cuda_core/examples/gl_interop_particles.py    |  688 +++++++++
 cuda_core/examples/gl_interop_physarum.py     |  889 ++++++++++++
 .../example_tests/test_basic_examples.py      |    7 +
 8 files changed, 6304 insertions(+)
 create mode 100644 cuda_core/examples/gl_interop_bloom.py
 create mode 100644 cuda_core/examples/gl_interop_caustics.py
 create mode 100644 cuda_core/examples/gl_interop_clouds.py
 create mode 100644 cuda_core/examples/gl_interop_fluid.py
 create mode 100644 cuda_core/examples/gl_interop_jfa_voronoi.py
 create mode 100644 cuda_core/examples/gl_interop_particles.py
 create mode 100644 cuda_core/examples/gl_interop_physarum.py

diff --git a/cuda_core/examples/gl_interop_bloom.py b/cuda_core/examples/gl_interop_bloom.py
new file mode 100644
index 00000000000..66fa95f1f61
--- /dev/null
+++ b/cuda_core/examples/gl_interop_bloom.py
@@ -0,0 +1,793 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# ################################################################################
+#
+# This example demonstrates the cuda.core texture/surface stack used to build a
+# bloom / glow post-effect entirely on the GPU. An animated HDR-ish scene is
+# rendered into the base level of a MipmappedArray; the mip pyramid is then
+# built level by level via SurfaceObject writes (each level reads the one above
+# through its own LINEAR TextureObject); finally a single mipmapped
+# TextureObject samples several LODs with tex2DLod to composite a soft bloom on
+# top of the sharp scene. Requires pyglet.
+#
+# ################################################################################
+
+# What this example teaches
+# =========================
+# The least-demonstrated corner of the texture/surface API: the two halves of a
+# mip pyramid round-trip.
+#
+# - BUILD side: MipmappedArray.get_level(i) returns a NON-OWNING CUDAArray view
+#   of level i. Bind each level as its own SurfaceObject and have a kernel write
+#   into it. We downsample by reading level i-1 through a per-level LINEAR
+#   TextureObject (one bilinear tap == a 2x2 box average) and storing into
+#   level i through that level's SurfaceObject. This is a mip chain built
+#   *on the GPU*, not by the driver.
+# - SAMPLE side: ONE mipmapped TextureObject (FilterMode.LINEAR +
+#   mipmap_filter_mode=LINEAR, normalized coords) bound to the whole pyramid via
+#   ResourceDescriptor.from_mipmapped_array lets a single tex2DLod<float4> read
+#   any level -- the blurred coarse levels are exactly the glow.
+#
+# How it works
+# ============
+# Bloom is "blur the bright parts, add them back." A mip pyramid is a ready-made
+# multi-scale blur: each coarser level is a halved, box-filtered copy of the
+# level below, so reading a high LOD is reading a heavily blurred image.
+#
+#     level 0: 512 x 512   <- sharp animated scene (the emitters)
+#     level 1: 256 x 256       (downsampled via SurfaceObject write)
+#     level 2: 128 x 128
+#     ...
+#     level L-1: small        <- the softest, widest glow
+#
+#   PER FRAME (render loop)
+#   ~~~~~~~~~~~~~~~~~~~~~~~
+#   1. render_scene  -- writes an animated scene of moving bright emitters into
+#                       level 0 through its SurfaceObject (float4 RGBA, values
+#                       can exceed 1.0 in the hot spots).
+#   2. downsample    -- for i in 1..L-1, read level i-1 through its LINEAR
+#                       TextureObject and write level i through its
+#                       SurfaceObject. A single LINEAR tap at the midpoint of
+#                       the parent's 2x2 footprint *is* the box average.
+#   3. composite     -- one mipmapped TextureObject; tex2DLod at lod 0 gives the
+#                       sharp scene, and a weighted sum of lods 1..L-1 gives the
+#                       bloom. Tonemap with 1 - exp(-c*x) and write RGBA8 to the
+#                       OpenGL PBO.
+#
+#   surf2Dwrite indexes x in BYTES, so a float4 write uses x * sizeof(float4)
+#   (= x * 16). Getting this wrong silently corrupts every fourth column.
+#
+# What you should see
+# ===================
+# Several colored emitters orbiting on a dark background, each wrapped in a soft
+# glow. Bright cores bleed light into their surroundings.
+#
+#   +  /  =           bloom strength += 0.15
+#   -                 bloom strength -= 0.15
+#   [                 bloom threshold -= 0.05 (more of the scene glows)
+#   ]                 bloom threshold += 0.05 (only the brightest glow)
+#   ,  /  .           mipmap_level_bias -= / += 0.25 (sharper / softer glow)
+#   ;  /  '           LODs summed -= / += 1 (the live max-LOD clamp)
+#   B                 toggle bloom on / off (makes the effect obvious)
+#   R                 reset all controls
+#   Escape / close    quit
+#
+# The window title shows FPS plus the live mipmap LOD-selection config
+# (MipmappedArray level count, trilinear tex2DLod bias / clamp / LODs) and the
+# bloom strength, threshold, and on/off state.
+#
+
+# /// script
+# dependencies = ["cuda_bindings", "cuda_core>0.6.0", "pyglet"]
+# ///
+
+import ctypes
+import math
+import sys
+import time
+
+import numpy as np
+
+from cuda.core import (
+    AddressMode,
+    ArrayFormat,
+    Device,
+    FilterMode,
+    GraphicsResource,
+    LaunchConfig,
+    MipmappedArray,
+    Program,
+    ProgramOptions,
+    ReadMode,
+    ResourceDescriptor,
+    SurfaceObject,
+    TextureDescriptor,
+    TextureObject,
+    launch,
+)
+
+# ---------------------------------------------------------------------------
+# Configuration (feel free to change these)
+# ---------------------------------------------------------------------------
+WIDTH = 800
+HEIGHT = 600
+BASE_SIZE = 512  # Mip base-level edge length (power of two so levels halve cleanly).
+MAX_LEVELS = 7  # Modest cap on pyramid depth; bounded by log2(BASE_SIZE)+1.
+NUM_EMITTERS = 7
+
+BLOOM_STRENGTH_STEP = 0.15
+BLOOM_THRESHOLD_STEP = 0.05
+
+
+# ============================= Helper functions =============================
+#
+# The functions below set up CUDA, OpenGL, and the mip pyramid. If you're here
+# to learn about MipmappedArray / per-level SurfaceObject writes / mipmapped
+# TextureObject sampling, skip straight to main() -- the interesting part is
+# there. These helpers keep main() reading like a short story.
+# ============================================================================
+
+
+def _check_compute_capability(dev):
+    """Exit with status 1 unless `dev` is sm_30+ (surfaces + mipmapped arrays)."""
+    cc = dev.compute_capability
+    if cc.major >= 3:
+        return
+    # Too old for this example: explain on stderr, then stop.
+    message = f"This example requires compute capability >= 3.0, got sm_{cc.major}{cc.minor}."
+    print(message, file=sys.stderr)
+    sys.exit(1)
+
+
+def setup_cuda():
+    """Initialize device 0 and build the example's CUDA kernels.
+
+    Returns a (device, stream, kernels) tuple, where `kernels` maps each of
+    "render_scene", "downsample", and "composite" to its compiled kernel.
+    """
+    kernel_names = ("render_scene", "downsample", "composite")
+
+    # Make the device current and confirm it can run this example before
+    # creating the stream.
+    dev = Device(0)
+    dev.set_current()
+    _check_compute_capability(dev)
+    stream = dev.create_stream()
+
+    # All three kernels live in KERNEL_SOURCE, so one compile covers them.
+    options = ProgramOptions(std="c++17", arch=f"sm_{dev.arch}")
+    program = Program(KERNEL_SOURCE, code_type="c++", options=options)
+    module = program.compile("cubin", name_expressions=kernel_names)
+
+    kernels = {name: module.get_kernel(name) for name in kernel_names}
+    return dev, stream, kernels
+
+
+def make_level_grid(level_size, block):
+    """Return the 2D launch grid that tiles a square level_size-wide image."""
+    block_x, block_y = block[0], block[1]
+    # Ceiling division, so a partially filled trailing block is still launched.
+    blocks_across = (level_size + block_x - 1) // block_x
+    blocks_down = (level_size + block_y - 1) // block_y
+    return (blocks_across, blocks_down, 1)
+
+
+def create_window():
+    """Create the WIDTH x HEIGHT pyglet window (vsync disabled).
+
+    Returns (window, gl_module, pyglet). pyglet is imported here rather than
+    at module scope so a missing install produces an install hint on stderr
+    and exit status 1 instead of an ImportError traceback.
+    """
+    try:
+        import pyglet
+        from pyglet.gl import gl as gl_module
+    except ImportError:
+        hint = "This example requires pyglet >= 2.0.\nInstall it with:  pip install pyglet"
+        print(hint, file=sys.stderr)
+        sys.exit(1)
+
+    title = "cuda.core MipmappedArray - GPU mip-pyramid bloom"
+    window = pyglet.window.Window(WIDTH, HEIGHT, caption=title, vsync=False)
+
+    return window, gl_module, pyglet
+
+
+def create_display_resources(gl, width, height):
+    """Build the GL objects used to show a frame: a shader program, a fullscreen
+    quad, and an empty width x height RGBA8 texture refilled from a PBO each frame.
+
+    Not CUDA-specific. Returns (shader_program, vertex_array_id, texture_id).
+    """
+    from pyglet.graphics.shader import Shader, ShaderProgram
+
+    vert = Shader(VERTEX_SHADER_SOURCE, "vertex")
+    frag = Shader(FRAGMENT_SHADER_SOURCE, "fragment")
+    shader_prog = ShaderProgram(vert, frag)
+
+    quad_verts = np.array(
+        [
+            # x,  y,    s, t      (position + texture coordinate), one value per line
+            -1,  # triangle 1, vertex 0: (-1, -1) / (0, 0)
+            -1,
+            0,
+            0,
+            1,  # triangle 1, vertex 1: (1, -1) / (1, 0)
+            -1,
+            1,
+            0,
+            1,  # triangle 1, vertex 2: (1, 1) / (1, 1)
+            1,
+            1,
+            1,
+            -1,  # triangle 2, vertex 0: (-1, -1) / (0, 0)
+            -1,
+            0,
+            0,
+            1,  # triangle 2, vertex 1: (1, 1) / (1, 1)
+            1,
+            1,
+            1,
+            -1,  # triangle 2, vertex 2: (-1, 1) / (0, 1)
+            1,
+            0,
+            1,
+        ],
+        dtype=np.float32,
+    )
+
+    vao = ctypes.c_uint(0)
+    gl.glGenVertexArrays(1, ctypes.byref(vao))
+    gl.glBindVertexArray(vao.value)
+
+    vbo = ctypes.c_uint(0)
+    gl.glGenBuffers(1, ctypes.byref(vbo))
+    gl.glBindBuffer(gl.GL_ARRAY_BUFFER, vbo.value)
+    gl.glBufferData(
+        gl.GL_ARRAY_BUFFER,
+        quad_verts.nbytes,
+        quad_verts.ctypes.data_as(ctypes.c_void_p),
+        gl.GL_STATIC_DRAW,
+    )
+
+    stride = 4 * 4  # 4 floats * 4 bytes each
+    pos_loc = gl.glGetAttribLocation(shader_prog.id, b"position")
+    gl.glEnableVertexAttribArray(pos_loc)
+    gl.glVertexAttribPointer(pos_loc, 2, gl.GL_FLOAT, gl.GL_FALSE, stride, ctypes.c_void_p(0))
+
+    tc_loc = gl.glGetAttribLocation(shader_prog.id, b"texcoord")
+    gl.glEnableVertexAttribArray(tc_loc)
+    gl.glVertexAttribPointer(tc_loc, 2, gl.GL_FLOAT, gl.GL_FALSE, stride, ctypes.c_void_p(8))  # after x, y
+
+    gl.glBindVertexArray(0)
+
+    tex = ctypes.c_uint(0)
+    gl.glGenTextures(1, ctypes.byref(tex))
+    gl.glBindTexture(gl.GL_TEXTURE_2D, tex.value)
+    gl.glTexParameteri(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_MIN_FILTER, gl.GL_LINEAR)
+    gl.glTexParameteri(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_MAG_FILTER, gl.GL_LINEAR)
+    gl.glTexImage2D(
+        gl.GL_TEXTURE_2D,
+        0,
+        gl.GL_RGBA8,
+        width,
+        height,
+        0,
+        gl.GL_RGBA,
+        gl.GL_UNSIGNED_BYTE,
+        None,  # no initial pixel data: the texture starts empty
+    )
+
+    return shader_prog, vao.value, tex.value
+
+
+def create_pixel_buffer(gl, width, height):
+    """Create the RGBA8 PBO (the CUDA/GL bridge); return (pbo_gl_name, size_in_bytes)."""
+    target = gl.GL_PIXEL_UNPACK_BUFFER
+    size_in_bytes = width * height * 4  # 4 bytes per RGBA8 pixel
+
+    buffer_name = ctypes.c_uint(0)
+    gl.glGenBuffers(1, ctypes.byref(buffer_name))
+    gl.glBindBuffer(target, buffer_name.value)
+    # Reserve the storage without uploading anything, then unbind the PBO.
+    gl.glBufferData(target, size_in_bytes, None, gl.GL_DYNAMIC_DRAW)
+    gl.glBindBuffer(target, 0)
+    return buffer_name.value, size_in_bytes
+
+
+def copy_pbo_to_texture(gl, pbo_id, tex_id, width, height):
+    """Upload the PBO's pixels into the GL texture (a GPU-to-GPU transfer).
+
+    The PBO is bound as the pixel-unpack buffer only for the duration of the
+    upload; binding 0 afterwards clears that binding again.
+    """
+    unpack_target = gl.GL_PIXEL_UNPACK_BUFFER
+    tex_target = gl.GL_TEXTURE_2D
+    # Mip level 0 at offset (0, 0): replace the whole width x height image.
+    region = (0, 0, 0, width, height)
+    pixel_layout = (gl.GL_RGBA, gl.GL_UNSIGNED_BYTE)
+
+    gl.glBindBuffer(unpack_target, pbo_id)
+    gl.glBindTexture(tex_target, tex_id)
+    gl.glTexSubImage2D(tex_target, *region, *pixel_layout, None)
+    gl.glBindBuffer(unpack_target, 0)
+
+
+def draw_fullscreen_quad(gl, shader_prog, vao_id, tex_id):
+    """Draw texture `tex_id` with `shader_prog` using the fullscreen-quad VAO."""
+    gl.glUseProgram(shader_prog.id)
+    gl.glBindTexture(gl.GL_TEXTURE_2D, tex_id)
+    gl.glBindVertexArray(vao_id)
+    gl.glDrawArrays(gl.GL_TRIANGLES, 0, 6)  # 6 vertices = the quad's two triangles
+    gl.glBindVertexArray(0)  # unbind the VAO, then the program, after drawing
+    gl.glUseProgram(0)
+
+
+# ================================== main() ==================================
+
+
+def main():
+    """Build the mip pyramid, open the window, and run the bloom render loop."""
+    # --- Step 1: Set up CUDA (compile kernels, create stream) ---
+    dev, stream, kernels = setup_cuda()
+
+    # --- Step 2: Allocate the mip pyramid (single allocation, all levels) ---
+    #     is_surface_load_store=True is required so each level can back a
+    #     SurfaceObject for kernel-side writes. We cap the depth at MAX_LEVELS;
+    #     each level halves until 1x1 at most.
+    num_levels = min(int(math.log2(BASE_SIZE)) + 1, MAX_LEVELS)
+    mm = MipmappedArray.from_descriptor(
+        shape=(BASE_SIZE, BASE_SIZE),
+        format=ArrayFormat.FLOAT32,
+        num_channels=4,
+        num_levels=num_levels,
+        is_surface_load_store=True,
+    )
+
+    # --- Step 3: Pre-create per-level handles ONCE and keep them alive ---
+    #     For every level we build a SurfaceObject (to write into it) and a
+    #     non-mipmapped LINEAR TextureObject (so the downsample kernel can read
+    #     the level above with hardware bilinear). get_level(i) returns a
+    #     NON-OWNING view -- the storage belongs to `mm`, which we keep alive.
+    #     Building these per-frame would be wasteful and, worse, a handle closed
+    #     before its async launch runs would dangle.
+    level_sizes = [BASE_SIZE >> i for i in range(num_levels)]
+    level_arrays = [mm.get_level(i) for i in range(num_levels)]  # keep views alive
+
+    src_tex_desc = TextureDescriptor(
+        address_mode=AddressMode.CLAMP,
+        filter_mode=FilterMode.LINEAR,  # one bilinear tap == 2x2 box average
+        read_mode=ReadMode.ELEMENT_TYPE,
+        normalized_coords=False,  # integer/pixel coordinates for the box tap
+    )
+    level_surfaces = [SurfaceObject.from_array(arr) for arr in level_arrays]
+    level_textures = [
+        TextureObject.from_descriptor(
+            resource=ResourceDescriptor.from_array(arr),
+            texture_descriptor=src_tex_desc,
+        )
+        for arr in level_arrays
+    ]
+
+    # --- Step 4: One mipmapped TextureObject over the WHOLE pyramid ---
+    #     This is the sample side: tex2DLod can fetch any LOD from it, so the
+    #     composite kernel reads the sharp scene (lod 0) and the blurred glow
+    #     (lods 1..L-1) through this single handle. WRAP/MIRROR need normalized
+    #     coords; we use CLAMP + normalized so a level's edge does not bleed in.
+    #
+    #   API MAP -- the mip pyramid round-trip
+    #   =====================================
+    #   BUILD on the GPU:   MipmappedArray.from_descriptor(...) allocates the
+    #                       whole chain; mm.get_level(i) hands back a NON-OWNING
+    #                       CUDAArray view of each level that we bind to a
+    #                       per-level SurfaceObject and write into (the loop in
+    #                       on_draw). The driver never builds the mips -- we do.
+    #   READ it back:       ResourceDescriptor.from_mipmapped_array(mm) wraps the
+    #                       SAME chain in ONE mipmapped TextureObject. tex2DLod
+    #                       then samples any LOD with trilinear filtering.
+    #   LOD selection knobs (TextureDescriptor):
+    #     mipmap_filter_mode=LINEAR  -> trilinear: blend BETWEEN the two nearest
+    #                                   integer LODs (vs NEAREST = snap to one).
+    #     mipmap_level_bias          -> constant added to the requested LOD.
+    #     min/max_mipmap_level_clamp -> clamp the effective LOD to a range.
+    #   These descriptor fields are baked at construction (the texture is created
+    #   ONCE and never rebuilt). To demonstrate them INTERACTIVELY, the
+    #   composite kernel folds the SAME bias/clamp math into its explicit
+    #   tex2DLod `lod` argument -- live keys move bias / max-LOD without ever
+    #   rebuilding the texture, while the descriptor encodes the static defaults.
+    mip_tex_desc = TextureDescriptor(
+        address_mode=AddressMode.CLAMP,
+        filter_mode=FilterMode.LINEAR,
+        read_mode=ReadMode.ELEMENT_TYPE,
+        normalized_coords=True,
+        mipmap_filter_mode=FilterMode.LINEAR,  # trilinear between levels
+        mipmap_level_bias=0.0,
+        min_mipmap_level_clamp=0.0,
+        max_mipmap_level_clamp=float(num_levels - 1),
+    )
+    mip_tex = TextureObject.from_descriptor(
+        resource=ResourceDescriptor.from_mipmapped_array(mm),
+        texture_descriptor=mip_tex_desc,
+    )
+
+    # --- Step 5: Open a window and set up the GL/CUDA bridge ---
+    window, gl, pyglet = create_window()
+    shader_prog, quad_vao, tex_id = create_display_resources(gl, WIDTH, HEIGHT)
+    pbo_id, _ = create_pixel_buffer(gl, WIDTH, HEIGHT)
+    resource = GraphicsResource.from_gl_buffer(pbo_id, flags="write_discard")
+
+    # --- Step 6: Render loop state + launch configs ---
+    state = {
+        "strength": 1.8,  # bloom intensity multiplier
+        "threshold": 0.6,  # only luminance above this contributes to bloom
+        "bloom_on": True,
+        # --- Live LOD-selection controls (folded into the tex2DLod loop) ---
+        "bias": 0.5,  # mipmap_level_bias added to each bloom tap's LOD
+        "num_lods": max(1, num_levels - 1),  # how many LODs the bloom sums
+        "min_clamp": 0.0,  # min_mipmap_level_clamp (shown; static default)
+    }
+    max_clamp = float(num_levels - 1)  # max_mipmap_level_clamp ceiling
+    start_time = time.monotonic()
+    frame_count = [0]
+    fps_time = [start_time]
+
+    block = (16, 16, 1)
+    # The composite kernel covers the WIDTHxHEIGHT screen.
+    composite_config = LaunchConfig(grid=make_level_grid_screen(block), block=block)
+
+    @window.event
+    def on_draw():
+        window.clear()
+        t = time.monotonic() - start_time
+
+        # (a) Render the animated HDR-ish scene into level 0's surface.
+        launch(
+            stream,
+            LaunchConfig(grid=make_level_grid(BASE_SIZE, block), block=block),
+            kernels["render_scene"],
+            np.uint64(level_surfaces[0].handle),
+            np.int32(BASE_SIZE),
+            np.int32(BASE_SIZE),
+            np.float32(t),
+            np.int32(NUM_EMITTERS),
+        )
+
+        # (b) Build the pyramid on the GPU: each level i reads level i-1 via its
+        #     LINEAR TextureObject and writes level i via its SurfaceObject.
+        for i in range(1, num_levels):
+            dst_size = level_sizes[i]
+            launch(
+                stream,
+                LaunchConfig(grid=make_level_grid(dst_size, block), block=block),
+                kernels["downsample"],
+                np.uint64(level_textures[i - 1].handle),  # read parent level
+                np.uint64(level_surfaces[i].handle),  # write this level
+                np.int32(dst_size),
+            )
+
+        # (c) Composite: one mipmapped texture, sample several LODs, tonemap,
+        #     and write RGBA8 straight into the PBO.
+        with resource.map(stream=stream) as buf:
+            launch(
+                stream,
+                composite_config,
+                kernels["composite"],
+                buf.handle,
+                np.int32(WIDTH),
+                np.int32(HEIGHT),
+                np.uint64(mip_tex.handle),
+                np.float32(state["strength"]),
+                np.float32(state["threshold"]),
+                np.int32(state["num_lods"]),  # number of bloom LODs summed (max-clamp)
+                np.float32(state["bias"]),  # mipmap_level_bias folded into tex2DLod
+                np.float32(max_clamp),  # max_mipmap_level_clamp ceiling
+                np.int32(1 if state["bloom_on"] else 0),
+            )
+        # Unmap happens automatically when the `with` block exits.
+
+        copy_pbo_to_texture(gl, pbo_id, tex_id, WIDTH, HEIGHT)
+        draw_fullscreen_quad(gl, shader_prog, quad_vao, tex_id)
+
+        frame_count[0] += 1
+        now = time.monotonic()
+        if now - fps_time[0] >= 1.0:
+            fps = frame_count[0] / (now - fps_time[0])
+            window.set_caption(
+                f"GPU mip-pyramid bloom ({WIDTH}x{HEIGHT}, {fps:.0f} FPS) | "
+                f"MipmappedArray[{num_levels} lvls] + tex2DLod[trilinear, "
+                f"bias={state['bias']:+.2f}, "
+                f"clamp={state['min_clamp']:.0f}..{max_clamp:.0f}, "
+                f"lods={state['num_lods']}] | "
+                f"bloom={state['strength']:.2f} "
+                f"thr={state['threshold']:.2f} "
+                f"{'ON' if state['bloom_on'] else 'OFF'}"
+            )
+            frame_count[0] = 0
+            fps_time[0] = now
+
+    @window.event
+    def on_key_press(symbol, _modifiers):
+        key = pyglet.window.key
+        if symbol == key.ESCAPE:
+            window.close()
+        elif symbol in (key.PLUS, key.EQUAL, key.NUM_ADD):
+            state["strength"] = min(8.0, state["strength"] + BLOOM_STRENGTH_STEP)
+        elif symbol in (key.MINUS, key.NUM_SUBTRACT):
+            state["strength"] = max(0.0, state["strength"] - BLOOM_STRENGTH_STEP)
+        elif symbol == key.BRACKETLEFT:
+            state["threshold"] = max(0.0, state["threshold"] - BLOOM_THRESHOLD_STEP)
+        elif symbol == key.BRACKETRIGHT:
+            state["threshold"] = min(4.0, state["threshold"] + BLOOM_THRESHOLD_STEP)
+        elif symbol == key.COMMA:
+            state["bias"] = max(-float(num_levels - 1), state["bias"] - 0.25)
+        elif symbol == key.PERIOD:
+            state["bias"] = min(float(num_levels - 1), state["bias"] + 0.25)
+        elif symbol == key.SEMICOLON:
+            state["num_lods"] = max(1, state["num_lods"] - 1)
+        elif symbol == key.APOSTROPHE:
+            state["num_lods"] = min(num_levels - 1, state["num_lods"] + 1)
+        elif symbol == key.B:
+            state["bloom_on"] = not state["bloom_on"]
+        elif symbol == key.R:
+            state["strength"] = 1.8
+            state["threshold"] = 0.6
+            state["bloom_on"] = True
+            state["bias"] = 0.5
+            state["num_lods"] = max(1, num_levels - 1)
+
+    @window.event
+    def on_close():
+        # Drain queued launches before destroying the handles they read, then
+        # release in reverse construction order (GL objects clean up via pyglet).
+        # `mm` goes LAST: the per-level surfaces/textures reference its views.
+        stream.sync()
+        resource.close()
+        mip_tex.close()
+        for handle in (*level_textures, *level_surfaces):
+            handle.close()
+        mm.close()
+        stream.close()
+
+    pyglet.app.run(interval=0)
+
+
+def make_level_grid_screen(block):
+    """Return the 2D launch grid that tiles the WIDTH x HEIGHT screen."""
+    block_x, block_y = block[0], block[1]
+    # Ceiling division, so partially filled edge blocks are still launched.
+    blocks_across = (WIDTH + block_x - 1) // block_x
+    blocks_down = (HEIGHT + block_y - 1) // block_y
+    return (blocks_across, blocks_down, 1)
+
+
+# ======================== GPU code (CUDA + GLSL) ============================
+#
+# Three CUDA kernels are concatenated into one program string so they share a
+# single NVRTC compile. All three operate on float4 RGBA pixels.
+#
+#   render_scene -- writes an animated scene of moving bright emitters into mip
+#                   level 0 via a SurfaceObject. Hot cores exceed 1.0 so the
+#                   bloom has something to bleed. NOTE: surf2Dwrite's x is in
+#                   BYTES, so we multiply by sizeof(float4) (= 16).
+#
+#   downsample   -- reads level L-1 through a LINEAR TextureObject and writes
+#                   level L through a SurfaceObject. With LINEAR filtering and
+#                   non-normalized coords, ONE tap at the midpoint of the
+#                   parent's 2x2 footprint -- (2x + 1.0, 2y + 1.0) -- equals the
+#                   4-texel box average. (A +0.5 offset would instead land on
+#                   one texel center and return that single texel, NOT the
+#                   average; the +1.0 midpoint is the crux of this example.)
+#
+#   composite    -- samples the WHOLE pyramid through one mipmapped texture.
+#                   tex2DLod(...,0) is the sharp scene; a weighted sum of
+#                   tex2DLod(...,lod) for lod 1..maxLod is the blurred glow.
+#                   We threshold the glow's luminance, scale by `strength`,
+#                   add the sharp scene, tonemap with 1-exp(-x), write RGBA8.
+#
+# GLSL shaders at the very bottom just draw a textured quad. Nothing CUDA-
+# specific there.
+#
+# ============================================================================
+
+KERNEL_SOURCE = r"""
+__device__ __forceinline__ float clampf(float v, float a, float b) {
+    return fminf(fmaxf(v, a), b);
+}
+
+__device__ __forceinline__ float luminance(float4 c) {
+    return 0.2126f * c.x + 0.7152f * c.y + 0.0722f * c.z;
+}
+
+// --------------------------------------------------------------------------
+// render_scene: animated bright emitters on a dark background -> level 0.
+//
+// `surf` is a SurfaceObject bound to mip level 0 (float4 RGBA). Each emitter
+// orbits the center and contributes a sharp colored core whose intensity can
+// exceed 1.0, giving the bloom pass something to bleed.
+// --------------------------------------------------------------------------
+extern "C" __global__
+void render_scene(cudaSurfaceObject_t surf, int width, int height,
+                  float t, int num_emitters) {
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    if (x >= width || y >= height) return;
+
+    float u = ((float)x + 0.5f) / (float)width;
+    float v = ((float)y + 0.5f) / (float)height;
+
+    // Faint moving background wash so the frame is never fully black.
+    float bg = 0.04f + 0.02f * sinf(6.2831853f * (u + v) + t * 0.5f);
+    float3 color = make_float3(bg * 0.4f, bg * 0.5f, bg * 0.9f);
+
+    // Accumulate emitters: each orbits the center on its own radius/phase.
+    for (int i = 0; i < num_emitters; ++i) {
+        float fi = (float)i;
+        float phase = t * (0.4f + 0.12f * fi) + fi * 2.3998f;  // golden-ish spread
+        float radius = 0.18f + 0.06f * fi / fmaxf(1.0f, (float)(num_emitters - 1));
+        float ex = 0.5f + radius * cosf(phase);
+        float ey = 0.5f + radius * sinf(phase * 1.13f);
+
+        float dx = u - ex;
+        float dy = v - ey;
+        float d2 = dx * dx + dy * dy;
+
+        // Tight bright core (Gaussian) plus a gentle per-emitter pulse so the
+        // HDR peak breathes and the bloom halo visibly swells. 1/sigma^2 sets
+        // the core size; the smaller multiplier here widens the hot spot a bit
+        // so coarse LODs pick up plenty of energy to bleed.
+        float pulse = 0.75f + 0.25f * sinf(t * (1.3f + 0.17f * fi) + fi);
+        float core = expf(-d2 * 3200.0f);
+        float hot = 3.0f * pulse * core;  // peak well above 1.0 -> blooms strongly
+
+        // Per-emitter hue cycling through R/G/B-ish triplets.
+        float hue = fi * 1.0471975f + t * 0.2f;  // 60 deg steps + slow drift
+        float3 tint = make_float3(
+            0.5f + 0.5f * sinf(hue),
+            0.5f + 0.5f * sinf(hue + 2.0943951f),
+            0.5f + 0.5f * sinf(hue + 4.1887902f));
+
+        color.x += hot * tint.x;
+        color.y += hot * tint.y;
+        color.z += hot * tint.z;
+    }
+
+    float4 px = make_float4(color.x, color.y, color.z, 1.0f);
+
+    // surf2Dwrite indexes x in BYTES: float4 is 16 bytes.
+    surf2Dwrite<float4>(px, surf, x * (int)sizeof(float4), y);
+}
+
+// --------------------------------------------------------------------------
+// downsample: halve the parent level into this level via a single LINEAR tap.
+//
+// `src` is a LINEAR-filtered TextureObject bound to the parent level (L-1).
+// `dst` is a SurfaceObject bound to this level (L). dst_size is L's edge.
+//
+// With non-normalized coords, tex2D returns texel (i,j) when sampled at
+// (i+0.5, j+0.5). For output texel (x,y) the parent 2x2 footprint covers
+// parent texels (2x,2y), (2x+1,2y), (2x,2y+1), (2x+1,2y+1). The midpoint of
+// those four centers is (2x+1.0, 2y+1.0); LINEAR filtering there blends all
+// four at weight 0.25 each -- exactly the box average. (NOT +0.5, which would
+// land on one texel center and return a single texel.)
+// --------------------------------------------------------------------------
+extern "C" __global__
+void downsample(cudaTextureObject_t src,
+                cudaSurfaceObject_t dst,
+                int dst_size) {
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    if (x >= dst_size || y >= dst_size) return;
+
+    float fx = 2.0f * (float)x + 1.0f;
+    float fy = 2.0f * (float)y + 1.0f;
+
+    float4 px = tex2D<float4>(src, fx, fy);
+
+    surf2Dwrite<float4>(px, dst, x * (int)sizeof(float4), y);
+}
+
+// --------------------------------------------------------------------------
+// composite: sharp scene + multi-LOD bloom, tonemapped, into the PBO.
+//
+// `mip_tex` is ONE mipmapped TextureObject over the whole pyramid. tex2DLod at
+// lod 0 is the sharp scene; lods 1..max_lod are progressively blurrier copies
+// that form the glow. We threshold each blurred sample's luminance so only the
+// bright parts bloom, weight coarser (wider) levels a bit less, scale by
+// `strength`, add the sharp scene, and tonemap.
+// --------------------------------------------------------------------------
+extern "C" __global__
+void composite(unsigned char *output,
+               int width,
+               int height,
+               cudaTextureObject_t mip_tex,
+               float strength,
+               float threshold,
+               int num_lods,
+               float bias,
+               float max_lod,
+               int bloom_on) {
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    if (x >= width || y >= height) return;
+
+    float u = ((float)x + 0.5f) / (float)width;
+    float v = ((float)y + 0.5f) / (float)height;
+
+    // Sharp scene from the base level. The base sample stays at lod 0 -- bias is
+    // applied only to the bloom taps below, so the scene never blurs.
+    float4 scene = tex2DLod<float4>(mip_tex, u, v, 0.0f);
+    float3 hdr = make_float3(scene.x, scene.y, scene.z);
+
+    if (bloom_on) {
+        // Sum the blurred levels. Each coarser level covers a wider area, so we
+        // taper its weight to keep the glow soft rather than flat.
+        //
+        // This loop is where the live LOD-selection knobs live: `num_lods` is the
+        // max-clamp (how high up the pyramid we read), and `bias` is the
+        // mipmap_level_bias folded into the explicit tex2DLod `lod` argument.
+        // We clamp the effective LOD to [0, max_lod] so a positive bias can never
+        // index past the top of the pyramid.
+        float3 bloom = make_float3(0.0f, 0.0f, 0.0f);
+        float weight_sum = 0.0f;
+        for (int lod = 1; lod <= num_lods; ++lod) {
+            float eff_lod = clampf((float)lod + bias, 0.0f, max_lod);
+            float4 s = tex2DLod<float4>(mip_tex, u, v, eff_lod);
+            // Soft-knee threshold: keep only the energy above `threshold`.
+            float lum = luminance(s);
+            float excess = fmaxf(lum - threshold, 0.0f);
+            float keep = (lum > 1e-4f) ? (excess / lum) : 0.0f;
+
+            float w = 1.0f / (float)lod;  // finer blurred levels weigh more
+            bloom.x += w * keep * s.x;
+            bloom.y += w * keep * s.y;
+            bloom.z += w * keep * s.z;
+            weight_sum += w;
+        }
+        if (weight_sum > 0.0f) {
+            float inv = strength / weight_sum;
+            hdr.x += bloom.x * inv;
+            hdr.y += bloom.y * inv;
+            hdr.z += bloom.z * inv;
+        }
+    }
+
+    // Tonemap HDR -> [0,1] with a simple exposure curve, then to 8-bit.
+    float r = 1.0f - expf(-hdr.x);
+    float g = 1.0f - expf(-hdr.y);
+    float b = 1.0f - expf(-hdr.z);
+
+    int idx = (y * width + x) * 4;
+    output[idx + 0] = (unsigned char)(clampf(r, 0.0f, 1.0f) * 255.0f);
+    output[idx + 1] = (unsigned char)(clampf(g, 0.0f, 1.0f) * 255.0f);
+    output[idx + 2] = (unsigned char)(clampf(b, 0.0f, 1.0f) * 255.0f);
+    output[idx + 3] = 255;
+}
+"""
+
+# GLSL shaders -- these just display a texture on a fullscreen rectangle.
+# Nothing CUDA-specific here.
+
+VERTEX_SHADER_SOURCE = """#version 330 core
+in vec2 position;
+in vec2 texcoord;
+out vec2 v_texcoord;
+void main() {
+    gl_Position = vec4(position, 0.0, 1.0);
+    v_texcoord = texcoord;
+}
+"""
+
+FRAGMENT_SHADER_SOURCE = """#version 330 core
+in vec2 v_texcoord;
+out vec4 fragColor;
+uniform sampler2D tex;
+void main() {
+    fragColor = texture(tex, v_texcoord);
+}
+"""
+
+
+if __name__ == "__main__":
+    main()
diff --git a/cuda_core/examples/gl_interop_caustics.py b/cuda_core/examples/gl_interop_caustics.py
new file mode 100644
index 00000000000..35de14394d7
--- /dev/null
+++ b/cuda_core/examples/gl_interop_caustics.py
@@ -0,0 +1,745 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# ################################################################################
+#
+# This example demonstrates cuda.core.CUDAArray, TextureObject, and
+# GraphicsResource for CUDA/OpenGL interop. A vivid procedural background image
+# is uploaded once into a 2D CUDAArray and bound as a TextureObject sampled with
+# FilterMode.LINEAR + AddressMode.MIRROR + normalized coordinates. Each frame a
+# `render_water` kernel evaluates an animated water surface analytically, refracts
+# the view ray through it to perturb the background lookup UVs, adds shimmering
+# caustic highlights, and writes RGBA8 straight into an OpenGL PBO. The effect is
+# "looking through a sunlit pool". Requires pyglet.
+#
+# ################################################################################
+
+# What this example teaches
+# =========================
+# - How to upload a host numpy image into a CUDAArray with `CUDAArray.copy_from`
+#   (host layout (H, W, 4) uint8 row-major for an array allocated as
+#   shape=(WIDTH, HEIGHT)) and bind it as a long-lived TextureObject.
+# - Why FilterMode.LINEAR + AddressMode.MIRROR + normalized_coords=True is the
+#   right pairing for a refraction effect: refracted UV lookups routinely fall
+#   slightly outside [0, 1], and MIRROR returns a sensible mirrored pixel rather
+#   than a clamped smear or a hard edge, while LINEAR keeps the warp smooth.
+# - Why srgb=True is the correct read mode for an 8-bit color image: the texels
+#   are decoded sRGB->linear on read, the kernel does its lighting and tonemap
+#   in linear light, then re-encodes to sRGB on output (the gamma-correct
+#   "sample in linear, tonemap, output" pipeline).
+# - Why max_anisotropy is justified here: refraction samples the texture at
+#   grazing, stretched angles, the case anisotropic filtering exists to clean
+#   up.
+# - That the animated water normal field is computed ANALYTICALLY in the kernel
+#   (a sum of moving directional sine waves plus a few expanding circular
+#   ripples), so there is no second CUDAArray and no SurfaceObject pass -- the
+#   normal and its curvature are evaluated per pixel from a `time` uniform.
+# - How to feed a small fixed ring of interactive click-ripples to the kernel
+#   purely as scalar launch arguments (the demonstrated launch convention),
+#   avoiding any custom device-buffer machinery.
+#
+# How it works
+# ============
+#   Startup (once):
+#     +-------------------+   copy_from   +-----------+
+#     | host numpy image  | ------------> | CUDAArray |  (UINT8 RGBA, vivid grid)
+#     +-------------------+               +-----+-----+
+#                                               |
+#                                               v
+#                                        +-------------+
+#                                        | TextureObj  |  LINEAR + MIRROR + norm
+#                                        +-------------+
+#
+#   Each frame (render_water kernel, 2D over the screen):
+#     1. Evaluate the water height/normal at this pixel from the analytic wave
+#        sum (directional waves + circular ripples) using the `time` uniform.
+#     2. Refract: offset the background sample UV by `refract` * (the water
+#        surface gradient) -- a cheap 2D approximation of bending the view ray.
+#     3. Sample the background TextureObject at the perturbed UV (LINEAR +
+#        MIRROR keeps it smooth and well-defined outside [0, 1]).
+#     4. Caustics: brightness focuses where wavefronts converge. Approximate
+#        with a sharpened power of the surface curvature (Laplacian), adding
+#        bright cyan/white highlights. Add a depth tint (deeper = bluer) and a
+#        specular sparkle from the normal versus a fixed light direction.
+#     5. Tonemap and write RGBA8 into the OpenGL PBO. No PCIe traffic per frame.
+#
+# Why MIRROR (not WRAP or CLAMP)?
+# -------------------------------
+# WRAP and MIRROR both require normalized coordinates. WRAP tiles the image, so
+# a refraction pushing past the right edge suddenly shows the far-left content
+# (a visible seam). CLAMP smears the edge texel into a streak. MIRROR reflects
+# the image at the boundary, which for a small refraction offset looks like the
+# pool simply continuing -- the most natural choice here.
+#
+# What you should see
+# ===================
+# A colorful tiled background rippling as if seen through moving water, with
+# bright caustic highlights skittering across it. Press +/- to change the
+# refraction/ripple strength, click anywhere to spawn an expanding circular
+# ripple at the cursor, and Escape to exit. The title shows FPS and the current
+# ripple strength.
+#
+
+# /// script
+# dependencies = ["cuda_bindings", "cuda_core>0.6.0", "pyglet"]
+# ///
+
+import ctypes
+import sys
+import time
+
+import numpy as np
+
+from cuda.core import (
+    AddressMode,
+    ArrayFormat,
+    CUDAArray,
+    Device,
+    FilterMode,
+    GraphicsResource,
+    LaunchConfig,
+    Program,
+    ProgramOptions,
+    ReadMode,
+    ResourceDescriptor,
+    TextureDescriptor,
+    TextureObject,
+    launch,
+)
+
+# ---------------------------------------------------------------------------
+# Parameters (feel free to change these)
+# ---------------------------------------------------------------------------
+WIDTH = 800
+HEIGHT = 600
+BG_SIZE = 256  # the background CUDAArray is BG_SIZE x BG_SIZE RGBA8
+
+# Interactive click-ripples. We keep a small fixed ring and pass each slot to
+# the kernel as plain float scalars (matching the demonstrated launch
+# convention -- no custom device buffers). A ripple with start time < 0 is
+# inactive.
+MAX_RIPPLES = 3
+RIPPLE_LIFETIME = 4.0  # seconds before a click-ripple fully fades out
+
+DEFAULT_STRENGTH = 1.0
+STRENGTH_STEP = 0.15
+MIN_STRENGTH = 0.0
+MAX_STRENGTH = 3.0
+
+
+# ============================= Helper functions =============================
+#
+# The functions below set up CUDA and OpenGL. If you're here to learn about
+# CUDAArray/TextureObject, skip ahead to main() -- the interesting part is
+# there. These helpers exist so main() reads like a short story instead of a
+# wall of boilerplate.
+# ============================================================================
+
+
+def make_background_image(size):
+    """Return a vivid (size, size, 4) uint8 RGBA test card that makes warping obvious.
+
+    The result is indexed [row (y), column (x), channel]. CUDAArray.from_descriptor
+    is given shape=(WIDTH, HEIGHT), whereas copy_from expects HEIGHT rows of WIDTH
+    texels (row-major), i.e. host.shape == (HEIGHT, WIDTH, 4). A square image
+    hides the difference, but the (y, x) order used below is the part that matters.
+
+    Three layers are combined -- smooth saturated sine hues, white grid lines and
+    concentric rings -- so that even a small refraction offset is easy to spot.
+
+    `size` is the edge length in texels; the image is always square.
+    """
+    tau = 6.2831853  # 2*pi
+    rows, cols = np.mgrid[0:size, 0:size].astype(np.float32)
+    u = cols / size
+    v = rows / size
+
+    # Layer 1: two full sine periods across each axis, phase-shifted per channel.
+    red = 0.5 + 0.5 * np.sin(u * tau * 2.0 + 0.0)
+    green = 0.5 + 0.5 * np.sin(v * tau * 2.0 + 2.0944)
+    blue = 0.5 + 0.5 * np.sin((u + v) * tau * 2.0 + 4.1888)
+
+    # Layer 2: an 8 x 8 cell grid; texels close to a cell border become white.
+    cells = 8.0
+    off_x = np.abs(((u * cells) % 1.0) - 0.5)
+    off_y = np.abs(((v * cells) % 1.0) - 0.5)
+    on_line = (np.maximum(off_x, off_y) > 0.42).astype(np.float32)
+    red, green, blue = (c * (1.0 - on_line) + 1.0 * on_line for c in (red, green, blue))
+
+    # Layer 3: rings around the image center, ten periods per unit distance.
+    radius = np.sqrt((u - 0.5) ** 2 + (v - 0.5) ** 2)
+    rings = 0.5 + 0.5 * np.sin(radius * tau * 10.0)
+
+    # Each entry: (channel plane, weight kept from the plane, weight of the rings).
+    blend = ((red, 0.75, 0.25), (green, 0.75, 0.20), (blue, 0.85, 0.15))
+
+    # Quantize to 8 bits (astype truncates) and pack as RGBA with opaque alpha.
+    img = np.zeros((size, size, 4), dtype=np.uint8)
+    for channel, (plane, keep, mix) in enumerate(blend):
+        mixed = np.clip(plane * keep + rings * mix, 0.0, 1.0)
+        img[:, :, channel] = (mixed * 255.0).astype(np.uint8)
+    img[:, :, 3] = 255
+    return img
+
+
+def setup_cuda():
+    """Select GPU 0, compile `render_water`, and size a launch over the screen.
+
+    Returns (device, stream, kernel, launch_config). Exits the process with
+    status 1 when the device's compute capability major version is below 3.
+    """
+    device = Device(0)
+    device.set_current()
+    capability = device.compute_capability
+    if capability.major < 3:
+        message = (
+            "This example requires a GPU with compute capability >= 3.0 for "
+            f"bindless texture objects. Found sm_{capability.major}{capability.minor}."
+        )
+        print(message, file=sys.stderr)
+        sys.exit(1)
+    stream = device.create_stream()
+
+    # Compile for this device's architecture, then look the kernel up by name.
+    options = ProgramOptions(std="c++17", arch=f"sm_{device.arch}")
+    module = Program(KERNEL_SOURCE, code_type="c++", options=options).compile(
+        "cubin", name_expressions=("render_water",)
+    )
+    kernel = module.get_kernel("render_water")
+
+    # One thread per pixel in 16x16 tiles; round up so partial edge tiles are covered.
+    threads = (16, 16, 1)
+    blocks = ((WIDTH + threads[0] - 1) // threads[0], (HEIGHT + threads[1] - 1) // threads[1], 1)
+    return device, stream, kernel, LaunchConfig(grid=blocks, block=threads)
+
+
+def create_window():
+    """Create the fixed-size pyglet window for the demo.
+
+    pyglet is imported here rather than at module scope so that a missing
+    install produces a short hint and exit status 1 instead of a traceback.
+
+    Returns (window, gl_module, pyglet_module).
+    """
+    try:
+        import pyglet
+        from pyglet.gl import gl as gl_module
+    except ImportError:
+        hint = "This example requires pyglet >= 2.0.\nInstall it with:  pip install pyglet"
+        print(hint, file=sys.stderr)
+        sys.exit(1)
+
+    title = "cuda.core CUDAArray + TextureObject - Water Caustics"
+    window = pyglet.window.Window(WIDTH, HEIGHT, caption=title, vsync=False)
+    return window, gl_module, pyglet
+
+
+def create_display_resources(gl, width, height):
+    """Create the shader, fullscreen-quad VAO and screen texture; return (shader_prog, vao_id, tex_id)."""
+    from pyglet.graphics.shader import Shader, ShaderProgram
+
+    vert = Shader(VERTEX_SHADER_SOURCE, "vertex")
+    frag = Shader(FRAGMENT_SHADER_SOURCE, "fragment")
+    shader_prog = ShaderProgram(vert, frag)
+
+    quad_verts = np.array(
+        [
+            -1,  # vertex 0, bottom-left: x, y, u, v (4 consecutive entries per vertex)
+            -1,
+            0,
+            0,
+            1,  # vertex 1, bottom-right
+            -1,
+            1,
+            0,
+            1,  # vertex 2, top-right (ends the first triangle)
+            1,
+            1,
+            1,
+            -1,  # vertex 3, bottom-left again (starts the second triangle)
+            -1,
+            0,
+            0,
+            1,  # vertex 4, top-right
+            1,
+            1,
+            1,
+            -1,  # vertex 5, top-left
+            1,
+            0,
+            1,
+        ],
+        dtype=np.float32,
+    )
+
+    vao = ctypes.c_uint(0)
+    gl.glGenVertexArrays(1, ctypes.byref(vao))
+    gl.glBindVertexArray(vao.value)  # attribute setup below is recorded into this VAO
+
+    vbo = ctypes.c_uint(0)  # the VBO id is not returned; only the VAO id is handed back
+    gl.glGenBuffers(1, ctypes.byref(vbo))
+    gl.glBindBuffer(gl.GL_ARRAY_BUFFER, vbo.value)
+    gl.glBufferData(
+        gl.GL_ARRAY_BUFFER,
+        quad_verts.nbytes,
+        quad_verts.ctypes.data_as(ctypes.c_void_p),
+        gl.GL_STATIC_DRAW,
+    )
+
+    stride = 4 * 4  # bytes per vertex: 4 float32 values (x, y, u, v)
+    pos_loc = gl.glGetAttribLocation(shader_prog.id, b"position")
+    gl.glEnableVertexAttribArray(pos_loc)  # position: 2 floats at byte offset 0
+    gl.glVertexAttribPointer(pos_loc, 2, gl.GL_FLOAT, gl.GL_FALSE, stride, ctypes.c_void_p(0))
+    tc_loc = gl.glGetAttribLocation(shader_prog.id, b"texcoord")
+    gl.glEnableVertexAttribArray(tc_loc)  # texcoord: 2 floats at byte offset 8
+    gl.glVertexAttribPointer(tc_loc, 2, gl.GL_FLOAT, gl.GL_FALSE, stride, ctypes.c_void_p(8))
+    gl.glBindVertexArray(0)
+
+    tex = ctypes.c_uint(0)
+    gl.glGenTextures(1, ctypes.byref(tex))
+    gl.glBindTexture(gl.GL_TEXTURE_2D, tex.value)
+    gl.glTexParameteri(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_MIN_FILTER, gl.GL_LINEAR)
+    gl.glTexParameteri(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_MAG_FILTER, gl.GL_LINEAR)
+    gl.glTexImage2D(
+        gl.GL_TEXTURE_2D,
+        0,
+        gl.GL_RGBA8,
+        width,
+        height,
+        0,
+        gl.GL_RGBA,
+        gl.GL_UNSIGNED_BYTE,
+        None,  # no initial pixels: only storage is allocated here
+    )
+    return shader_prog, vao.value, tex.value
+
+
+def create_pixel_buffer(gl, width, height):
+    """Allocate an uninitialized width*height*4-byte GL pixel-unpack buffer; return its id."""
+    handle = ctypes.c_uint(0)
+    gl.glGenBuffers(1, ctypes.byref(handle))
+    target = gl.GL_PIXEL_UNPACK_BUFFER
+    gl.glBindBuffer(target, handle.value)
+    gl.glBufferData(target, width * height * 4, None, gl.GL_DYNAMIC_DRAW)  # 4 bytes per RGBA8 pixel
+    gl.glBindBuffer(target, 0)
+    return handle.value
+
+
+def copy_pbo_to_texture(gl, pbo_id, tex_id, width, height):
+    """Upload the PBO's pixels into the full width x height extent of the GL texture.
+
+    While a buffer is bound to GL_PIXEL_UNPACK_BUFFER, the last (data) argument
+    of glTexSubImage2D is treated as a byte offset into that buffer instead of
+    a host pointer, so None means "start of the PBO" and the copy is GPU-to-GPU.
+    """
+    unpack_target = gl.GL_PIXEL_UNPACK_BUFFER
+    texture_target = gl.GL_TEXTURE_2D
+    gl.glBindBuffer(unpack_target, pbo_id)
+    gl.glBindTexture(texture_target, tex_id)
+
+    # Arguments after the target: mip level 0, destination offset (0, 0), extent.
+    gl.glTexSubImage2D(texture_target, 0, 0, 0, width, height, gl.GL_RGBA, gl.GL_UNSIGNED_BYTE, None)
+    # Unbind so later pixel uploads are not sourced from this PBO by accident.
+    gl.glBindBuffer(unpack_target, 0)
+
+
+def draw_fullscreen_quad(gl, shader_prog, vao_id, tex_id):
+    """Draw texture `tex_id` with `shader_prog` using the quad geometry stored in `vao_id`."""
+    gl.glUseProgram(shader_prog.id)
+    gl.glBindTexture(gl.GL_TEXTURE_2D, tex_id)
+    gl.glBindVertexArray(vao_id)
+    gl.glDrawArrays(gl.GL_TRIANGLES, 0, 6)  # 6 vertices = the quad's two triangles
+    gl.glBindVertexArray(0)  # unbind the VAO, then the program
+    gl.glUseProgram(0)
+
+
+MAX_ANISOTROPY = 8  # kept in lockstep with the API MAP comment + live caption
+
+
+def make_background_texture(arr):
+    """Wrap `arr` in a TextureObject configured for the refraction lookups.
+
+    Sampler settings, and why each one is used:
+
+    - AddressMode.MIRROR: refracted UVs regularly leave [0, 1]; mirroring
+      reflects the image at the border instead of tiling or smearing it.
+    - normalized_coords=True: required by MIRROR (and WRAP) addressing.
+    - FilterMode.LINEAR: keeps the warped image smooth between texels.
+    - ReadMode.NORMALIZED_FLOAT: with a UINT8 source, tex2D<float4> returns
+      every channel as a float in [0, 1].
+    - srgb=True: the 8-bit image is authored in perceptual space, so texels
+      are decoded sRGB -> linear on read and the kernel's lighting/tonemap
+      math runs in linear light.
+    - max_anisotropy=MAX_ANISOTROPY: refraction samples at grazing,
+      stretched angles, the case anisotropic filtering is meant to clean up.
+
+    API MAP: UINT8 RGBA CUDAArray sampled as TextureObject[LINEAR | MIRROR |
+    NORMALIZED_FLOAT | srgb | max_anisotropy=8].
+
+    The caller owns the returned TextureObject and is responsible for closing it.
+    """
+    # Describe what is sampled (the array) separately from how it is sampled.
+    resource = ResourceDescriptor.from_array(arr)
+
+    sampler = TextureDescriptor(
+        # Addressing: MIRROR only works together with normalized coordinates.
+        address_mode=AddressMode.MIRROR,
+        normalized_coords=True,
+        # Filtering and value conversion.
+        filter_mode=FilterMode.LINEAR,
+        read_mode=ReadMode.NORMALIZED_FLOAT,
+        srgb=True,
+        # Extra filtering for stretched sampling footprints.
+        max_anisotropy=MAX_ANISOTROPY,
+    )
+    return TextureObject.from_descriptor(resource=resource, texture_descriptor=sampler)
+
+
+# ================================== main() ==================================
+
+
+def main():  # Set up CUDA + GL, upload the background once, then render until the window closes.
+    # --- Step 1: Set up CUDA (compile kernel, create stream) ---
+    dev, stream, kernel, config = setup_cuda()
+
+    # --- Step 2: Open a window ---
+    window, gl, pyglet = create_window()
+
+    # --- Step 3: Create GL resources (shader, fullscreen quad, screen tex) ---
+    shader_prog, quad_vao, screen_tex = create_display_resources(gl, WIDTH, HEIGHT)
+
+    # --- Step 4: Create the PBO that CUDA will write into ---
+    pbo_id = create_pixel_buffer(gl, WIDTH, HEIGHT)
+    resource = GraphicsResource.from_gl_buffer(pbo_id, flags="write_discard")
+
+    # --- Step 5: Allocate the background CUDAArray and upload the image once ---
+    bg_arr = CUDAArray.from_descriptor(
+        shape=(BG_SIZE, BG_SIZE),
+        format=ArrayFormat.UINT8,
+        num_channels=4,
+    )
+    host_image = make_background_image(BG_SIZE)
+    bg_arr.copy_from(np.ascontiguousarray(host_image), stream=stream)
+    stream.sync()  # wait for the upload issued on `stream` before continuing
+
+    # --- Step 6: Bind the CUDAArray as a long-lived TextureObject ---
+    #     Created once and kept alive: `launch` is async, so a per-frame texture
+    #     inside a closing `with` would destroy the handle before the kernel ran.
+    bg_tex = make_background_texture(bg_arr)
+
+    # Interactive state. Each ripple slot is (origin_x, origin_y, start_time) in
+    # normalized screen coords / seconds; start_time < 0 means inactive.
+    state = {
+        "strength": DEFAULT_STRENGTH,
+        "ripples": [[0.0, 0.0, -1.0] for _ in range(MAX_RIPPLES)],
+        "next_slot": 0,
+    }
+    start_time = time.monotonic()  # time origin for the kernel's `t` and for ripple start times
+
+    @window.event
+    def on_key_press(symbol, _modifiers):
+        key = pyglet.window.key
+        if symbol == key.ESCAPE:
+            window.close()
+        elif symbol in (key.PLUS, key.EQUAL, key.NUM_ADD):
+            state["strength"] = min(MAX_STRENGTH, state["strength"] + STRENGTH_STEP)
+        elif symbol in (key.MINUS, key.UNDERSCORE, key.NUM_SUBTRACT):
+            state["strength"] = max(MIN_STRENGTH, state["strength"] - STRENGTH_STEP)
+
+    @window.event
+    def on_mouse_press(x, y, _button, _modifiers):
+        # pyglet reports clicks with a bottom-left origin; the kernel's `v` must point
+        # the same way (NOTE(review): verify vs. its PBO row order). Store in the ring.
+        now = time.monotonic() - start_time
+        slot = state["next_slot"]
+        state["ripples"][slot] = [x / WIDTH, y / HEIGHT, now]
+        state["next_slot"] = (slot + 1) % MAX_RIPPLES  # wrap around: the oldest slot is overwritten
+
+    # --- Step 7: Render loop ---
+    frame_count = 0
+    fps_time = start_time
+
+    @window.event
+    def on_draw():
+        nonlocal frame_count, fps_time
+
+        now = time.monotonic()
+        t = now - start_time
+
+        window.clear()
+
+        # Flatten the ripple ring into the scalar args the kernel expects:
+        # for each slot, (origin_x, origin_y, age) where age < 0 == inactive.
+        ripple_args = []
+        for ox, oy, st in state["ripples"]:
+            age = (t - st) if st >= 0.0 else -1.0
+            if age >= RIPPLE_LIFETIME:
+                age = -1.0  # expired ripples are reported to the kernel as inactive
+            ripple_args.extend((np.float32(ox), np.float32(oy), np.float32(age)))
+
+        with resource.map(stream=stream) as buf:
+            launch(
+                stream,
+                config,
+                kernel,
+                np.uint64(bg_tex.handle),  # cudaTextureObject_t bg
+                buf.handle,  # unsigned char* output (the mapped PBO)
+                np.int32(WIDTH),  # int width
+                np.int32(HEIGHT),  # int height
+                np.float32(t),  # float t
+                np.float32(state["strength"]),  # float strength
+                np.float32(RIPPLE_LIFETIME),  # float ripple_lifetime
+                *ripple_args,  # rip_x{i}, rip_y{i}, rip_age{i} for each of the MAX_RIPPLES slots
+            )
+        copy_pbo_to_texture(gl, pbo_id, screen_tex, WIDTH, HEIGHT)
+        draw_fullscreen_quad(gl, shader_prog, quad_vao, screen_tex)
+
+        frame_count += 1
+        if now - fps_time >= 1.0:  # refresh the caption roughly once per second
+            fps = frame_count / (now - fps_time)
+            window.set_caption(
+                "cuda.core CUDAArray + TextureObject - Water Caustics "
+                f"(strength={state['strength']:.2f}, {fps:.0f} FPS) "
+                f"| TextureObject[LINEAR|MIRROR|sRGB|aniso={MAX_ANISOTROPY}] UINT8 "
+                "[+/- strength, click = ripple, Esc = quit]"
+            )
+            frame_count = 0
+            fps_time = now
+
+    @window.event
+    def on_close():
+        bg_tex.close()  # the texture is closed before the array it was created from
+        bg_arr.close()
+        resource.close()
+        stream.close()  # the stream is closed last
+
+    pyglet.app.run(interval=0)
+
+
+# ============================== GPU code (kernel) ============================
+#
+# render_water samples a static background TextureObject (LINEAR + MIRROR +
+# normalized coords) at refraction-perturbed UVs. The water surface and its
+# normal/curvature are evaluated analytically from a `time` uniform -- there is
+# no second array and no SurfaceObject. MAX_RIPPLES click-ripples arrive as
+# (origin_x, origin_y, age) float triples; age < 0 marks an empty slot.
+#
+# The ripple count is compiled in via the MAX_RIPPLES define so the kernel's
+# parameter list (host side) and the loop bound (device side) stay in lockstep.
+# ============================================================================
+
+KERNEL_SOURCE = (
+    "#define MAX_RIPPLES "
+    + str(MAX_RIPPLES)
+    + "\n"
+    + r"""
+// Analytic water height field at normalized position p and time t. A sum of a
+// few moving directional waves gives the base chop; the expanding circular
+// ripples from clicks ride on top. Returns height; gradient/curvature are taken
+// numerically by sampling this a few times (cheap and robust).
+__device__ __forceinline__
+float water_height(float px, float py, float t,
+                   const float* rip_x, const float* rip_y,
+                   const float* rip_age, float ripple_lifetime) {
+    float h = 0.0f;
+
+    // Directional waves: (dir_x, dir_y, freq, speed, amp).
+    // Hand-picked so they never perfectly align (avoids an obvious repeat).
+    const float waves[5][5] = {
+        { 1.00f,  0.00f,  9.0f,  1.3f, 0.45f},
+        { 0.20f,  0.98f, 12.0f,  1.0f, 0.35f},
+        {-0.70f,  0.71f, 16.0f,  1.7f, 0.25f},
+        { 0.80f, -0.60f, 22.0f,  2.1f, 0.18f},
+        {-0.30f, -0.95f, 31.0f,  2.6f, 0.12f},
+    };
+    #pragma unroll
+    for (int i = 0; i < 5; ++i) {
+        float phase = (waves[i][0] * px + waves[i][1] * py) * waves[i][2]
+                      + t * waves[i][3];
+        h += waves[i][4] * sinf(phase);
+    }
+
+    // Expanding circular ripples from mouse clicks. Each is a decaying radial
+    // wave packet whose ring radius grows with age.
+    for (int r = 0; r < MAX_RIPPLES; ++r) {
+        float age = rip_age[r];
+        if (age < 0.0f) continue;
+        float dx = px - rip_x[r];
+        float dy = py - rip_y[r];
+        float dist = sqrtf(dx * dx + dy * dy);
+        float ring = dist * 40.0f - age * 8.0f;       // outward-moving ring
+        float envelope = expf(-dist * 6.0f);           // localized in space
+        float fade = 1.0f - (age / ripple_lifetime);   // fade over lifetime
+        if (fade < 0.0f) fade = 0.0f;
+        h += 0.9f * fade * envelope * sinf(ring);
+    }
+    return h;
+}
+
+extern "C"
+__global__
+void render_water(cudaTextureObject_t bg,
+                  unsigned char* output,
+                  int width, int height,
+                  float t,
+                  float strength,
+                  float ripple_lifetime,
+"""
+    + "".join(
+        f"                  float rip_x{i}, float rip_y{i}, float rip_age{i}"
+        + (",\n" if i < MAX_RIPPLES - 1 else ") {\n")
+        for i in range(MAX_RIPPLES)
+    )
+    + r"""
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    if (x >= width || y >= height) return;
+
+    // Pack the per-ripple scalars back into arrays so the helper can loop.
+    float rip_x[MAX_RIPPLES];
+    float rip_y[MAX_RIPPLES];
+    float rip_age[MAX_RIPPLES];
+"""
+    + "".join(
+        f"    rip_x[{i}] = rip_x{i}; rip_y[{i}] = rip_y{i}; rip_age[{i}] = rip_age{i};\n" for i in range(MAX_RIPPLES)
+    )
+    + r"""
+    // Normalized screen position. v increases upward to match pyglet's
+    // bottom-left mouse origin used when recording ripple coordinates.
+    float u = (x + 0.5f) / (float)width;
+    float v = 1.0f - (y + 0.5f) / (float)height;
+
+    // Sample the water height field on a small stencil to get the surface
+    // gradient (slope -> refraction) and Laplacian (curvature -> caustics).
+    const float eps = 1.5f / (float)width;
+    float hc = water_height(u, v, t, rip_x, rip_y, rip_age, ripple_lifetime);
+    float hl = water_height(u - eps, v, t, rip_x, rip_y, rip_age, ripple_lifetime);
+    float hr = water_height(u + eps, v, t, rip_x, rip_y, rip_age, ripple_lifetime);
+    float hd = water_height(u, v - eps, t, rip_x, rip_y, rip_age, ripple_lifetime);
+    float hu = water_height(u, v + eps, t, rip_x, rip_y, rip_age, ripple_lifetime);
+
+    float gx = (hr - hl) / (2.0f * eps);   // d(height)/du
+    float gy = (hu - hd) / (2.0f * eps);   // d(height)/dv
+    // Discrete Laplacian (curvature). Divide by eps^2 so it is a true second
+    // derivative -- without this the finite-difference sum is ~Laplacian*eps^2
+    // (tiny), and the caustic term below would collapse to zero.
+    float lap = (hl + hr + hd + hu - 4.0f * hc) / (eps * eps);
+
+    // 2D refraction approximation: bend the background lookup by the surface
+    // slope, scaled by the user `strength`. Small factor keeps it gentle.
+    float refract = 0.015f * strength;
+    float su = u - refract * gx;
+    float sv = v - refract * gy;
+
+    // Sample the background. LINEAR + MIRROR + normalized coords means the
+    // perturbed (su, sv) can leave [0, 1] and still return a smooth, mirrored
+    // pixel rather than a clamped streak or a hard seam. Because the texture was
+    // bound with srgb=True, each channel is already decoded to LINEAR light
+    // here -- so all the lighting/tonemap math below is physically sensible and
+    // we only re-encode to sRGB at the very end.
+    //
+    // Chromatic dispersion: water bends short (blue) wavelengths more than long
+    // (red) ones, so we sample R/G/B at slightly different refraction offsets.
+    // This gives caustic edges and warped grid lines faint rainbow fringes.
+    float disp = 0.30f * refract;                // dispersion spread, in UV
+    float base_r = tex2D<float4>(bg, su - disp * gx, sv - disp * gy).x;
+    float base_b = tex2D<float4>(bg, su + disp * gx, sv + disp * gy).z;
+    float4 base = tex2D<float4>(bg, su, sv);   // green keeps the unsplit UV
+    base.x = base_r;
+    base.z = base_b;
+
+    // Surface normal from the gradient (z component points out of the water).
+    float nx = -gx;
+    float ny = -gy;
+    float nz = 1.0f;
+    float ninv = rsqrtf(nx * nx + ny * ny + nz * nz);
+    nx *= ninv; ny *= ninv; nz *= ninv;
+
+    // Caustics: light focuses where the wavefront converges (negative
+    // curvature). Raise a sharpened function of the curvature to a power to get
+    // tight bright filaments, then add as a cyan/white highlight.
+    // The wave-sum Laplacian peaks around O(150-200), so this factor lands
+    // `focus` near O(1) at a converging wavefront.
+    float focus = -lap * 0.005f;
+    if (focus < 0.0f) focus = 0.0f;
+    float caustic = focus * focus * focus;       // sharpen into thin filaments
+    caustic *= (0.6f + 0.8f * strength);
+    if (caustic > 1.5f) caustic = 1.5f;
+
+    // Specular sparkle: normal vs a fixed light direction.
+    float lx = 0.4f, ly = 0.5f, lz = 0.768f;     // normalized-ish light dir
+    float spec = nx * lx + ny * ly + nz * lz;
+    if (spec < 0.0f) spec = 0.0f;
+    spec = powf(spec, 48.0f);
+
+    // Animated light shafts / god-rays: angled bright bands that drift and
+    // breathe over time, as if sunlight were cutting down through the water.
+    // Built purely from (u, v, t) -- no extra launch args. The shafts are
+    // gated by the surface slope so they ripple with the waves and the water
+    // curvature concentrates them into bright filaments where the wavefront
+    // focuses, reinforcing the caustics.
+    float shaft_dir = u * 7.5f + v * 3.0f;       // angled across the screen
+    float shafts = 0.5f + 0.5f * sinf(shaft_dir + t * 0.7f + 1.5f * gx);
+    shafts *= 0.5f + 0.5f * sinf(shaft_dir * 0.37f - t * 0.4f);
+    shafts = powf(shafts, 3.0f);                 // crush into thin shafts
+    float godray = shafts * (0.18f + 0.45f * focus);
+
+    // Depth tint: deeper troughs read bluer/darker, crests slightly brighter.
+    float depth = 0.5f + 0.5f * hc;              // ~[0, 1]
+    float tint_r = 0.85f + 0.15f * depth;
+    float tint_g = 0.92f + 0.08f * depth;
+    float tint_b = 1.05f - 0.10f * depth;
+
+    // Composite in LINEAR light. Caustics get a faint warm/cool split and the
+    // god-rays a sunlit warm bias so the bright filaments read as light, not
+    // just blown-out white.
+    float cr = base.x * tint_r + caustic * 0.95f + spec * 0.9f + godray * 1.10f;
+    float cg = base.y * tint_g + caustic * 1.00f + spec * 0.9f + godray * 1.00f;
+    float cb = base.z * tint_b + caustic * 1.05f + spec * 1.0f + godray * 0.80f;
+
+    // Simple Reinhard tonemap so highlights roll off instead of clipping hard.
+    cr = cr / (1.0f + cr);
+    cg = cg / (1.0f + cg);
+    cb = cb / (1.0f + cb);
+
+    // Encode LINEAR -> sRGB on output. This is the matching half of the
+    // srgb=True decode on the texture read: we sampled and lit in linear, and
+    // now re-encode for the 8-bit RGBA8 PBO. The ~1/2.2 exponent is the
+    // gamma-correct encode (and also lifts the midtones the linear decode
+    // darkened, so the pool reads luminous rather than murky).
+    cr = powf(cr, 1.0f / 2.2f);
+    cg = powf(cg, 1.0f / 2.2f);
+    cb = powf(cb, 1.0f / 2.2f);
+
+    int idx = (y * width + x) * 4;
+    output[idx + 0] = (unsigned char)(fminf(cr, 1.0f) * 255.0f);
+    output[idx + 1] = (unsigned char)(fminf(cg, 1.0f) * 255.0f);
+    output[idx + 2] = (unsigned char)(fminf(cb, 1.0f) * 255.0f);
+    output[idx + 3] = 255;
+}
+"""
+)
+
+VERTEX_SHADER_SOURCE = """#version 330 core
+in vec2 position;
+in vec2 texcoord;
+out vec2 v_texcoord;
+void main() {
+    gl_Position = vec4(position, 0.0, 1.0);
+    v_texcoord = texcoord;
+}
+"""
+
+FRAGMENT_SHADER_SOURCE = """#version 330 core
+in vec2 v_texcoord;
+out vec4 fragColor;
+uniform sampler2D tex;
+void main() {
+    fragColor = texture(tex, v_texcoord);
+}
+"""
+
+
+if __name__ == "__main__":
+    main()
diff --git a/cuda_core/examples/gl_interop_clouds.py b/cuda_core/examples/gl_interop_clouds.py
new file mode 100644
index 00000000000..bc8829674ef
--- /dev/null
+++ b/cuda_core/examples/gl_interop_clouds.py
@@ -0,0 +1,991 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# ################################################################################
+#
+# This example demonstrates cuda.core's 3D CUDAArray + trilinear TextureObject by
+# baking a procedural fractal-noise density volume once at startup and then
+# ray-marching it every frame as participating media to render fluffy, sunlit,
+# semi-transparent clouds. The SurfaceObject is used during the one-shot bake;
+# the TextureObject (with LINEAR + WRAP + normalized coords) drives the per-frame
+# volumetric ray march with Beer-Lambert absorption and self-shadowing. The
+# whole pipeline stays on the GPU through GraphicsResource. Requires pyglet.
+#
+# ################################################################################
+
+# What this example teaches
+# =========================
+# - How to allocate a 3D cuda.core.CUDAArray (cuArray3DCreate under the hood) and
+#   bind it as both a SurfaceObject (for one-shot kernel writes via surf3Dwrite)
+#   and a TextureObject (for hardware-accelerated trilinear tex3D sampling).
+# - How to ray-march a baked scalar density volume as PARTICIPATING MEDIA: this
+#   goes beyond gl_interop_sdf_volume.py (which renders a hard SDF surface). Here
+#   the volume is fog: we accumulate color and transmittance front-to-back and
+#   apply Beer-Lambert absorption, with a short secondary march toward the sun
+#   for self-shadowing.
+# - How to wire mouse + keyboard input into a pyglet/cuda.core interop loop.
+#
+# How it works
+# ============
+# A single-channel float (FLOAT32) 3D volume (96^3) is filled once at
+# startup with fractal Brownian motion (fbm) built from a cheap integer-hash
+# value noise:
+#
+#     fbm(p) = sum over octaves of amplitude * value_noise(p * frequency)
+#     density = remap(fbm) with a coverage threshold
+#
+# The volume stores only the raw noise; the cloud SHAPING (coverage threshold +
+# a vertical height falloff that fades density near the top and bottom of the
+# box) is applied in the RENDER kernel, not baked. That lets us ANIMATE the
+# clouds for free by scrolling the sample coordinate with a `time` uniform
+# (cheaper than re-baking 96^3 every frame, which would stack a second 3D launch
+# on top of the already heavy raymarch). WRAP addressing avoids clamping the
+# scrolled coordinate at the box edge (the baked field is not perfectly
+# tileable, so a faint density seam sweeps through slowly); the ray-vs-box bail
+# is what keeps density zero outside the volume, so WRAP is safe here.
+#
+#   STARTUP (one-shot bake)
+#   ~~~~~~~~~~~~~~~~~~~~~~~
+#   1. Allocate 3D CUDAArray (96^3, FLOAT32 x1, is_surface_load_store=True).
+#   2. Bind it as a SurfaceObject.
+#   3. Launch `bake_density`: one thread per voxel writes fbm via surf3Dwrite.
+#   4. Close the SurfaceObject; the CUDAArray stays alive.
+#
+#   EACH FRAME
+#   ~~~~~~~~~~
+#   1. resource.map() -> CUDA device pointer into the OpenGL PBO.
+#   2. Launch `render_clouds` (one thread per pixel). It builds an orbit-camera
+#      ray, intersects the [-1,1]^3 box, marches front-to-back sampling density
+#      via tex3D<float> (LINEAR + WRAP + normalized coords), shades each sample
+#      with a short sun-ward shadow march (Beer-Lambert), accumulates over an
+#      analytic sky, and writes RGBA8 straight into the PBO.
+#   3. Unmap, GPU-side copy PBO -> texture, draw fullscreen quad.
+#
+# Performance note
+# ================
+# This is the most compute-heavy example here: a primary march (up to ~96 steps)
+# with a nested shadow march (~6 steps) per sample costs roughly
+# primary_steps * shadow_steps volume samples per pixel. To stay interactive we
+# use a modest 96^3 volume, cap both step counts, and EARLY-OUT once
+# transmittance drops below ~0.01. Lower PRIMARY_STEPS / VOLUME_SIZE if your GPU struggles.
+#
+# Controls
+# ========
+#   Left mouse drag    orbit camera (dx -> yaw, dy -> pitch)
+#   Arrow keys         orbit camera (keyboard alternative)
+#   Mouse wheel        zoom (camera distance)
+#   + / -              raise / lower the sun (changes light angle + sky glow)
+#   [ / ]              decrease / increase cloud coverage (less / more cloud)
+#   R                  reset camera + sun + coverage
+#   Escape / close     quit
+#
+# The window title shows yaw, pitch, distance, sun height, coverage, and FPS.
+#
+
+# /// script
+# dependencies = ["cuda_bindings", "cuda_core>0.6.0", "pyglet"]
+# ///
+
+import ctypes
+import sys
+import time
+
+import numpy as np
+
+from cuda.core import (
+    AddressMode,
+    ArrayFormat,
+    CUDAArray,
+    Device,
+    FilterMode,
+    GraphicsResource,
+    LaunchConfig,
+    Program,
+    ProgramOptions,
+    ReadMode,
+    ResourceDescriptor,
+    SurfaceObject,
+    TextureDescriptor,
+    TextureObject,
+    launch,
+)
+
+# ---------------------------------------------------------------------------
+# Configuration (feel free to change these)
+# ---------------------------------------------------------------------------
+WIDTH = 800
+HEIGHT = 600
+VOLUME_SIZE = 96  # 96^3 voxels; bake cost is one-shot. Lower if memory is tight.
+
+# Camera defaults / clamps.
+RESET_YAW = 0.6
+RESET_PITCH = 0.25
+RESET_DIST = 3.2
+PITCH_MIN = -1.45  # stay inside (-pi/2, pi/2) so the up-vector stays sane.
+PITCH_MAX = 1.45
+DIST_MIN = 1.5
+DIST_MAX = 9.0
+
+# Lighting / shaping defaults and clamps.
+RESET_SUN_HEIGHT = 0.55  # 0 = sun at horizon, 1 = sun overhead.
+SUN_HEIGHT_MIN = 0.05
+SUN_HEIGHT_MAX = 0.98
+RESET_COVERAGE = 0.50  # higher = more cloud (lower density threshold).
+COVERAGE_MIN = 0.20
+COVERAGE_MAX = 0.85
+
+
+# ============================= Helper functions =============================
+#
+# The functions below set up CUDA and OpenGL. If you're here to learn about
+# 3D CUDAArray / TextureObject / SurfaceObject, skip ahead to main() -- the
+# interesting part is there. These helpers exist so that main() reads like a
+# short story instead of a wall of boilerplate.
+# ============================================================================
+
+
+def _check_compute_capability(dev):
+    """Exit with status 1 unless `dev` is sm_30+ (3D arrays + bindless surface/texture objects)."""
+    cc = dev.compute_capability
+    if cc.major < 3:
+        print(
+            f"This example requires compute capability >= 3.0, got sm_{cc.major}{cc.minor}.",
+            file=sys.stderr,
+        )
+        sys.exit(1)
+
+
+def setup_cuda():
+    """Make device 0 current, compile both kernels, and return (device, stream, kernels)."""
+    dev = Device(0)
+    dev.set_current()
+    _check_compute_capability(dev)  # exits the process on devices below sm_30
+    stream = dev.create_stream()
+
+    # C++ is required so the templated tex3D<float> / surf3Dwrite<float>
+    # overloads resolve. extern "C" on the kernel symbols keeps the function
+    # names unmangled even when the rest of the TU is compiled as C++.
+    program_options = ProgramOptions(std="c++17", arch=f"sm_{dev.arch}")
+    prog = Program(KERNEL_SOURCE, code_type="c++", options=program_options)
+    mod = prog.compile(
+        "cubin",
+        name_expressions=("bake_density", "render_clouds"),
+    )
+    kernels = {
+        "bake": mod.get_kernel("bake_density"),  # consumed by bake_volume()
+        "render": mod.get_kernel("render_clouds"),  # consumed by main()'s on_draw
+    }
+    return dev, stream, kernels
+
+
+def make_volume_array():
+    """Allocate the 3D density volume. Single-channel float, surface-capable.
+
+    API MAP
+    =======
+    - 3D CUDAArray shape=(W,H,D): CUDAArray.from_descriptor allocates a
+      VOLUME_SIZE^3 single-channel array (cuArray3DCreate under the hood). This is
+      the headline of the example: a true 3D, hardware-laid-out array sampled
+      trilinearly from a kernel.
+    - tex3D trilinear (FilterMode.LINEAR) + normalized coords: configured by
+      make_volume_texture below; gives free hardware trilinear sampling, the
+      thing that makes a smooth volumetric raymarch cheap.
+    - surf3Dwrite typed store during the one-shot bake: bind the same CUDAArray
+      as a SurfaceObject (is_surface_load_store=True) and write one density per
+      voxel; the byte x-offset uses sizeof(float) because surf3Dwrite's x
+      coordinate is in BYTES (y, z are in elements).
+    """
+    return CUDAArray.from_descriptor(
+        shape=(VOLUME_SIZE, VOLUME_SIZE, VOLUME_SIZE),
+        format=ArrayFormat.FLOAT32,
+        num_channels=1,
+        is_surface_load_store=True,  # needed so bake_volume() can bind it as a SurfaceObject
+    )
+
+
+def make_volume_texture(arr):
+    """Bind `arr` as a TextureObject configured for LINEAR + WRAP + normalized.
+
+    WRAP (not CLAMP) is the right choice here: the render kernel scrolls the
+    sample coordinate by a time uniform to animate the clouds, and WRAP avoids
+    clamping (smearing) the edge texels as the coordinate drifts past [0, 1].
+    The baked field is not perfectly tileable, so a faint density seam sweeps
+    through slowly as the scroll wraps -- a minor demo-grade artifact, not a
+    crash. WRAP/MIRROR addressing modes require normalized coordinates. The
+    ray-vs-box bail in the raymarch is what keeps density zero outside the
+    [-1, 1]^3 volume, so wrapping the noise field never leaks cloud outside it.
+    """
+    res_desc = ResourceDescriptor.from_array(arr)
+    tex_desc = TextureDescriptor(
+        address_mode=AddressMode.WRAP,  # the time-scrolled coordinate may leave [0, 1]
+        filter_mode=FilterMode.LINEAR,
+        read_mode=ReadMode.ELEMENT_TYPE,
+        normalized_coords=True,  # prerequisite for WRAP addressing (see docstring)
+    )
+    return TextureObject.from_descriptor(resource=res_desc, texture_descriptor=tex_desc)
+
+
+def bake_volume(stream, kernels, arr):
+    """Run the one-shot bake kernel that fills the volume with fractal noise.
+
+    A temporary SurfaceObject is bound to `arr` for this call only. The kernel
+    receives its bindless handle as a plain 64-bit argument, so the stream is
+    synchronized before the `with` block exits and closes the SurfaceObject.
+    `arr` itself is not closed here -- the caller keeps it as the long-lived
+    backing store for the render-loop TextureObject.
+    """
+    with SurfaceObject.from_array(arr) as bake_surf:
+        block = (8, 8, 8)  # 512 threads per block
+        grid = (
+            (VOLUME_SIZE + block[0] - 1) // block[0],  # ceil-divide so every voxel is covered
+            (VOLUME_SIZE + block[1] - 1) // block[1],
+            (VOLUME_SIZE + block[2] - 1) // block[2],
+        )
+        launch(
+            stream,
+            LaunchConfig(grid=grid, block=block),
+            kernels["bake"],
+            np.uint64(bake_surf.handle),
+            np.int32(VOLUME_SIZE),
+        )
+        # Synchronize before the SurfaceObject context exits so the bindless
+        # handle is still valid while the kernel runs.
+        stream.sync()
+
+
+def create_window():
+    """Open a pyglet window and return (window, gl_module, pyglet)."""
+    try:
+        import pyglet  # imported here so a missing install gets the friendly message below
+        from pyglet.gl import gl as _gl
+    except ImportError:
+        print(
+            "This example requires pyglet >= 2.0.\nInstall it with:  pip install pyglet",
+            file=sys.stderr,
+        )
+        sys.exit(1)
+
+    window = pyglet.window.Window(
+        WIDTH,
+        HEIGHT,
+        caption="cuda.core 3D CUDAArray - Volumetric Cloud Ray-Marcher",
+        vsync=False,
+    )
+    return window, _gl, pyglet
+
+
+def create_display_resources(gl, width, height):
+    """Standard GL boilerplate: shader, fullscreen quad, empty texture.
+
+    Not CUDA-specific; identical to the other gl_interop_* examples.
+    Returns (shader_program, vertex_array_id, texture_id).
+    """
+    from pyglet.graphics.shader import Shader, ShaderProgram
+
+    vert = Shader(VERTEX_SHADER_SOURCE, "vertex")
+    frag = Shader(FRAGMENT_SHADER_SOURCE, "fragment")
+    shader_prog = ShaderProgram(vert, frag)
+
+    quad_verts = np.array(
+        [
+            # x,  y,    s, t      (position + texture coordinate)
+            -1,  # vertex 0 (triangle 1): bottom-left
+            -1,
+            0,
+            0,
+            1,  # vertex 1 (triangle 1): bottom-right
+            -1,
+            1,
+            0,
+            1,  # vertex 2 (triangle 1): top-right
+            1,
+            1,
+            1,
+            -1,  # vertex 3 (triangle 2): bottom-left
+            -1,
+            0,
+            0,
+            1,  # vertex 4 (triangle 2): top-right
+            1,
+            1,
+            1,
+            -1,  # vertex 5 (triangle 2): top-left
+            1,
+            0,
+            1,
+        ],
+        dtype=np.float32,
+    )
+
+    vao = ctypes.c_uint(0)
+    gl.glGenVertexArrays(1, ctypes.byref(vao))
+    gl.glBindVertexArray(vao.value)
+
+    vbo = ctypes.c_uint(0)
+    gl.glGenBuffers(1, ctypes.byref(vbo))
+    gl.glBindBuffer(gl.GL_ARRAY_BUFFER, vbo.value)
+    gl.glBufferData(
+        gl.GL_ARRAY_BUFFER,
+        quad_verts.nbytes,
+        quad_verts.ctypes.data_as(ctypes.c_void_p),
+        gl.GL_STATIC_DRAW,
+    )
+
+    stride = 4 * 4  # 4 floats * 4 bytes each
+    pos_loc = gl.glGetAttribLocation(shader_prog.id, b"position")
+    gl.glEnableVertexAttribArray(pos_loc)
+    gl.glVertexAttribPointer(pos_loc, 2, gl.GL_FLOAT, gl.GL_FALSE, stride, ctypes.c_void_p(0))
+
+    tc_loc = gl.glGetAttribLocation(shader_prog.id, b"texcoord")
+    gl.glEnableVertexAttribArray(tc_loc)
+    gl.glVertexAttribPointer(tc_loc, 2, gl.GL_FLOAT, gl.GL_FALSE, stride, ctypes.c_void_p(8))  # after x, y
+
+    gl.glBindVertexArray(0)
+
+    tex = ctypes.c_uint(0)
+    gl.glGenTextures(1, ctypes.byref(tex))
+    gl.glBindTexture(gl.GL_TEXTURE_2D, tex.value)
+    gl.glTexParameteri(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_MIN_FILTER, gl.GL_LINEAR)
+    gl.glTexParameteri(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_MAG_FILTER, gl.GL_LINEAR)
+    gl.glTexImage2D(
+        gl.GL_TEXTURE_2D,
+        0,
+        gl.GL_RGBA8,
+        width,
+        height,
+        0,
+        gl.GL_RGBA,
+        gl.GL_UNSIGNED_BYTE,
+        None,  # no initial pixels: the texture starts empty
+    )
+
+    return shader_prog, vao.value, tex.value
+
+
+def create_pixel_buffer(gl, width, height):
+    """Create a Pixel Buffer Object (PBO) -- the CUDA/GL bridge.
+
+    Returns (pbo_gl_name, size_in_bytes).
+    """
+    pbo = ctypes.c_uint(0)
+    gl.glGenBuffers(1, ctypes.byref(pbo))
+    gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, pbo.value)
+    nbytes = width * height * 4  # RGBA8
+    gl.glBufferData(gl.GL_PIXEL_UNPACK_BUFFER, nbytes, None, gl.GL_DYNAMIC_DRAW)  # no initial data supplied
+    gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, 0)
+    return pbo.value, nbytes
+
+
+def copy_pbo_to_texture(gl, pbo_id, tex_id, width, height):
+    """Copy pixel data from the PBO into the GL texture (GPU-to-GPU)."""
+    gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, pbo_id)
+    gl.glBindTexture(gl.GL_TEXTURE_2D, tex_id)
+    gl.glTexSubImage2D(
+        gl.GL_TEXTURE_2D,
+        0,
+        0,
+        0,
+        width,
+        height,
+        gl.GL_RGBA,
+        gl.GL_UNSIGNED_BYTE,
+        None,  # with a PBO bound this is byte offset 0 into it, not a host pointer
+    )
+    gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, 0)
+
+
+def draw_fullscreen_quad(gl, shader_prog, vao_id, tex_id):
+    """Draw the texture to the screen using the fullscreen quad."""
+    gl.glUseProgram(shader_prog.id)
+    gl.glBindTexture(gl.GL_TEXTURE_2D, tex_id)
+    gl.glBindVertexArray(vao_id)
+    gl.glDrawArrays(gl.GL_TRIANGLES, 0, 6)  # 6 vertices = the quad's two triangles
+    gl.glBindVertexArray(0)
+    gl.glUseProgram(0)
+
+
+# ================================== main() ==================================
+
+
+def main():
+    # --- Step 1: Set up CUDA (compile kernels, create stream) ---
+    dev, stream, kernels = setup_cuda()
+
+    # --- Step 2: Allocate the 3D density volume and bake it once ---
+    #     The CUDAArray is the long-lived backing store; it must outlive the
+    #     render loop. The SurfaceObject is only needed for the one-shot bake
+    #     and is closed before we ever bind a TextureObject to the same CUDAArray.
+    arr = make_volume_array()
+    bake_volume(stream, kernels, arr)
+
+    # --- Step 3: Bind the volume as a trilinear TextureObject ---
+    #     LINEAR + WRAP + normalized_coords gives free hardware trilinear
+    #     filtering and lets the animated coordinate scroll wrap instead of clamp.
+    volume_tex = make_volume_texture(arr)
+
+    # --- Step 4: Open a window and set up the CUDA/GL bridge ---
+    window, gl, pyglet = create_window()
+    shader_prog, quad_vao, tex_id = create_display_resources(gl, WIDTH, HEIGHT)
+    pbo_id, _ = create_pixel_buffer(gl, WIDTH, HEIGHT)
+    resource = GraphicsResource.from_gl_buffer(pbo_id, flags="write_discard")
+
+    # --- Step 5: Render loop state ---
+    # Camera is orbit-style: yaw and pitch are angles, dist is the orbit
+    # radius. sun_height drives the light direction + sky glow; coverage shapes
+    # how much of the noise field reads as cloud. The render kernel turns these
+    # into rays + shading itself.
+    state = {
+        "yaw": RESET_YAW,
+        "pitch": RESET_PITCH,
+        "dist": RESET_DIST,
+        "sun_height": RESET_SUN_HEIGHT,
+        "coverage": RESET_COVERAGE,
+    }
+    start_time = time.monotonic()
+    frame_count = [0]  # one-element lists so the nested handlers can mutate them
+    fps_time = [start_time]
+    last_fps = [0.0]
+
+    block = (16, 16, 1)
+    grid = (
+        (WIDTH + block[0] - 1) // block[0],
+        (HEIGHT + block[1] - 1) // block[1],
+        1,
+    )
+    config = LaunchConfig(grid=grid, block=block)
+
+    @window.event
+    def on_draw():
+        window.clear()
+        elapsed = time.monotonic() - start_time
+
+        # (a) Map the PBO so CUDA can write into it.
+        with resource.map(stream=stream) as buf:
+            # (b) Launch the volumetric raymarch kernel. Camera + lighting +
+            #     shaping params are passed as scalars; the kernel builds the
+            #     orbit eye, per-pixel ray, and clouds itself. `time` scrolls
+            #     the noise sample coordinate to animate the clouds.
+            launch(
+                stream,
+                config,
+                kernels["render"],
+                buf.handle,
+                np.int32(WIDTH),
+                np.int32(HEIGHT),
+                np.uint64(volume_tex.handle),
+                np.float32(state["yaw"]),
+                np.float32(state["pitch"]),
+                np.float32(state["dist"]),
+                np.float32(state["sun_height"]),
+                np.float32(state["coverage"]),
+                np.float32(elapsed),
+            )
+        # (c) Unmap happens automatically; cuGraphicsUnmapResources serializes
+        #     the CUDA work against subsequent OpenGL use.
+
+        copy_pbo_to_texture(gl, pbo_id, tex_id, WIDTH, HEIGHT)
+        draw_fullscreen_quad(gl, shader_prog, quad_vao, tex_id)
+
+        frame_count[0] += 1
+        now = time.monotonic()
+        if now - fps_time[0] >= 0.5:
+            last_fps[0] = frame_count[0] / (now - fps_time[0])
+            frame_count[0] = 0
+            fps_time[0] = now
+            window.set_caption(
+                "cuda.core 3D CUDAArray - Volumetric Cloud Ray-Marcher  "
+                f"yaw={state['yaw']:+.2f} pitch={state['pitch']:+.2f} "
+                f"dist={state['dist']:.2f} sun={state['sun_height']:.2f} "
+                f"cov={state['coverage']:.2f}  "
+                f"{last_fps[0]:.0f} FPS  |  "
+                "3D CUDAArray[FLOAT32,1ch] + tex3D[LINEAR|WRAP|norm] + surf3D bake"
+            )
+
+    @window.event
+    def on_mouse_drag(_x, _y, dx, dy, buttons, _modifiers):
+        # Left-click drag orbits the camera. dx -> yaw, dy -> pitch.
+        if not (buttons & pyglet.window.mouse.LEFT):
+            return
+        orbit_scale = 0.005
+        state["yaw"] += dx * orbit_scale
+        state["pitch"] += dy * orbit_scale
+        if state["pitch"] < PITCH_MIN:
+            state["pitch"] = PITCH_MIN
+        elif state["pitch"] > PITCH_MAX:
+            state["pitch"] = PITCH_MAX
+
+    @window.event
+    def on_mouse_scroll(_x, _y, _scroll_x, scroll_y):
+        # Scroll wheel zoom: geometric so each tick feels uniform. Positive
+        # scroll_y (wheel up) zooms in.
+        if scroll_y == 0:
+            return
+        state["dist"] *= 0.9**scroll_y
+        if state["dist"] < DIST_MIN:
+            state["dist"] = DIST_MIN
+        elif state["dist"] > DIST_MAX:
+            state["dist"] = DIST_MAX
+
+    @window.event
+    def on_key_press(symbol, _modifiers):
+        key = pyglet.window.key
+        keyboard_orbit = 0.08
+        if symbol == key.ESCAPE:
+            window.close()
+        elif symbol == key.R:
+            state["yaw"] = RESET_YAW
+            state["pitch"] = RESET_PITCH
+            state["dist"] = RESET_DIST
+            state["sun_height"] = RESET_SUN_HEIGHT
+            state["coverage"] = RESET_COVERAGE
+        elif symbol == key.LEFT:
+            state["yaw"] -= keyboard_orbit
+        elif symbol == key.RIGHT:
+            state["yaw"] += keyboard_orbit
+        elif symbol == key.UP:
+            state["pitch"] = min(PITCH_MAX, state["pitch"] + keyboard_orbit)
+        elif symbol == key.DOWN:
+            state["pitch"] = max(PITCH_MIN, state["pitch"] - keyboard_orbit)
+        elif symbol in (key.PLUS, key.EQUAL, key.NUM_ADD):
+            state["sun_height"] = min(SUN_HEIGHT_MAX, state["sun_height"] + 0.05)
+        elif symbol in (key.MINUS, key.UNDERSCORE, key.NUM_SUBTRACT):
+            state["sun_height"] = max(SUN_HEIGHT_MIN, state["sun_height"] - 0.05)
+        elif symbol == key.BRACKETLEFT:
+            state["coverage"] = max(COVERAGE_MIN, state["coverage"] - 0.03)
+        elif symbol == key.BRACKETRIGHT:
+            state["coverage"] = min(COVERAGE_MAX, state["coverage"] + 0.03)
+
+    @window.event
+    def on_close():
+        # Release CUDA resources in reverse construction order. The GL objects
+        # clean up via pyglet on window close.
+        resource.close()
+        volume_tex.close()
+        arr.close()
+        stream.close()
+
+    pyglet.app.run(interval=0)
+
+
+# ======================== GPU code (CUDA + GLSL) ============================
+#
+# Two CUDA C++ kernels are concatenated into one program string so they share
+# a single NVRTC compile. NOTE: with no GPU available at authoring time, the
+# noise/raymarch math below is unverified at runtime -- it is kept deliberately
+# conservative (integer-hash value noise, plain fbm, no STL / host-only calls)
+# so it compiles cleanly under NVRTC c++17.
+#
+#   bake_density   -- one thread per voxel. Evaluates fractal Brownian motion
+#                     (fbm) of a cheap integer-hash value noise and writes the
+#                     raw scalar via surf3Dwrite. NOTE: surf3Dwrite's
+#                     x coordinate is in BYTES; a FLOAT32 element is 4 bytes, so
+#                     multiply by sizeof(float). y and z are in elements
+#                     -- a classic CUDA gotcha.
+#
+#   render_clouds  -- one thread per screen pixel. Builds the orbit-camera ray,
+#                     intersects the [-1, 1]^3 box, marches front-to-back
+#                     sampling density via tex3D<float> (LINEAR + WRAP +
+#                     normalized coords, coordinate scrolled by `time`), applies
+#                     a coverage threshold + vertical height falloff, does a
+#                     short sun-ward shadow march per sample (Beer-Lambert),
+#                     accumulates color + transmittance, composites over an
+#                     analytic sky, and writes RGBA8 into the PBO.
+#
+# GLSL shaders at the very bottom just draw a textured quad. Nothing CUDA-
+# specific there.
+#
+# ============================================================================
+
+KERNEL_SOURCE = r"""
+// --------------------------------------------------------------------------
+// Small inline helpers.
+// --------------------------------------------------------------------------
+__device__ __forceinline__ float clampf(float v, float a, float b) {
+    return fminf(fmaxf(v, a), b);
+}
+
+__device__ __forceinline__ float dot3(float ax, float ay, float az,
+                                      float bx, float by, float bz) {
+    return ax * bx + ay * by + az * bz;
+}
+
+__device__ __forceinline__ float length3(float x, float y, float z) {
+    return sqrtf(x * x + y * y + z * z);
+}
+
+__device__ __forceinline__ float lerpf(float a, float b, float t) {
+    return a + (b - a) * t;
+}
+
+__device__ __forceinline__ float smoothstepf(float t) {
+    // Hermite fade curve used both for noise interpolation and shaping.
+    return t * t * (3.0f - 2.0f * t);
+}
+
+// --------------------------------------------------------------------------
+// Cheap integer-hash value noise + fractal Brownian motion (fbm).
+//
+// hash3() turns an integer lattice point into a pseudo-random float in [0,1).
+// value_noise() trilinearly interpolates the 8 lattice corners around a
+// floating-point position with a smoothstep fade. fbm() sums several octaves
+// of value_noise at doubling frequency / halving amplitude. All integer math,
+// no tables, no host-only calls -- NVRTC-friendly.
+// --------------------------------------------------------------------------
+__device__ __forceinline__ float hash3(int ix, int iy, int iz) {
+    unsigned int bits = (unsigned int)ix * 374761393u +
+                        (unsigned int)iy * 668265263u +
+                        (unsigned int)iz * 2147483647u;
+    bits = (bits ^ (bits >> 13)) * 1274126177u;
+    bits ^= bits >> 16;
+    return (float)(bits & 0x00ffffffu) / (float)0x01000000u;  // low 24 bits / 2^24 -> [0, 1)
+}
+
+__device__ __forceinline__ float value_noise(float x, float y, float z) {
+    float bx = floorf(x), by = floorf(y), bz = floorf(z);
+    int cx = (int)bx, cy = (int)by, cz = (int)bz;
+    float wx = smoothstepf(x - bx);
+    float wy = smoothstepf(y - by);
+    float wz = smoothstepf(z - bz);
+    // Hash the eight corners of the lattice cell; vXYZ is the corner at offset (X, Y, Z).
+    float v000 = hash3(cx,     cy,     cz);
+    float v100 = hash3(cx + 1, cy,     cz);
+    float v010 = hash3(cx,     cy + 1, cz);
+    float v110 = hash3(cx + 1, cy + 1, cz);
+    float v001 = hash3(cx,     cy,     cz + 1);
+    float v101 = hash3(cx + 1, cy,     cz + 1);
+    float v011 = hash3(cx,     cy + 1, cz + 1);
+    float v111 = hash3(cx + 1, cy + 1, cz + 1);
+    // Collapse along x, then y, then z using the faded in-cell weights.
+    float e00 = lerpf(v000, v100, wx);
+    float e10 = lerpf(v010, v110, wx);
+    float e01 = lerpf(v001, v101, wx);
+    float e11 = lerpf(v011, v111, wx);
+    float f0  = lerpf(e00, e10, wy);
+    float f1  = lerpf(e01, e11, wy);
+    return lerpf(f0, f1, wz);
+}
+
+__device__ __forceinline__ float fbm(float x, float y, float z) {
+    float total  = 0.0f;
+    float weight = 0.5f;   // amplitude of the current octave
+    float scale  = 1.0f;   // frequency of the current octave
+    #pragma unroll
+    for (int octave = 0; octave < 5; ++octave) {
+        total  += weight * value_noise(x * scale, y * scale, z * scale);
+        weight *= 0.5f;
+        scale  *= 2.0f;
+    }
+    return total;  // weights sum to 0.96875, so with value_noise in [0, 1) this stays below 1
+}
+
+// --------------------------------------------------------------------------
+// bake_density: one thread per voxel writes raw fbm into the volume via a
+//               SurfaceObject. The cloud SHAPING (coverage threshold + height
+//               falloff) is applied later in render_clouds so the threshold and
+//               fade stay fixed while the render kernel scrolls the coordinate
+//               for animation.
+//
+//   surf is bound to a (size^3, FLOAT32 x 1) CUDAArray allocated with
+//   is_surface_load_store=True.
+//   surf3Dwrite's x coordinate is in BYTES; a FLOAT32 element is 4 bytes, so
+//   multiply x by sizeof(float). y and z are in elements -- a classic CUDA
+//   gotcha.
+// --------------------------------------------------------------------------
+extern "C" __global__
+void bake_density(cudaSurfaceObject_t surf, int size) {
+    int vx = blockIdx.x * blockDim.x + threadIdx.x;
+    int vy = blockIdx.y * blockDim.y + threadIdx.y;
+    int vz = blockIdx.z * blockDim.z + threadIdx.z;
+    if (vx >= size || vy >= size || vz >= size) return;
+
+    // Normalize the voxel center into (0, 1), then stretch it across NOISE_SCALE
+    // lattice cells so the fbm shows several blobs over the volume.
+    const float NOISE_SCALE = 4.0f;
+    float u = ((float)vx + 0.5f) / (float)size;
+    float v = ((float)vy + 0.5f) / (float)size;
+    float w = ((float)vz + 0.5f) / (float)size;
+
+    float noise = fbm(u * NOISE_SCALE, v * NOISE_SCALE, w * NOISE_SCALE);
+
+    // surf3Dwrite addresses x in BYTES and y, z in elements; one float is 4 bytes.
+    surf3Dwrite(noise, surf, vx * (int)sizeof(float), vy, vz);
+}
+
+// --------------------------------------------------------------------------
+// Density sampler: tex3D wants normalized coords in [0, 1]; the volume covers
+// [-1, 1] in world space, so we remap with (p + 1) * 0.5 and add a time-based
+// scroll (WRAP addressing wraps it without edge clamping). The raw fbm is then
+// shaped into a cloud density with:
+//   - a coverage threshold (higher `coverage` -> lower threshold -> more cloud)
+//   - a vertical height falloff that fades density near the top and bottom of
+//     the box so clouds float in a slab rather than filling the whole cube.
+// Returns density >= 0 (0 = clear air).
+// --------------------------------------------------------------------------
+__device__ __forceinline__ float sample_density(cudaTextureObject_t tex,
+                                                 float px, float py, float pz,
+                                                 float coverage, float t) {
+    // World [-1, 1] -> texture [0, 1], plus a per-axis scroll proportional to t
+    // (x fastest, y slowest). Coordinates may leave [0, 1]; nothing clamps them here.
+    float tu = (px + 1.0f) * 0.5f + t * 0.015f;
+    float tv = (py + 1.0f) * 0.5f + t * 0.004f;
+    float tw = (pz + 1.0f) * 0.5f + t * 0.010f;
+    float raw = tex3D<float>(tex, tu, tv, tw);
+
+    // Coverage: the cutoff slides from 0.80 (coverage 0) to 0.15 (coverage 1); noise
+    // below it becomes clear air and the remainder is rescaled into [0, 1].
+    float cutoff = lerpf(0.80f, 0.15f, coverage);
+    float shaped = clampf((raw - cutoff) / fmaxf(1.0f - cutoff, 1e-3f), 0.0f, 1.0f);
+
+    // Height band: fade in over the bottom quarter of the box (h 0 -> 0.25) and
+    // fade out over the top 40% (h 0.6 -> 1), leaving full density in between.
+    float h = clampf((py + 1.0f) * 0.5f, 0.0f, 1.0f);   // [0,1] bottom->top
+    float fade_in  = smoothstepf(clampf(h * 4.0f, 0.0f, 1.0f));
+    float fade_out = smoothstepf(clampf((1.0f - h) * 2.5f, 0.0f, 1.0f));
+    float band = fade_in * fade_out;
+    return shaped * band;
+}
+
+// --------------------------------------------------------------------------
+// render_clouds: one thread per screen pixel. Volumetric ray march of the
+// density volume as participating media.
+//
+// Camera math (orbit, look-at origin, world-up (0, 1, 0)) matches the SDF
+// example. Per pixel:
+//   1. Build the ray, intersect the [-1, 1]^3 AABB (slab method).
+//   2. March front-to-back from the entry point. At each step sample density;
+//      if positive, do a SHORT secondary march toward the sun to estimate how
+//      much light reaches this sample (Beer-Lambert: exp(-sum*absorption)).
+//   3. Accumulate color and transmittance front-to-back. Early-out when
+//      transmittance < 0.01 (rest of the ray is occluded -> big speedup).
+//   4. Composite the accumulated cloud color over an analytic sky gradient
+//      (horizon-to-zenith blue + a sun glow), tonemap, write RGBA8.
+// --------------------------------------------------------------------------
+extern "C" __global__
+void render_clouds(unsigned char* output,
+                   int width,
+                   int height,
+                   cudaTextureObject_t tex,
+                   float yaw,
+                   float pitch,
+                   float dist,
+                   float sun_height,
+                   float coverage,
+                   float t) {
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    if (x >= width || y >= height) return;
+
+    // ---- Build the orbit camera basis ----------------------------------
+    float cp = cosf(pitch), sp = sinf(pitch);
+    float cyw = cosf(yaw),  syw = sinf(yaw);
+
+    float ex = dist * cp * cyw;
+    float ey = dist * sp;
+    float ez = dist * cp * syw;
+
+    float fl = length3(ex, ey, ez);
+    if (fl < 1e-6f) fl = 1e-6f;
+    float fx = -ex / fl, fy = -ey / fl, fz = -ez / fl;
+
+    // right = normalize(cross(fwd, world_up)), world_up = (0, 1, 0).
+    float rx = -fz;
+    float ry = 0.0f;
+    float rz = fx;
+    float rl = length3(rx, ry, rz);
+    if (rl < 1e-6f) rl = 1e-6f;
+    rx /= rl; ry /= rl; rz /= rl;
+
+    // up' = cross(right, fwd).
+    float ux = ry * fz - rz * fy;
+    float uy = rz * fx - rx * fz;
+    float uz = rx * fy - ry * fx;
+
+    // ---- Per-pixel ray direction ---------------------------------------
+    float u_ndc = 2.0f * ((float)x + 0.5f) / (float)width  - 1.0f;
+    float v_ndc = 2.0f * ((float)y + 0.5f) / (float)height - 1.0f;
+
+    const float TAN_HALF = 0.41421356237309515f;       // tanf(45deg / 2)
+    float aspect = (float)width / (float)height;
+
+    float dx = fx + u_ndc * aspect * TAN_HALF * rx + v_ndc * TAN_HALF * ux;
+    float dy = fy + u_ndc * aspect * TAN_HALF * ry + v_ndc * TAN_HALF * uy;
+    float dz = fz + u_ndc * aspect * TAN_HALF * rz + v_ndc * TAN_HALF * uz;
+    float dl = length3(dx, dy, dz);
+    if (dl < 1e-6f) dl = 1e-6f;
+    dx /= dl; dy /= dl; dz /= dl;
+
+    // ---- Sun direction from sun_height ---------------------------------
+    // sun_height in [0,1]: 0 -> near horizon, 1 -> overhead. Keep a fixed
+    // azimuth so the light feels stable while orbiting.
+    float sun_el = sun_height * 1.4707963f;            // up to ~84 degrees
+    float se = sinf(sun_el), ce = cosf(sun_el);
+    const float SUN_AZ = 0.7853981633974483f;          // 45 deg azimuth
+    float lx = ce * cosf(SUN_AZ);
+    float ly = se;
+    float lz = ce * sinf(SUN_AZ);
+    float ll = length3(lx, ly, lz);
+    if (ll < 1e-6f) ll = 1e-6f;
+    lx /= ll; ly /= ll; lz /= ll;
+
+    // ---- Ray vs. the [-1, 1]^3 box (slab method) -----------------------
+    float inv_dx = 1.0f / (fabsf(dx) > 1e-8f ? dx : (dx >= 0 ? 1e-8f : -1e-8f));
+    float inv_dy = 1.0f / (fabsf(dy) > 1e-8f ? dy : (dy >= 0 ? 1e-8f : -1e-8f));
+    float inv_dz = 1.0f / (fabsf(dz) > 1e-8f ? dz : (dz >= 0 ? 1e-8f : -1e-8f));
+    float t1x = (-1.0f - ex) * inv_dx, t2x = ( 1.0f - ex) * inv_dx;
+    float t1y = (-1.0f - ey) * inv_dy, t2y = ( 1.0f - ey) * inv_dy;
+    float t1z = (-1.0f - ez) * inv_dz, t2z = ( 1.0f - ez) * inv_dz;
+    float tNear = fmaxf(fmaxf(fminf(t1x, t2x), fminf(t1y, t2y)), fminf(t1z, t2z));
+    float tFar  = fminf(fminf(fmaxf(t1x, t2x), fmaxf(t1y, t2y)), fmaxf(t1z, t2z));
+
+    // Accumulators: front-to-back compositing. transmittance starts at 1
+    // (fully clear); accumulated radiance starts at 0.
+    float trans = 1.0f;
+    float acc_r = 0.0f, acc_g = 0.0f, acc_b = 0.0f;
+
+    // Cloud material + lighting constants.
+    const float ABSORPTION   = 6.0f;    // primary extinction per unit density
+    const float SUN_ABSORP   = 8.0f;    // shadow-ray extinction per unit density
+    const float STEP_LEN     = 2.0f / 96.0f;   // ~one voxel at 96^3
+    const int   PRIMARY_STEPS = 167;    // ceil(2*sqrt(3) / STEP_LEN): reaches across the box diagonal
+    const int   SHADOW_STEPS   = 6;
+    const float SHADOW_STEP_LEN = 0.06f;
+
+    // Henyey-Greenstein forward-scattering phase function. g>0 biases scatter
+    // toward the light direction, producing the bright "silver lining" rim when
+    // the view ray points toward the sun. cos(theta) = dot(view_dir, sun_dir);
+    // both are unit length here. phase = (1-g^2) / (4pi * (1+g^2-2g*cos)^1.5).
+    // The constant 1/(4pi) factor is folded into the lighting scale below, so
+    // we only keep the angular shape that drives the glow.
+    const float HG_G = 0.6f;
+    float cos_vl = dot3(dx, dy, dz, lx, ly, lz);
+    float hg_denom = 1.0f + HG_G * HG_G - 2.0f * HG_G * cos_vl;
+    float hg_phase = (1.0f - HG_G * HG_G) / (hg_denom * sqrtf(fmaxf(hg_denom, 1e-4f)));
+
+    if (tFar > fmaxf(tNear, 0.0f)) {
+        float tcur = fmaxf(tNear, 0.0f) + 1e-4f;
+
+        #pragma unroll 1
+        for (int i = 0; i < PRIMARY_STEPS; ++i) {
+            if (tcur > tFar) break;   // short chords stop here, so the larger step cap costs them nothing
+
+            float pxw = ex + tcur * dx;
+            float pyw = ey + tcur * dy;
+            float pzw = ez + tcur * dz;
+
+            float density = sample_density(tex, pxw, pyw, pzw, coverage, t);
+
+            if (density > 1e-3f) {
+                // ---- Secondary march toward the sun for self-shadowing ----
+                float shadow_sum = 0.0f;
+                #pragma unroll
+                for (int s = 1; s <= SHADOW_STEPS; ++s) {
+                    float st = (float)s * SHADOW_STEP_LEN;
+                    float sxw = pxw + lx * st;
+                    float syw = pyw + ly * st;
+                    float szw = pzw + lz * st;
+                    // Stop sampling outside the box (no density there anyway).
+                    if (fabsf(sxw) > 1.0f || fabsf(syw) > 1.0f || fabsf(szw) > 1.0f) {
+                        break;
+                    }
+                    shadow_sum += sample_density(tex, sxw, syw, szw, coverage, t);
+                }
+                float sun_trans = expf(-shadow_sum * SUN_ABSORP * SHADOW_STEP_LEN);
+
+                // Powder ("dark edge") term: thin cloud edges scatter less light
+                // back than a naive 1-exp model predicts, so darken low-density
+                // samples for fluffier, more rounded volumes. Saturates toward 1
+                // in dense cloud (cores stay bright); only thin edges are dimmed.
+                // Apply as a gentle modulation so cores keep full sunlight.
+                float powder = 0.4f + 0.6f * (1.0f - expf(-density * 3.0f));
+
+                // Beer-Lambert extinction for this slab of the primary ray.
+                float slab_trans = expf(-density * ABSORPTION * STEP_LEN);
+                float absorbed = trans * (1.0f - slab_trans);
+
+                // Direct sunlight reaching this sample, shaped by the HG phase so
+                // it spikes when looking toward the sun (silver lining). Add a
+                // small ambient floor so shadowed cores stay bluish, not black.
+                float sun_light = sun_trans * (0.4f + 1.6f * hg_phase) * powder;
+                float lit = clampf(0.15f + sun_light, 0.0f, 1.6f);
+                float cr = lerpf(0.42f, 1.05f, clampf(lit, 0.0f, 1.0f)) + 0.05f * fmaxf(lit - 1.0f, 0.0f);
+                float cg = lerpf(0.48f, 0.99f, clampf(lit, 0.0f, 1.0f)) + 0.04f * fmaxf(lit - 1.0f, 0.0f);
+                float cb = lerpf(0.62f, 0.92f, clampf(lit, 0.0f, 1.0f));
+
+                acc_r += absorbed * cr;
+                acc_g += absorbed * cg;
+                acc_b += absorbed * cb;
+                trans *= slab_trans;
+
+                if (trans < 0.01f) break;   // remaining ray fully occluded
+            }
+
+            tcur += STEP_LEN;
+        }
+    }
+
+    // ---- Analytic sky behind / through the clouds ----------------------
+    // Vertical gradient from a pale horizon to a deeper zenith blue, plus a
+    // soft sun glow where the ray direction aligns with the sun.
+    float up_amt = clampf(0.5f * (dy + 1.0f), 0.0f, 1.0f);
+    float sky_r = lerpf(0.70f, 0.18f, up_amt);
+    float sky_g = lerpf(0.80f, 0.34f, up_amt);
+    float sky_b = lerpf(0.92f, 0.62f, up_amt);
+
+    // Sun glow + a crisp sun disk. The broad glow uses a moderate power; the
+    // disk is a high-power lobe that reads as a bright, slightly warm sun.
+    float sun_dot = clampf(cos_vl, 0.0f, 1.0f);   // same view.sun cosine as the HG phase
+    float glow = powf(sun_dot, 64.0f);
+    float disk = powf(sun_dot, 2048.0f);
+    sky_r += glow * 0.8f + disk * 6.0f;
+    sky_g += glow * 0.7f + disk * 5.4f;
+    sky_b += glow * 0.5f + disk * 3.6f;
+
+    // Composite: accumulated cloud radiance over the sky weighted by the
+    // remaining transmittance.
+    float r = acc_r + trans * sky_r;
+    float g = acc_g + trans * sky_g;
+    float b = acc_b + trans * sky_b;
+
+    // Simple Reinhard tonemap to keep the sun glow from blowing out.
+    r = r / (1.0f + r);
+    g = g / (1.0f + g);
+    b = b / (1.0f + b);
+    // Mild gamma for a punchier image.
+    r = powf(clampf(r, 0.0f, 1.0f), 0.85f);
+    g = powf(clampf(g, 0.0f, 1.0f), 0.85f);
+    b = powf(clampf(b, 0.0f, 1.0f), 0.85f);
+
+    int idx = (y * width + x) * 4;
+    output[idx + 0] = (unsigned char)(r * 255.0f);
+    output[idx + 1] = (unsigned char)(g * 255.0f);
+    output[idx + 2] = (unsigned char)(b * 255.0f);
+    output[idx + 3] = 255;
+}
+"""
+
+# GLSL shaders -- these just display a texture on a fullscreen rectangle.
+# Nothing CUDA-specific here.
+
+VERTEX_SHADER_SOURCE = """#version 330 core
+in vec2 position;
+in vec2 texcoord;
+out vec2 v_texcoord;
+void main() {
+    gl_Position = vec4(position, 0.0, 1.0);
+    v_texcoord = texcoord;
+}
+"""
+
+FRAGMENT_SHADER_SOURCE = """#version 330 core
+in vec2 v_texcoord;
+out vec4 fragColor;
+uniform sampler2D tex;
+void main() {
+    fragColor = texture(tex, v_texcoord);
+}
+"""
+
+
+if __name__ == "__main__":
+    main()
diff --git a/cuda_core/examples/gl_interop_fluid.py b/cuda_core/examples/gl_interop_fluid.py
new file mode 100644
index 00000000000..1423580fcdb
--- /dev/null
+++ b/cuda_core/examples/gl_interop_fluid.py
@@ -0,0 +1,1251 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# ################################################################################
+#
+# This example demonstrates cuda.core.CUDAArray, TextureObject, and SurfaceObject
+# in combination with GraphicsResource for CUDA/OpenGL interop. It runs a
+# real-time Stable Fluids (Jos Stam) smoke/ink solver entirely on the GPU:
+# velocity, pressure, and dye fields live in ping-ponged CUDA arrays, are read
+# through TextureObjects with free hardware bilinear filtering (the heart of
+# semi-Lagrangian advection), and written back through SurfaceObjects. The dye
+# is colorized straight into an OpenGL PBO. Drag the mouse to inject swirling
+# ink. Requires pyglet.
+#
+# ################################################################################
+
+# What this example teaches
+# =========================
+# - How semi-Lagrangian advection uses tex2D LINEAR sampling: trace each cell
+#   backward along the velocity field and read the old quantity with free
+#   hardware bilinear interpolation (no manual lerp, no neighbor gather).
+# - How to drive several distinct kernels (advect, divergence, Jacobi pressure
+#   solve, gradient subtraction, dye advect, colorize) over a shared set of
+#   pre-created TextureObject/SurfaceObject handles, ping-ponging multiple
+#   fields without recreating handles per frame.
+# - How to fold live mouse input into a GPU simulation: capture the mouse delta
+#   and splat velocity + dye into the field via a SurfaceObject (in-place
+#   read-modify-write, one thread per cell -> no race).
+#
+# How it works
+# ============
+# Stam's "Stable Fluids" solves the incompressible Navier-Stokes equations on a
+# regular grid by splitting each step into stages that are each individually
+# stable:
+#
+#   1. ADVECT VELOCITY  - move the velocity field along itself. For each cell we
+#      back-trace its center one timestep against the local velocity and read
+#      the old velocity there with tex2D<float2> LINEAR (bilinear). This is the
+#      unconditionally-stable semi-Lagrangian scheme.
+#   2. SPLAT (input)    - add the mouse-drag velocity and a dab of dye in a soft
+#      radial brush around the cursor (in-place on the velocity/dye surfaces).
+#   3. DIVERGENCE       - compute div(velocity), the amount each cell is a
+#      source/sink. An incompressible fluid must have zero divergence.
+#   4. PRESSURE SOLVE   - Jacobi-iterate the Poisson equation lap(p) = div,
+#      ping-ponging two pressure buffers for ~30 iterations.
+#   5. SUBTRACT GRADIENT- v <- v - grad(p). This projects the velocity onto its
+#      divergence-free part, enforcing incompressibility.
+#   6. ADVECT DYE       - move the ink along the (now divergence-free) velocity,
+#      again with tex2D LINEAR back-tracing.
+#   7. COLORIZE         - map dye density through a vivid gradient into the PBO.
+#
+#   PING-PONG (read one array, write the other, then swap)
+#   ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#   +-----------+   tex2D<float2> LINEAR   +-------------+   surf2Dwrite   +-----------+
+#   |  vel_a    | -----------------------> |  advect /   | --------------> |  vel_b    |
+#   | (vx, vy)  |                          |  jacobi /   |                 | (vx, vy)  |
+#   +-----------+                          |  advect_dye |                 +-----------+
+#        ^                                 +-------------+                       |
+#        +-------------------------------- (swap) ------------------------------+
+#
+# Why LINEAR + CLAMP + normalized coords?
+# ---------------------------------------
+# Semi-Lagrangian advection traces a cell's center back to an arbitrary
+# fractional position and needs the interpolated field value there. LINEAR
+# filtering gives that bilinear interpolation for free in hardware. We use a
+# bounded box (CLAMP) rather than a torus so ink piles up against the walls
+# instead of wrapping. CLAMP, like all addressing modes, behaves cleanly with
+# normalized coordinates, and we sample at texel centers `(i + 0.5) / N` so a
+# zero-velocity cell reads back exactly its own value.
+#
+# Channel byte width in surf2Dwrite
+# ---------------------------------
+# `surf2Dwrite` takes the x coordinate in BYTES, not in elements. Velocity is a
+# `float2` (8 bytes) so its x offset is `x * sizeof(float2)`; pressure and
+# divergence are `float` (4 bytes, `x * sizeof(float)`); the dye is a `float4`
+# RGBA color (16 bytes, `x * sizeof(float4)`). Getting this wrong silently
+# corrupts every other column.
+#
+# What you should see
+# ===================
+# Big blobs of saturated color are dropped into the fluid every fraction of a
+# second and immediately billow, swirl, and mix into turbulent ribbons that
+# fill the window -- "ink dropped in water." Drag the mouse to paint your own
+# rainbow ink. Press R to clear, Escape to exit. The window title shows the
+# current FPS, pressure-iteration count, and live texture/surface config.
+#
+
+# /// script
+# dependencies = ["cuda_bindings", "cuda_core>0.6.0", "pyglet"]
+# ///
+
+import colorsys
+import ctypes
+import math
+import random
+import sys
+import time
+
+import numpy as np
+
+from cuda.core import (
+    AddressMode,
+    ArrayFormat,
+    CUDAArray,
+    Device,
+    FilterMode,
+    GraphicsResource,
+    LaunchConfig,
+    Program,
+    ProgramOptions,
+    ReadMode,
+    ResourceDescriptor,
+    SurfaceObject,
+    TextureDescriptor,
+    TextureObject,
+    launch,
+)
+
+# ---------------------------------------------------------------------------
+# Simulation parameters (feel free to change these)
+# ---------------------------------------------------------------------------
+WIDTH = 512  # window width; also the x extent the kernel launch grid covers
+HEIGHT = 512  # window height; also the y extent the kernel launch grid covers
+DT = 1.0  # simulation timestep
+PRESSURE_ITERS = 30  # Jacobi iterations for the pressure solve per frame
+VELOCITY_DISSIPATION = 0.999  # per-step velocity decay (1.0 = no decay)
+DYE_DISSIPATION = 0.994  # per-step dye decay; ink lingers and builds, then fades
+SPLAT_RADIUS = 24.0  # brush radius in cells for mouse injection
+SPLAT_FORCE = 6.0  # how strongly a mouse delta becomes velocity
+SPLAT_DYE = 1.0  # mouse ink intensity (color * this is deposited)
+CURL_SEED = 2.5  # strength of the ambient curl seeded on reset
+# Vorticity confinement pushes velocity back toward regions of high |curl|,
+# sharpening the swirls that numerical diffusion would otherwise smear out.
+# This is the single extra kernel that turns soft blobs into crisp curling
+# plumes. Tunable: ~0.1-0.3 reads well at DT=1.0; higher gets turbulent.
+VORTICITY = 0.28  # confinement strength (0.0 disables it)
+
+# Auto-bursts keep the simulation alive and colorful without any input: when
+# the mouse is idle we periodically drop a big blob of a random bright color
+# with a random velocity impulse at a random spot -- the classic "ink dropped
+# in water" look that quickly fills the frame with billowing, swirling color.
+# Grab the cursor and drag to paint your own ink.
+AUTO_EMIT = True
+BURST_INTERVAL = 0.45  # seconds between automatic colored bursts
+BURSTS_PER_EVENT = 2  # blobs dropped each burst event
+BURST_RADIUS = 42.0  # blob radius in cells (big, soft)
+BURST_FORCE = 18.0  # velocity impulse magnitude per blob
+BURST_DYE = 1.2  # ink intensity per blob (random color * this)
+
+# This solver advances one step per displayed frame, so its per-step rates
+# (dissipation, advection distance) would otherwise depend on the frame rate --
+# on a fast GPU the dye would dissipate away almost instantly between bursts.
+# We make it frame-rate INDEPENDENT instead: every frame, the real elapsed time
+# is expressed in units of a REF_FPS reference step and the dissipation and
+# advection distance are scaled by it, so the ink evolves at the same wall-clock
+# rate (and looks the same) whether the loop runs at 60 or 2000 FPS. Running
+# faster just means more, smaller, smoother substeps.
+REF_FPS = 60.0
+
+
+# ============================= Helper functions =============================
+#
+# The functions below set up CUDA and OpenGL. If you're here to learn about
+# CUDAArray/TextureObject/SurfaceObject, skip ahead to main() -- the interesting
+# part is there. These helpers exist so that main() reads like a short story
+# instead of a wall of boilerplate.
+# ============================================================================
+
+
+def setup_cuda():
+    """Compile the CUDA kernels and return (device, stream, kernels, configs).
+
+    Returns a dict of kernels keyed by name and a shared LaunchConfig (every
+    kernel is pixel-parallel over the same WIDTH x HEIGHT grid).
+    """
+    dev = Device(0)
+    dev.set_current()
+
+    # SurfaceObject requires surface load/store, which has existed since SM 2.0,
+    # but bindless surface objects (cuSurfObjectCreate) require SM 3.0+.
+    cc = dev.compute_capability
+    if cc.major < 3:
+        print(
+            "This example requires a GPU with compute capability >= 3.0 for "
+            f"bindless surface objects. Found sm_{cc.major}{cc.minor}.",
+            file=sys.stderr,
+        )
+        sys.exit(1)
+
+    stream = dev.create_stream()
+
+    # Compile as C++ so the templated tex2D<float2> overload resolves.
+    program_options = ProgramOptions(std="c++17", arch=f"sm_{dev.arch}")
+    prog = Program(KERNEL_SOURCE, code_type="c++", options=program_options)
+    mod = prog.compile(
+        "cubin",
+        name_expressions=(
+            "seed_field",
+            "splat",
+            "advect_velocity",
+            "vorticity_confinement",
+            "divergence",
+            "pressure_jacobi",
+            "subtract_gradient",
+            "advect_dye",
+            "colorize",
+        ),
+    )
+
+    kernels = {
+        "seed": mod.get_kernel("seed_field"),
+        "splat": mod.get_kernel("splat"),
+        "advect_vel": mod.get_kernel("advect_velocity"),
+        "vorticity": mod.get_kernel("vorticity_confinement"),
+        "divergence": mod.get_kernel("divergence"),
+        "jacobi": mod.get_kernel("pressure_jacobi"),
+        "subtract": mod.get_kernel("subtract_gradient"),
+        "advect_dye": mod.get_kernel("advect_dye"),
+        "colorize": mod.get_kernel("colorize"),
+    }
+
+    block = (16, 16, 1)
+    grid = (
+        (WIDTH + block[0] - 1) // block[0],
+        (HEIGHT + block[1] - 1) // block[1],
+        1,
+    )
+    config = LaunchConfig(grid=grid, block=block)
+
+    return dev, stream, kernels, config
+
+
+def create_window():
+    """Open a pyglet window and return (window, gl_module, pyglet)."""
+    try:
+        import pyglet
+        from pyglet.gl import gl as _gl
+    except ImportError:
+        print(
+            "This example requires pyglet >= 2.0.\nInstall it with:  pip install pyglet",
+            file=sys.stderr,
+        )
+        sys.exit(1)
+
+    window = pyglet.window.Window(
+        WIDTH,
+        HEIGHT,
+        caption="cuda.core CUDAArray/Texture/Surface - Stable Fluids",
+        vsync=False,
+    )
+    return window, _gl, pyglet
+
+
+def create_display_resources(gl, width, height):
+    """Create the GL objects needed to show a texture on screen.
+
+    This sets up a shader program, a fullscreen quad, and an empty texture.
+    None of this is CUDA-specific -- it's standard OpenGL boilerplate for
+    rendering a textured quad.
+
+    Returns (shader_program, vertex_array_id, texture_id). The shader_program
+    is a pyglet ShaderProgram object (must be kept alive).
+    """
+    from pyglet.graphics.shader import Shader, ShaderProgram
+
+    # Shader program -- just passes texture coordinates through
+    vert = Shader(VERTEX_SHADER_SOURCE, "vertex")
+    frag = Shader(FRAGMENT_SHADER_SOURCE, "fragment")
+    shader_prog = ShaderProgram(vert, frag)
+
+    # Fullscreen quad (two triangles covering the entire window)
+    quad_verts = np.array(
+        [
+            # x,  y,    s, t      (position + texture coordinate)
+            -1,  # triangle 1: bottom-left
+            -1,
+            0,
+            0,
+            1,  # triangle 1: bottom-right
+            -1,
+            1,
+            0,
+            1,  # triangle 1: top-right
+            1,
+            1,
+            1,
+            -1,  # triangle 2: bottom-left
+            -1,
+            0,
+            0,
+            1,  # triangle 2: top-right
+            1,
+            1,
+            1,
+            -1,  # triangle 2: top-left
+            1,
+            0,
+            1,
+        ],
+        dtype=np.float32,
+    )
+
+    vao = ctypes.c_uint(0)
+    gl.glGenVertexArrays(1, ctypes.byref(vao))
+    gl.glBindVertexArray(vao.value)
+
+    vbo = ctypes.c_uint(0)
+    gl.glGenBuffers(1, ctypes.byref(vbo))
+    gl.glBindBuffer(gl.GL_ARRAY_BUFFER, vbo.value)
+    gl.glBufferData(
+        gl.GL_ARRAY_BUFFER,
+        quad_verts.nbytes,
+        quad_verts.ctypes.data_as(ctypes.c_void_p),
+        gl.GL_STATIC_DRAW,  # uploaded once here; this function never rewrites it
+    )
+
+    stride = 4 * 4  # 4 floats * 4 bytes each = 16 bytes per vertex
+    pos_loc = gl.glGetAttribLocation(shader_prog.id, b"position")
+    gl.glEnableVertexAttribArray(pos_loc)
+    gl.glVertexAttribPointer(pos_loc, 2, gl.GL_FLOAT, gl.GL_FALSE, stride, ctypes.c_void_p(0))
+    # The (s, t) pair sits 8 bytes into each vertex, after the two position floats.
+    tc_loc = gl.glGetAttribLocation(shader_prog.id, b"texcoord")
+    gl.glEnableVertexAttribArray(tc_loc)
+    gl.glVertexAttribPointer(tc_loc, 2, gl.GL_FLOAT, gl.GL_FALSE, stride, ctypes.c_void_p(8))
+
+    gl.glBindVertexArray(0)
+
+    # Empty texture (will be filled each frame from the PBO)
+    tex = ctypes.c_uint(0)
+    gl.glGenTextures(1, ctypes.byref(tex))
+    gl.glBindTexture(gl.GL_TEXTURE_2D, tex.value)
+    gl.glTexParameteri(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_MIN_FILTER, gl.GL_LINEAR)
+    gl.glTexParameteri(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_MAG_FILTER, gl.GL_LINEAR)
+    gl.glTexImage2D(
+        gl.GL_TEXTURE_2D,
+        0,  # mip level
+        gl.GL_RGBA8,  # internal format
+        width,
+        height,
+        0,  # border
+        gl.GL_RGBA,
+        gl.GL_UNSIGNED_BYTE,
+        None,  # no initial pixel data: allocate storage only
+    )
+
+    return shader_prog, vao.value, tex.value
+
+
+def create_pixel_buffer(gl, width, height):
+    """Create a Pixel Buffer Object (PBO) -- the bridge between CUDA and OpenGL.
+
+    A PBO is a GPU-side buffer that OpenGL can read from when uploading pixels
+    to a texture. By registering this same buffer with CUDA, the CUDA kernel
+    can write directly into it.
+
+    Returns (pbo_gl_name, size_in_bytes).
+    """
+    pbo = ctypes.c_uint(0)
+    gl.glGenBuffers(1, ctypes.byref(pbo))
+    gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, pbo.value)
+    nbytes = width * height * 4  # RGBA, 1 byte per channel
+    gl.glBufferData(gl.GL_PIXEL_UNPACK_BUFFER, nbytes, None, gl.GL_DYNAMIC_DRAW)
+    gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, 0)
+    return pbo.value, nbytes
+
+
+def copy_pbo_to_texture(gl, pbo_id, tex_id, width, height):
+    """Copy width x height RGBA8 pixels from the PBO into the GL texture (GPU-to-GPU)."""
+    gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, pbo_id)
+    gl.glBindTexture(gl.GL_TEXTURE_2D, tex_id)
+    gl.glTexSubImage2D(
+        gl.GL_TEXTURE_2D,
+        0,  # mip level
+        0,  # x offset
+        0,  # y offset
+        width,
+        height,
+        gl.GL_RGBA,
+        gl.GL_UNSIGNED_BYTE,
+        None,  # None = read from the currently bound PBO, not from CPU
+    )
+    gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, 0)  # unbind the PBO again
+
+
+def draw_fullscreen_quad(gl, shader_prog, vao_id, tex_id):
+    """Draw the texture to the screen using the fullscreen quad."""
+    gl.glUseProgram(shader_prog.id)
+    gl.glBindTexture(gl.GL_TEXTURE_2D, tex_id)
+    gl.glBindVertexArray(vao_id)
+    gl.glDrawArrays(gl.GL_TRIANGLES, 0, 6)  # 6 vertices = the quad's two triangles
+    gl.glBindVertexArray(0)  # unbind the VAO and the program before returning
+    gl.glUseProgram(0)
+
+
+# ============================ API MAP (cuda.core) ===========================
+#
+# The helpers below are where every CUDAArray / ResourceDescriptor /
+# TextureDescriptor / TextureObject / SurfaceObject knob in this example is set.
+# Each visible setting maps to a concrete piece of cuda.core / CUDA behavior:
+#
+#   CUDAArray.from_descriptor(...)   -> allocates a CUDA *array* (opaque, tiled
+#                                       layout optimized for 2D texture fetches),
+#                                       not linear device memory.
+#   ArrayFormat.FLOAT32              -> each channel is a 32-bit float texel.
+#   num_channels=2 / 1 / 4           -> float2 (vx, vy), scalar (pressure /
+#                                       divergence), or float4 (RGBA dye); also
+#                                       fixes the surf2Dwrite byte offset per element.
+#   is_surface_load_store=True       -> the SAME array can be bound both as a
+#                                       TextureObject (cached, filtered READS)
+#                                       and as a SurfaceObject (raw WRITES). This
+#                                       is what lets each field be sampled and
+#                                       then written back in the ping-pong.
+#
+#   ResourceDescriptor.from_array(arr) -> wraps the CUDAArray as the resource a
+#                                         TextureObject reads from.
+#   FilterMode.LINEAR                -> free HARDWARE bilinear interpolation;
+#                                       this is what makes semi-Lagrangian
+#                                       advection a single tex2D fetch at a
+#                                       fractional back-traced position (no
+#                                       manual lerp, no neighbor gather).
+#   AddressMode.CLAMP                -> bounded box boundary: out-of-range traces
+#                                       read the edge texel (ink piles up at the
+#                                       walls instead of wrapping like a torus).
+#   ReadMode.ELEMENT_TYPE            -> return the stored float value as-is (no
+#                                       integer->[0,1] normalization of texels).
+#   normalized_coords=True           -> sample in [0, 1) so CLAMP is well-defined
+#                                       and texel centers are (i + 0.5) / N.
+#
+#   SurfaceObject.from_array(arr)    -> binds the array for surf2Dread/surf2Dwrite.
+#                                       The x coordinate is in BYTES, so it is
+#                                       x * sizeof(elem): sizeof(float2)=8 for
+#                                       velocity, sizeof(float)=4 for the scalars.
+# ============================================================================
+
+
+def make_velocity_array():
+    """Allocate a WIDTH x HEIGHT `float2` velocity CUDA array (channel 0 = vx, channel 1 = vy)."""
+    return CUDAArray.from_descriptor(
+        shape=(WIDTH, HEIGHT),
+        format=ArrayFormat.FLOAT32,
+        num_channels=2,
+        is_surface_load_store=True,
+    )
+
+
+def make_scalar_array():
+    """Allocate a single-channel `float` CUDA array (used for pressure and divergence)."""
+    return CUDAArray.from_descriptor(
+        shape=(WIDTH, HEIGHT),
+        format=ArrayFormat.FLOAT32,
+        num_channels=1,
+        is_surface_load_store=True,
+    )
+
+
+def make_color_array():
+    """Allocate a four-channel `float4` (RGBA) dye CUDA array.
+
+    The dye carries a full color per cell (not just a density), so different
+    bursts inject different hues that advect and mix. It uses the same texture
+    sampling and surface-write path as the other fields -- only the channel
+    count (and the surf2Dwrite byte stride, sizeof(float4) = 16) differ.
+    """
+    return CUDAArray.from_descriptor(
+        shape=(WIDTH, HEIGHT),
+        format=ArrayFormat.FLOAT32,
+        num_channels=4,
+        is_surface_load_store=True,
+    )
+
+
+def make_texture(arr):
+    """Return a TextureObject over `arr`: LINEAR + CLAMP + normalized coords.
+
+    One descriptor serves every read in this example: semi-Lagrangian advection
+    needs the bilinear interpolation, and the stencil reads (divergence, Jacobi,
+    gradient) sample exactly at texel centers so LINEAR returns the exact value.
+    """
+    res_desc = ResourceDescriptor.from_array(arr)
+    tex_desc = TextureDescriptor(
+        address_mode=AddressMode.CLAMP,
+        filter_mode=FilterMode.LINEAR,
+        read_mode=ReadMode.ELEMENT_TYPE,
+        # Normalized coordinates keep CLAMP addressing well-defined and let us
+        # sample at texel centers as (i + 0.5) / N.
+        normalized_coords=True,
+    )
+    return TextureObject.from_descriptor(resource=res_desc, texture_descriptor=tex_desc)
+
+
+def seed_field(stream, kernels, config, vel_surf, dye_surf, prs_surf, seed_value):
+    """Launch the `seed` kernel: ambient curl (CURL_SEED) in velocity, zero pressure/dye.
+
+    Pass long-lived SurfaceObjects (not freshly created ones): `launch` is
+    async, so a SurfaceObject created inside a `with` block that closes right
+    after `launch` returns would destroy the handle before the kernel runs.
+    """
+    launch(
+        stream,
+        config,
+        kernels["seed"],
+        np.uint64(vel_surf.handle),
+        np.uint64(dye_surf.handle),
+        np.uint64(prs_surf.handle),
+        np.int32(WIDTH),
+        np.int32(HEIGHT),
+        np.float32(CURL_SEED),
+        np.uint32(seed_value),
+    )
+
+
+# ================================== main() ==================================
+
+
+def main():
+    """Set up CUDA, the window and all GPU resources, then run the pyglet event loop."""
+    # --- Step 1: Set up CUDA (compile kernels, create stream) ---
+    dev, stream, kernels, config = setup_cuda()
+
+    # --- Step 2: Open a window ---
+    window, gl, pyglet = create_window()
+
+    # --- Step 3: Create GL resources for drawing a texture to screen ---
+    #     (Standard OpenGL boilerplate -- not CUDA-specific.)
+    shader_prog, quad_vao, tex_id = create_display_resources(gl, WIDTH, HEIGHT)
+
+    # --- Step 4: Create the Pixel Buffer Object (PBO) ---
+    #     The PBO is GPU memory owned by OpenGL. It's the bridge between the
+    #     two worlds: CUDA writes into it, OpenGL reads from it.
+    pbo_id, _ = create_pixel_buffer(gl, WIDTH, HEIGHT)
+
+    # --- Step 5: Register the PBO with CUDA ---
+    resource = GraphicsResource.from_gl_buffer(pbo_id, flags="write_discard")
+
+    # --- Step 6: Allocate the simulation fields ---
+    #     velocity (float2) and dye (float4 RGBA) ping-pong; pressure (float)
+    #     ping-pongs across Jacobi iterations; divergence (float) is a single
+    #     scratch target written once per frame.
+    vel_a = make_velocity_array()
+    vel_b = make_velocity_array()
+    prs_a = make_scalar_array()
+    prs_b = make_scalar_array()
+    div = make_scalar_array()
+    dye_a = make_color_array()
+    dye_b = make_color_array()
+
+    # --- Step 7: Pre-create every bindless handle ONCE ---
+    #     Creating texture/surface objects is comparatively expensive, and they
+    #     must outlive the async launches that reference them, so we build them
+    #     up front and keep them alive for the whole run.
+    #     API MAP: make_texture binds an array as a read-only TextureObject
+    #     (LINEAR + CLAMP + normalized; see the API MAP block above), and
+    #     SurfaceObject.from_array binds the SAME array for surf2Dwrite writes.
+    vel_tex_a = make_texture(vel_a)
+    vel_tex_b = make_texture(vel_b)
+    vel_surf_a = SurfaceObject.from_array(vel_a)
+    vel_surf_b = SurfaceObject.from_array(vel_b)
+
+    prs_tex_a = make_texture(prs_a)
+    prs_tex_b = make_texture(prs_b)
+    prs_surf_a = SurfaceObject.from_array(prs_a)
+    prs_surf_b = SurfaceObject.from_array(prs_b)
+
+    div_tex = make_texture(div)
+    div_surf = SurfaceObject.from_array(div)
+
+    dye_tex_a = make_texture(dye_a)
+    dye_tex_b = make_texture(dye_b)
+    dye_surf_a = SurfaceObject.from_array(dye_a)
+    dye_surf_b = SurfaceObject.from_array(dye_b)
+
+    # --- Step 8: Seed the initial field (curl into vel_a, zero pressure/dye) ---
+    seed_field(stream, kernels, config, vel_surf_a, dye_surf_a, prs_surf_a, seed_value=0)
+    stream.sync()
+
+    # `vel` / `dye` track which ping-pong array currently holds the live state.
+    state = {"vel": "a", "dye": "a", "seed": 0, "next_burst": 0.0}
+
+    # Mouse state shared with the event handlers. Coordinates are in SIMULATION
+    # space (y = 0 at top); the framebuffer has y = 0 at the bottom, so we flip.
+    mouse = {"down": False, "x": 0.0, "y": 0.0, "dx": 0.0, "dy": 0.0}
+
+    def vel_pair():
+        # Read live velocity, write the other buffer; returns (read_tex, write_surf, next).
+        if state["vel"] == "a":
+            return vel_tex_a, vel_surf_b, "b"
+        return vel_tex_b, vel_surf_a, "a"
+
+    def vel_live_tex():
+        return vel_tex_a if state["vel"] == "a" else vel_tex_b
+
+    def vel_live_surf():
+        return vel_surf_a if state["vel"] == "a" else vel_surf_b
+
+    def dye_pair():
+        if state["dye"] == "a":
+            return dye_tex_a, dye_surf_b, "b"
+        return dye_tex_b, dye_surf_a, "a"
+
+    def dye_live_tex():
+        return dye_tex_a if state["dye"] == "a" else dye_tex_b
+
+    def dye_live_surf():
+        return dye_surf_a if state["dye"] == "a" else dye_surf_b
+
+    # --- Step 9: Render loop ---
+    start_time = time.monotonic()
+    frame_count = 0
+    fps_time = start_time
+    clock = {"last": start_time}  # wall-clock time of the previous frame
+
+    def _window_to_sim(x, y):
+        # Window: y = 0 at bottom. Simulation: y = 0 at top. Flip vertically.
+        sx = float(x)
+        sy = float(HEIGHT - 1 - y)
+        return sx, sy
+
+    @window.event
+    def on_key_press(symbol, _modifiers):
+        key = pyglet.window.key
+        if symbol == key.ESCAPE:
+            window.close()
+            return
+        if symbol == key.R:
+            state["seed"] += 1
+            seed_field(
+                stream,
+                kernels,
+                config,
+                vel_surf_a,
+                dye_surf_a,
+                prs_surf_a,
+                seed_value=state["seed"],
+            )
+            state["vel"] = "a"
+            state["dye"] = "a"
+            return
+
+    @window.event
+    def on_mouse_press(x, y, _button, _modifiers):
+        mouse["down"] = True
+        mouse["x"], mouse["y"] = _window_to_sim(x, y)
+        mouse["dx"] = 0.0
+        mouse["dy"] = 0.0
+
+    @window.event
+    def on_mouse_release(_x, _y, _button, _modifiers):
+        mouse["down"] = False
+        mouse["dx"] = 0.0
+        mouse["dy"] = 0.0
+
+    @window.event
+    def on_mouse_drag(x, y, dx, dy, _buttons, _modifiers):
+        # The mouse delta IS the injected velocity. Framebuffer dy is up-positive
+        # while simulation y is down-positive, so the sim-space delta is -dy.
+        mouse["down"] = True
+        mouse["x"], mouse["y"] = _window_to_sim(x, y)
+        mouse["dx"] = float(dx)
+        mouse["dy"] = float(-dy)
+
+    @window.event
+    def on_draw():
+        nonlocal frame_count, fps_time
+
+        window.clear()
+        now_t = time.monotonic()
+        elapsed = now_t - start_time
+
+        # Frame-rate independence: express this frame's real duration in units of
+        # a REF_FPS reference step. `step` scales the advection distance, and the
+        # per-step dissipations are raised to `step` so their per-SECOND rate is
+        # constant no matter how fast the loop runs. Clamp to absorb the first
+        # frame and any hitch without launching a giant (unstable-looking) step.
+        dt_real = now_t - clock["last"]
+        clock["last"] = now_t
+        step = min(max(dt_real * REF_FPS, 0.0), 3.0)
+        dt_adv = DT * step
+        vel_diss = VELOCITY_DISSIPATION**step
+        dye_diss = DYE_DISSIPATION**step
+
+        # (a) Advect velocity along itself (semi-Lagrangian, tex2D LINEAR).
+        vel_read, vel_write, vel_next = vel_pair()
+        launch(
+            stream,
+            config,
+            kernels["advect_vel"],
+            np.uint64(vel_read.handle),
+            np.uint64(vel_write.handle),
+            np.int32(WIDTH),
+            np.int32(HEIGHT),
+            np.float32(dt_adv),
+            np.float32(vel_diss),
+        )
+        state["vel"] = vel_next
+
+        # (b) Splat mouse-drag velocity and colored dye into the live fields.
+        #     The injected color cycles through hues over time so dragging
+        #     paints a rainbow ribbon of ink.
+        inject = 1 if mouse["down"] else 0
+        mr, mg, mb = colorsys.hsv_to_rgb((elapsed * 0.15) % 1.0, 0.85, 1.0)
+        launch(
+            stream,
+            config,
+            kernels["splat"],
+            np.uint64(vel_live_surf().handle),
+            np.uint64(dye_live_surf().handle),
+            np.int32(WIDTH),
+            np.int32(HEIGHT),
+            np.float32(mouse["x"]),
+            np.float32(mouse["y"]),
+            np.float32(mouse["dx"] * SPLAT_FORCE),
+            np.float32(mouse["dy"] * SPLAT_FORCE),
+            np.float32(SPLAT_RADIUS),
+            np.float32(mr * SPLAT_DYE),
+            np.float32(mg * SPLAT_DYE),
+            np.float32(mb * SPLAT_DYE),
+            np.int32(inject),
+        )
+
+        # (b2) When the user is not dragging, periodically drop big blobs of a
+        #      random bright color with a random velocity impulse at random
+        #      spots -- the classic "ink in water" look. Reuses the same `splat`
+        #      kernel as the mouse, just with a color argument.
+        if AUTO_EMIT and not mouse["down"] and elapsed >= state["next_burst"]:
+            state["next_burst"] = elapsed + BURST_INTERVAL
+            for _ in range(BURSTS_PER_EVENT):
+                bx = random.uniform(0.12, 0.88) * WIDTH
+                by = random.uniform(0.12, 0.88) * HEIGHT
+                ang = random.uniform(0.0, 2.0 * math.pi)
+                bfx = math.cos(ang) * BURST_FORCE
+                bfy = math.sin(ang) * BURST_FORCE
+                br, bg, bb = colorsys.hsv_to_rgb(random.random(), 0.9, 1.0)
+                launch(
+                    stream,
+                    config,
+                    kernels["splat"],
+                    np.uint64(vel_live_surf().handle),
+                    np.uint64(dye_live_surf().handle),
+                    np.int32(WIDTH),
+                    np.int32(HEIGHT),
+                    np.float32(bx),
+                    np.float32(by),
+                    np.float32(bfx),
+                    np.float32(bfy),
+                    np.float32(BURST_RADIUS),
+                    np.float32(br * BURST_DYE),
+                    np.float32(bg * BURST_DYE),
+                    np.float32(bb * BURST_DYE),
+                    np.int32(1),
+                )
+
+        # (b3) Vorticity confinement: read the live velocity through its
+        #      TextureObject, compute curl + grad|curl|, and add a force that
+        #      pushes velocity back toward high-vorticity regions -- this is the
+        #      one extra kernel that sharpens the curling plumes. Like
+        #      advect_velocity, it reads neighbor velocities, so it MUST
+        #      ping-pong (read old buffer, write the other) -- aliasing a
+        #      texture read with a surface write of the same array in one launch
+        #      is undefined.
+        if VORTICITY > 0.0:
+            vort_read, vort_write, vort_next = vel_pair()
+            launch(
+                stream,
+                config,
+                kernels["vorticity"],
+                np.uint64(vort_read.handle),
+                np.uint64(vort_write.handle),
+                np.int32(WIDTH),
+                np.int32(HEIGHT),
+                np.float32(dt_adv),
+                np.float32(VORTICITY),
+            )
+            state["vel"] = vort_next
+
+        # (c) Compute divergence of the live velocity field.
+        launch(
+            stream,
+            config,
+            kernels["divergence"],
+            np.uint64(vel_live_tex().handle),
+            np.uint64(div_surf.handle),
+            np.int32(WIDTH),
+            np.int32(HEIGHT),
+        )
+
+        # (d) Pressure solve: Jacobi-iterate lap(p) = div, ping-ponging pressure.
+        #     Each frame starts from zero pressure via the kernel's `clear` flag.
+        launch(
+            stream,
+            config,
+            kernels["jacobi"],
+            np.uint64(prs_tex_a.handle),  # ignored on the first pass via clear flag
+            np.uint64(div_tex.handle),
+            np.uint64(prs_surf_b.handle),
+            np.int32(WIDTH),
+            np.int32(HEIGHT),
+            np.int32(1),  # clear: treat the previous pressure as zero
+        )
+        # After the clearing pass the result lives in prs_b. Continue iterating.
+        prs_cur = "b"
+        for _ in range(PRESSURE_ITERS - 1):
+            if prs_cur == "b":
+                read_tex, write_surf, prs_cur = prs_tex_b, prs_surf_a, "a"
+            else:
+                read_tex, write_surf, prs_cur = prs_tex_a, prs_surf_b, "b"
+            launch(
+                stream,
+                config,
+                kernels["jacobi"],
+                np.uint64(read_tex.handle),
+                np.uint64(div_tex.handle),
+                np.uint64(write_surf.handle),
+                np.int32(WIDTH),
+                np.int32(HEIGHT),
+                np.int32(0),  # do not clear: read the previous pressure
+            )
+        # `prs_cur` now names the buffer holding the converged pressure.
+        prs_final_tex = prs_tex_a if prs_cur == "a" else prs_tex_b
+
+        # (e) Subtract pressure gradient from the live velocity (in-place).
+        launch(
+            stream,
+            config,
+            kernels["subtract"],
+            np.uint64(prs_final_tex.handle),
+            np.uint64(vel_live_surf().handle),
+            np.int32(WIDTH),
+            np.int32(HEIGHT),
+        )
+
+        # (f) Advect the dye along the (now divergence-free) velocity field.
+        dye_read, dye_write, dye_next = dye_pair()
+        launch(
+            stream,
+            config,
+            kernels["advect_dye"],
+            np.uint64(dye_read.handle),
+            np.uint64(vel_live_tex().handle),
+            np.uint64(dye_write.handle),
+            np.int32(WIDTH),
+            np.int32(HEIGHT),
+            np.float32(dt_adv),
+            np.float32(dye_diss),
+        )
+        state["dye"] = dye_next
+
+        # (g) Colorize the latest dye into the OpenGL PBO.
+        with resource.map(stream=stream) as buf:
+            launch(
+                stream,
+                config,
+                kernels["colorize"],
+                np.uint64(dye_live_tex().handle),
+                buf.handle,
+                np.int32(WIDTH),
+                np.int32(HEIGHT),
+            )
+        # Unmap happens automatically when the `with` block exits.
+
+        # (h) Tell OpenGL to copy the PBO contents into our texture.
+        copy_pbo_to_texture(gl, pbo_id, tex_id, WIDTH, HEIGHT)
+
+        # (i) Draw the texture to the screen.
+        draw_fullscreen_quad(gl, shader_prog, quad_vao, tex_id)
+
+        # Reset the per-frame mouse delta so a held-still cursor stops pushing.
+        mouse["dx"] = 0.0
+        mouse["dy"] = 0.0
+
+        # FPS counter (shown in window title)
+        frame_count += 1
+        now = time.monotonic()
+        if now - fps_time >= 1.0:
+            fps = frame_count / (now - fps_time)
+            window.set_caption(
+                "cuda.core CUDAArray/Texture/Surface - Stable Fluids"
+                f" ({WIDTH}x{HEIGHT}, {fps:.0f} FPS,"
+                f" {PRESSURE_ITERS} pressure iters)"
+                " | TextureObject[LINEAR|CLAMP|norm|float2]"
+                " + SurfaceObject writes + GraphicsResource(PBO)"
+            )
+            frame_count = 0
+            fps_time = now
+
+    @window.event
+    def on_close():
+        # Drain in-flight kernels first, then release explicitly (pyglet owns
+        # the event loop): GL resource, handles, arrays, and finally the stream.
+        stream.sync()
+        resource.close()
+        dye_tex_a.close()
+        dye_tex_b.close()
+        dye_surf_a.close()
+        dye_surf_b.close()
+        div_tex.close()
+        div_surf.close()
+        prs_tex_a.close()
+        prs_tex_b.close()
+        prs_surf_a.close()
+        prs_surf_b.close()
+        vel_tex_a.close()
+        vel_tex_b.close()
+        vel_surf_a.close()
+        vel_surf_b.close()
+        dye_a.close()
+        dye_b.close()
+        div.close()
+        prs_a.close()
+        prs_b.close()
+        vel_a.close()
+        vel_b.close()
+        stream.close()
+
+    # Render as fast as the GPU allows; the per-step rates are scaled by real
+    # elapsed time (see REF_FPS) so the look is frame-rate independent.
+    pyglet.app.run(interval=0)
+
+
+# ======================== GPU code (CUDA + GLSL) ============================
+#
+# These source strings are kept at the bottom of the file so they don't
+# distract from the Python logic above. The important things to know:
+#
+#   - KERNEL_SOURCE contains the nine CUDA C++ kernels of the Stable Fluids
+#     pipeline. Reads go through cudaTextureObject_t (LINEAR + CLAMP +
+#     normalized coords); writes go through cudaSurfaceObject_t with the x
+#     offset in BYTES. Small sample_* helpers convert pixel coords to
+#     normalized texel-center coords.
+#
+#   - VERTEX_SHADER_SOURCE / FRAGMENT_SHADER_SOURCE are GLSL. They draw a
+#     texture onto a rectangle covering the entire window. Nothing interesting.
+#
+# ============================================================================
+
+KERNEL_SOURCE = r"""
+// Sample a float2 (velocity) field at pixel center (px, py) with bilinear
+// filtering. CLAMP addressing keeps out-of-range traces at the border.
+__device__ __forceinline__
+float2 sample_vec(cudaTextureObject_t tex, float px, float py,
+                  int width, int height) {
+    float u = (px + 0.5f) / (float)width;
+    float v = (py + 0.5f) / (float)height;
+    return tex2D<float2>(tex, u, v);
+}
+
+// Sample a scalar (float) field at pixel center (px, py) with bilinear filtering.
+__device__ __forceinline__
+float sample_scalar(cudaTextureObject_t tex, float px, float py,
+                    int width, int height) {
+    float u = (px + 0.5f) / (float)width;
+    float v = (py + 0.5f) / (float)height;
+    return tex2D<float>(tex, u, v);
+}
+
+// Sample a float4 (RGBA dye) field at pixel center with bilinear filtering.
+__device__ __forceinline__
+float4 sample_color(cudaTextureObject_t tex, float px, float py,
+                    int width, int height) {
+    float u = (px + 0.5f) / (float)width;
+    float v = (py + 0.5f) / (float)height;
+    return tex2D<float4>(tex, u, v);
+}
+
+extern "C"
+__global__
+void seed_field(cudaSurfaceObject_t vel_surf,
+                cudaSurfaceObject_t dye_surf,
+                cudaSurfaceObject_t prs_surf,
+                int width, int height,
+                float curl, unsigned int seed) {
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    if (x >= width || y >= height) return;
+
+    // Seed a gentle global rotation: velocity perpendicular to the radius from
+    // the center gives a curl, so even with no mouse input there is motion.
+    float cx = width * 0.5f;
+    float cy = height * 0.5f;
+    float rx = (x - cx) / cx;   // ~[-1, 1]
+    float ry = (y - cy) / cy;
+    float2 vel = make_float2(-ry * curl, rx * curl);
+
+    // A touch of deterministic noise so successive resets look a little
+    // different and to break perfect symmetry.
+    unsigned int h = (unsigned int)x * 374761393u +
+                     (unsigned int)y * 668265263u + seed * 2246822519u;
+    h = (h ^ (h >> 13)) * 1274126177u;
+    h = h ^ (h >> 16);
+    float noise = ((h & 0xffffu) / 65535.0f) - 0.5f;   // [-0.5, 0.5]
+    vel.x += noise * 0.2f;
+    vel.y += noise * 0.2f;
+
+    // Dye starts black; the colored bursts (or the mouse) paint the ink, so
+    // there is nothing to seed here beyond clearing to zero.
+    surf2Dwrite(vel, vel_surf, x * (int)sizeof(float2), y);
+    surf2Dwrite(make_float4(0.0f, 0.0f, 0.0f, 0.0f), dye_surf,
+                x * (int)sizeof(float4), y);
+    surf2Dwrite(0.0f, prs_surf, x * (int)sizeof(float), y);
+}
+
+// Inject mouse-drag velocity and dye into a soft radial brush around the
+// cursor. In-place read-modify-write: each thread owns its own cell, no race.
+extern "C"
+__global__
+void splat(cudaSurfaceObject_t vel_surf,
+           cudaSurfaceObject_t dye_surf,
+           int width, int height,
+           float mx, float my,
+           float fx, float fy,
+           float radius, float dr, float dg, float db,
+           int inject) {
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    if (x >= width || y >= height) return;
+    if (!inject) return;
+
+    float dx = (float)x - mx;
+    float dy = (float)y - my;
+    float d2 = dx * dx + dy * dy;
+    float falloff = expf(-d2 / (radius * radius));
+    if (falloff < 1e-3f) return;
+
+    float2 vel;
+    surf2Dread(&vel, vel_surf, x * (int)sizeof(float2), y);
+    vel.x += fx * falloff;
+    vel.y += fy * falloff;
+    surf2Dwrite(vel, vel_surf, x * (int)sizeof(float2), y);
+
+    // Additive colored ink. float4 surface element is 16 bytes.
+    float4 dye;
+    surf2Dread(&dye, dye_surf, x * (int)sizeof(float4), y);
+    dye.x += dr * falloff;
+    dye.y += dg * falloff;
+    dye.z += db * falloff;
+    dye.w = 1.0f;
+    surf2Dwrite(dye, dye_surf, x * (int)sizeof(float4), y);
+}
+
+// Semi-Lagrangian advection of the velocity field along itself.
+extern "C"
+__global__
+void advect_velocity(cudaTextureObject_t vel_tex,
+                     cudaSurfaceObject_t vel_out,
+                     int width, int height,
+                     float dt, float dissipation) {
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    if (x >= width || y >= height) return;
+
+    float2 v = sample_vec(vel_tex, (float)x, (float)y, width, height);
+    // Trace this cell's center backward along the velocity field.
+    float px = (float)x - dt * v.x;
+    float py = (float)y - dt * v.y;
+    float2 advected = sample_vec(vel_tex, px, py, width, height);
+    advected.x *= dissipation;
+    advected.y *= dissipation;
+    surf2Dwrite(advected, vel_out, x * (int)sizeof(float2), y);
+}
+
+// Vorticity confinement. Curl of a 2D velocity field is the scalar
+// w = dVy/dx - dVx/dy. Where |w| has a gradient we add a force that pushes
+// velocity along the swirl, reinjecting the small-scale rotation that
+// numerical diffusion smears away -- the result is crisper, longer-lived
+// curls. Reads neighbor velocities through the TextureObject and writes the
+// updated velocity to a SEPARATE ping-pong buffer (no read/write aliasing).
+__device__ __forceinline__
+float curl_at(cudaTextureObject_t vel_tex, float px, float py,
+              int width, int height) {
+    float2 l = sample_vec(vel_tex, px - 1.0f, py, width, height);
+    float2 r = sample_vec(vel_tex, px + 1.0f, py, width, height);
+    float2 d = sample_vec(vel_tex, px, py - 1.0f, width, height);
+    float2 u = sample_vec(vel_tex, px, py + 1.0f, width, height);
+    return 0.5f * ((r.y - l.y) - (u.x - d.x));
+}
+
+extern "C"
+__global__
+void vorticity_confinement(cudaTextureObject_t vel_tex,
+                           cudaSurfaceObject_t vel_out,
+                           int width, int height,
+                           float dt, float eps) {
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    if (x >= width || y >= height) return;
+
+    float fx = (float)x;
+    float fy = (float)y;
+
+    // Curl at this cell and at the 4 neighbors (for grad|curl|).
+    float w = curl_at(vel_tex, fx, fy, width, height);
+    float wl = curl_at(vel_tex, fx - 1.0f, fy, width, height);
+    float wr = curl_at(vel_tex, fx + 1.0f, fy, width, height);
+    float wd = curl_at(vel_tex, fx, fy - 1.0f, width, height);
+    float wu = curl_at(vel_tex, fx, fy + 1.0f, width, height);
+
+    // Gradient of |curl|, normalized to a unit direction N.
+    float gx = 0.5f * (fabsf(wr) - fabsf(wl));
+    float gy = 0.5f * (fabsf(wu) - fabsf(wd));
+    float len = sqrtf(gx * gx + gy * gy) + 1e-5f;
+    float nx = gx / len;
+    float ny = gy / len;
+
+    // Confinement force = eps * (N x w_hat). In 2D: (N_y * w, -N_x * w).
+    float2 v = sample_vec(vel_tex, fx, fy, width, height);
+    v.x += eps * dt * (ny * w);
+    v.y += eps * dt * (-nx * w);
+    surf2Dwrite(v, vel_out, x * (int)sizeof(float2), y);
+}
+
+// Divergence of the velocity field (central differences), written as a scalar.
+extern "C"
+__global__
+void divergence(cudaTextureObject_t vel_tex,
+                cudaSurfaceObject_t div_out,
+                int width, int height) {
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    if (x >= width || y >= height) return;
+
+    float2 l = sample_vec(vel_tex, (float)x - 1.0f, (float)y, width, height);
+    float2 r = sample_vec(vel_tex, (float)x + 1.0f, (float)y, width, height);
+    float2 d = sample_vec(vel_tex, (float)x, (float)y - 1.0f, width, height);
+    float2 u = sample_vec(vel_tex, (float)x, (float)y + 1.0f, width, height);
+
+    float div = 0.5f * ((r.x - l.x) + (u.y - d.y));
+    surf2Dwrite(div, div_out, x * (int)sizeof(float), y);
+}
+
+// One Jacobi iteration of lap(p) = div. With unit grid spacing the update is
+// p = (p_left + p_right + p_down + p_up - div) / 4. When `clear` is set the
+// previous pressure is treated as zero so the first pass starts clean.
+extern "C"
+__global__
+void pressure_jacobi(cudaTextureObject_t prs_tex,
+                     cudaTextureObject_t div_tex,
+                     cudaSurfaceObject_t prs_out,
+                     int width, int height,
+                     int clear) {
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    if (x >= width || y >= height) return;
+
+    float pl = 0.0f, pr = 0.0f, pd = 0.0f, pu = 0.0f;
+    if (!clear) {
+        pl = sample_scalar(prs_tex, (float)x - 1.0f, (float)y, width, height);
+        pr = sample_scalar(prs_tex, (float)x + 1.0f, (float)y, width, height);
+        pd = sample_scalar(prs_tex, (float)x, (float)y - 1.0f, width, height);
+        pu = sample_scalar(prs_tex, (float)x, (float)y + 1.0f, width, height);
+    }
+    float div = sample_scalar(div_tex, (float)x, (float)y, width, height);
+    float p = (pl + pr + pd + pu - div) * 0.25f;
+    surf2Dwrite(p, prs_out, x * (int)sizeof(float), y);
+}
+
+// v <- v - grad(p): project the velocity onto its divergence-free part.
+extern "C"
+__global__
+void subtract_gradient(cudaTextureObject_t prs_tex,
+                       cudaSurfaceObject_t vel_surf,
+                       int width, int height) {
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    if (x >= width || y >= height) return;
+
+    float pl = sample_scalar(prs_tex, (float)x - 1.0f, (float)y, width, height);
+    float pr = sample_scalar(prs_tex, (float)x + 1.0f, (float)y, width, height);
+    float pd = sample_scalar(prs_tex, (float)x, (float)y - 1.0f, width, height);
+    float pu = sample_scalar(prs_tex, (float)x, (float)y + 1.0f, width, height);
+
+    float2 v;
+    surf2Dread(&v, vel_surf, x * (int)sizeof(float2), y);
+    v.x -= 0.5f * (pr - pl);
+    v.y -= 0.5f * (pu - pd);
+    surf2Dwrite(v, vel_surf, x * (int)sizeof(float2), y);
+}
+
+// Semi-Lagrangian advection of the dye along the velocity field.
+extern "C"
+__global__
+void advect_dye(cudaTextureObject_t dye_tex,
+                cudaTextureObject_t vel_tex,
+                cudaSurfaceObject_t dye_out,
+                int width, int height,
+                float dt, float dissipation) {
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    if (x >= width || y >= height) return;
+
+    float2 v = sample_vec(vel_tex, (float)x, (float)y, width, height);
+    float px = (float)x - dt * v.x;
+    float py = (float)y - dt * v.y;
+    float4 d = sample_color(dye_tex, px, py, width, height);
+    d.x *= dissipation;
+    d.y *= dissipation;
+    d.z *= dissipation;
+    d.w *= dissipation;
+    surf2Dwrite(d, dye_out, x * (int)sizeof(float4), y);
+}
+
+// Tonemap the accumulated float4 dye color into the PBO. The ink color is
+// whatever the bursts/mouse injected and advection mixed; we apply a filmic
+// 1 - exp(-c) curve so dense ink stays vivid without harshly clipping.
+extern "C"
+__global__
+void colorize(cudaTextureObject_t dye_tex,
+              unsigned char* output,
+              int width, int height) {
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    if (x >= width || y >= height) return;
+
+    float4 c = sample_color(dye_tex, (float)x, (float)y, width, height);
+    const float gain = 1.3f;
+    float r = 1.0f - expf(-fmaxf(c.x, 0.0f) * gain);
+    float g = 1.0f - expf(-fmaxf(c.y, 0.0f) * gain);
+    float b = 1.0f - expf(-fmaxf(c.z, 0.0f) * gain);
+
+    int idx = (y * width + x) * 4;
+    output[idx + 0] = (unsigned char)(r * 255.0f);
+    output[idx + 1] = (unsigned char)(g * 255.0f);
+    output[idx + 2] = (unsigned char)(b * 255.0f);
+    output[idx + 3] = 255;
+}
+"""
+
+# GLSL shaders -- these just display a texture on a fullscreen rectangle.
+# Nothing CUDA-specific here.
+
+VERTEX_SHADER_SOURCE = """#version 330 core
+in vec2 position;
+in vec2 texcoord;
+out vec2 v_texcoord;
+void main() {
+    gl_Position = vec4(position, 0.0, 1.0);
+    v_texcoord = texcoord;
+}
+"""
+
+FRAGMENT_SHADER_SOURCE = """#version 330 core
+in vec2 v_texcoord;
+out vec4 fragColor;
+uniform sampler2D tex;
+void main() {
+    fragColor = texture(tex, v_texcoord);
+}
+"""
+
+
+if __name__ == "__main__":
+    main()
diff --git a/cuda_core/examples/gl_interop_jfa_voronoi.py b/cuda_core/examples/gl_interop_jfa_voronoi.py
new file mode 100644
index 00000000000..bd9bead75f4
--- /dev/null
+++ b/cuda_core/examples/gl_interop_jfa_voronoi.py
@@ -0,0 +1,940 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# ################################################################################
+#
+# This example demonstrates cuda.core.CUDAArray, TextureObject, and SurfaceObject
+# in combination with GraphicsResource for CUDA/OpenGL interop. A Voronoi diagram
+# is computed every frame with the Jump Flood Algorithm (JFA): a float2 "nearest
+# seed" map is ping-ponged between two CUDA arrays across log2(N) passes. Each
+# pass reads the previous map through a POINT-filtered TextureObject (exact texel
+# reads -- no interpolation) and writes the refined map through a SurfaceObject.
+# The final nearest-seed map is colorized straight into an OpenGL PBO as neon
+# Voronoi cells or glowing metaballs. Seeds drift continuously so it animates.
+# Requires pyglet.
+#
+# ################################################################################
+
+# What this example teaches
+# =========================
+# - How to allocate a CUDA CUDAArray with `is_surface_load_store=True` so the same
+#   memory can be bound as both a TextureObject (for sampled reads) and a
+#   SurfaceObject (for typed writes).
+# - How to use FilterMode.POINT + AddressMode.BORDER + border_color +
+#   non-normalized coordinates to get EXACT texel reads with a clean
+#   "off-grid = no seed" sentinel. JFA fundamentally requires reading the
+#   precise value stored at an integer neighbor offset -- bilinear interpolation
+#   between two different seed coordinates would be meaningless. This is the
+#   deliberate inverse of the reaction-diffusion example's LINEAR/WRAP/normalized
+#   choice.
+#   API MAP: FilterMode.POINT -> exact texel reads (JFA needs no interpolation);
+#   AddressMode.BORDER + border_color -> off-grid neighbor fetches return a
+#   "no seed" sentinel instead of CLAMP-replicating an edge seed.
+# - How varying the read offset (the JFA "step") each pass, combined with
+#   ping-pong surface writes, propagates seed information across the whole image
+#   in O(log N) passes instead of O(N).
+# - How to compose CUDAArray/TextureObject/SurfaceObject with GraphicsResource so
+#   the entire pipeline never leaves the GPU.
+#
+# How it works
+# ============
+# The Jump Flood Algorithm computes, for every pixel, the coordinate of its
+# nearest seed. We store that coordinate in a `float2` map (channel 0 = seed x,
+# channel 1 = seed y), using the sentinel (-1, -1) for "no seed known yet".
+#
+#   1. seed_clear   -- fill the whole map with the sentinel.
+#   2. seed_splat   -- for each seed, write its own (x, y) into the cell it
+#                      occupies. One tiny 1-thread launch per seed (seeds live
+#                      in a host numpy array and are passed as scalar params;
+#                      see "Why splat seeds as scalars" below).
+#   3. jfa_step     -- the heart of the algorithm. With the current step size s
+#                      (s = K, K/2, ..., 1), every pixel examines itself and its
+#                      8 neighbors at offset +/- s. Among all non-sentinel seed
+#                      coordinates found, it keeps the one closest to this pixel
+#                      and writes it out. Run once per step size, ping-ponging
+#                      the two arrays each pass.
+#   4. colorize     -- read the final nearest-seed map and write RGBA bytes
+#                      into the OpenGL PBO.
+#
+#   PING-PONG over JFA passes (two arrays, swap each pass)
+#   ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#   +--------------+  tex2D<float2>   +--------------+
+#   |   arr_read   | ---------------> |              |
+#   | nearest-seed |  (POINT, exact   |  jfa_step    |
+#   |     map      |   texel reads at |   (step s)   |
+#   +--------------+   +/- step)      |              |
+#                                     |              |
+#   +--------------+  surf2Dwrite     |              |
+#   |   arr_write  | <--------------- |              |
+#   | nearest-seed |                  +--------------+
+#   |     map      |
+#   +--------------+
+#       (swap, halve step)
+#
+# The step schedule starts at K = next power of two >= max(W, H) / 2 and halves
+# down to 1, giving log2(K) + 1 passes. Because we ping-pong every pass, the
+# final result lands in whichever array was written last; we track that
+# explicitly (see the loop in run_jfa) rather than assuming it is a fixed array.
+# The full JFA is re-run from scratch every frame because the seeds move.
+#
+# Why POINT + BORDER + border_color + non-normalized coords?
+# -----------------------------------------------------------
+# JFA reads the exact seed coordinate stored at a specific integer neighbor.
+# LINEAR filtering would blend two stored coordinates into a meaningless
+# average, so we use FilterMode.POINT. For the addressing mode we use BORDER
+# with an explicit border_color equal to the map's "no seed" sentinel
+# (-1, -1). The earlier version used CLAMP, but CLAMP makes an off-edge
+# neighbor lookup silently return the *edge* texel's real seed coordinate; that
+# can make a border pixel pick a seed that is not actually its nearest one.
+# BORDER instead returns the sentinel for any out-of-range fetch, which the
+# kernel ignores -- the correct "there is no neighbor here" answer. (WRAP and
+# MIRROR are the only address modes that require normalized coordinates; BORDER
+# and CLAMP work with non-normalized coords, so we keep the integer-style
+# sampling.) With non-normalized coordinates a texel at integer (nx, ny) is read
+# at `tex2D<float2>(tex, nx + 0.5f, ny + 0.5f)` -- the +0.5 lands on the texel
+# center. This is intentionally the opposite of the LINEAR/WRAP/normalized
+# choice used by the reaction-diffusion example.
+#
+# Why splat seeds as scalars (no device buffer)?
+# ----------------------------------------------
+# Seeds live in a host numpy array and drift via sin/cos on the CPU each frame.
+# Rather than allocating a device buffer, we pass each seed's position to a tiny
+# 1-thread `seed_splat` kernel as float scalars. With only tens of seeds this is
+# a handful of trivial launches per frame. Note the seed *list* is only needed
+# for splatting: colorize and the cell-border test read seed coordinates back
+# out of the JFA map, never from the host list.
+#
+# Channel byte width in surf2Dwrite
+# ---------------------------------
+# `surf2Dwrite` takes the x coordinate in BYTES, not in elements. For a
+# `float2` surface that means `x * sizeof(float2)` = `x * 8`. Getting this
+# wrong silently corrupts every other column.
+#
+# What you should see
+# ===================
+# A window of animated, drifting Voronoi cells (smooth vivid per-cell neon
+# colors with glowing seams) or shimmering metaball-style blobs.
+# Press M to toggle between the two modes, +/- to change the seed count,
+# R to reseed, and Escape to exit. The window title shows the mode, seed
+# count, resolution, and FPS.
+#
+
+# /// script
+# dependencies = ["cuda_bindings", "cuda_core>0.6.0", "pyglet"]
+# ///
+
+import ctypes
+import math
+import sys
+import time
+
+import numpy as np
+
+from cuda.core import (
+    AddressMode,
+    ArrayFormat,
+    CUDAArray,
+    Device,
+    FilterMode,
+    GraphicsResource,
+    LaunchConfig,
+    Program,
+    ProgramOptions,
+    ReadMode,
+    ResourceDescriptor,
+    SurfaceObject,
+    TextureDescriptor,
+    TextureObject,
+    launch,
+)
+
+# ---------------------------------------------------------------------------
+# Parameters (feel free to change these)
+# ---------------------------------------------------------------------------
+WIDTH = 512  # window, PBO, and nearest-seed map width in pixels
+HEIGHT = 512  # window, PBO, and nearest-seed map height in pixels
+MAX_SEEDS = 64  # upper bound on the seed count (host array is sized for this)
+DEFAULT_SEEDS = 16  # seed count at startup
+MIN_SEEDS = 2  # lower bound enforced when "-" is pressed
+
+# Visual modes for the colorize kernel. The integer value is passed to the
+# kernel; the label is shown in the window caption.
+MODE_VORONOI = 0
+MODE_METABALL = 1
+MODE_LABELS = {MODE_VORONOI: "voronoi", MODE_METABALL: "metaball"}
+
+
+def jfa_steps(width, height):
+    """Return the JFA step schedule: K, K/2, ..., 1.
+
+    K is the smallest power of two with 2 * K >= max(width, height), minimum 1,
+    so the steps sum to 2K - 1 >= max(width, height) - 1 even for odd sizes.
+    """
+    longest = max(width, height)
+    step = 1
+    while step * 2 < longest:  # not `longest // 2`: that under-covers sizes 2^k + 1
+        step *= 2
+    steps = []
+    while step >= 1:
+        steps.append(step)
+        step //= 2
+    return steps
+
+
+# ============================= Helper functions =============================
+#
+# The functions below set up CUDA and OpenGL. If you're here to learn about
+# CUDAArray/TextureObject/SurfaceObject, skip ahead to main() -- the interesting
+# part is there. These helpers exist so that main() reads like a short story
+# instead of a wall of boilerplate.
+# ============================================================================
+
+
+def setup_cuda():
+    """Compile the CUDA kernels and return (device, stream, kernels, configs).
+
+    `kernels` and `configs` are dicts keyed by kernel name.
+    """
+    dev = Device(0)
+    dev.set_current()
+
+    # Surface load/store itself dates back to SM 2.0, but the bindless surface
+    # objects used here (cuSurfObjectCreate) need SM 3.0 or newer.
+    cc = dev.compute_capability
+    if cc.major < 3:
+        print(
+            "This example requires a GPU with compute capability >= 3.0 for "
+            f"bindless surface objects. Found sm_{cc.major}{cc.minor}.",
+            file=sys.stderr,
+        )
+        sys.exit(1)
+
+    stream = dev.create_stream()
+
+    # Build as C++ so that the templated tex2D<float2> overload resolves.
+    kernel_names = ("seed_clear", "seed_splat", "jfa_step", "colorize")
+    options = ProgramOptions(std="c++17", arch=f"sm_{dev.arch}")
+    module = Program(KERNEL_SOURCE, code_type="c++", options=options).compile(
+        "cubin",
+        name_expressions=kernel_names,
+    )
+    kernels = {name: module.get_kernel(name) for name in kernel_names}
+
+    # Pixel-parallel launches: 16x16 thread blocks, with the grid rounded up so
+    # that it covers the whole WIDTH x HEIGHT image.
+    threads = (16, 16, 1)
+    blocks = (
+        (WIDTH + threads[0] - 1) // threads[0],
+        (HEIGHT + threads[1] - 1) // threads[1],
+        1,
+    )
+    per_pixel = LaunchConfig(grid=blocks, block=threads)
+    # seed_splat writes a single cell, so it gets a one-block, one-thread launch.
+    per_seed = LaunchConfig(grid=(1, 1, 1), block=(1, 1, 1))
+
+    # seed_clear, jfa_step, and colorize all share the per-pixel config.
+    configs = {
+        "seed_clear": per_pixel,
+        "jfa_step": per_pixel,
+        "colorize": per_pixel,
+        "seed_splat": per_seed,
+    }
+
+    return dev, stream, kernels, configs
+
+
+def create_window():
+    """Open a WIDTH x HEIGHT pyglet window.
+
+    Returns (window, gl_module, pyglet). Exits with status 1 when pyglet
+    cannot be imported.
+    """
+    # Imported here so a missing pyglet yields an install hint, not a traceback.
+    try:
+        import pyglet
+        from pyglet.gl import gl as _gl
+    except ImportError:
+        message = "This example requires pyglet >= 2.0.\nInstall it with:  pip install pyglet"
+        print(message, file=sys.stderr)
+        sys.exit(1)
+
+    # Initial title; main() later overwrites it with mode/seed/FPS stats.
+    caption = "cuda.core CUDAArray/Texture/Surface - JFA Voronoi"
+    window = pyglet.window.Window(WIDTH, HEIGHT, caption=caption, vsync=False)
+    return window, _gl, pyglet
+
+
+def create_display_resources(gl, width, height):
+    """Create the GL objects needed to show a texture on screen.
+
+    Builds a shader program, a fullscreen quad (VAO + VBO), and an empty
+    `width` x `height` RGBA8 texture. None of this is CUDA-specific -- it is
+    standard OpenGL boilerplate for rendering a textured quad.
+
+    Returns (shader_program, vertex_array_id, texture_id). The shader_program
+    is a pyglet ShaderProgram object (must be kept alive).
+    """
+    from pyglet.graphics.shader import Shader, ShaderProgram
+
+    # Shader program -- just passes texture coordinates through
+    vert = Shader(VERTEX_SHADER_SOURCE, "vertex")
+    frag = Shader(FRAGMENT_SHADER_SOURCE, "fragment")
+    shader_prog = ShaderProgram(vert, frag)
+
+    # Fullscreen quad (two triangles covering the entire window)
+    quad_verts = np.array(
+        [
+            # Flattened (x, y, s, t) per vertex: position + texture coordinate.
+            -1,  # triangle 1: bottom-left
+            -1,
+            0,
+            0,
+            1,  # triangle 1: bottom-right
+            -1,
+            1,
+            0,
+            1,  # triangle 1: top-right
+            1,
+            1,
+            1,
+            -1,  # triangle 2: bottom-left
+            -1,
+            0,
+            0,
+            1,  # triangle 2: top-right
+            1,
+            1,
+            1,
+            -1,  # triangle 2: top-left
+            1,
+            0,
+            1,
+        ],
+        dtype=np.float32,
+    )
+
+    vao = ctypes.c_uint(0)
+    gl.glGenVertexArrays(1, ctypes.byref(vao))
+    gl.glBindVertexArray(vao.value)
+
+    vbo = ctypes.c_uint(0)
+    gl.glGenBuffers(1, ctypes.byref(vbo))
+    gl.glBindBuffer(gl.GL_ARRAY_BUFFER, vbo.value)
+    gl.glBufferData(
+        gl.GL_ARRAY_BUFFER,
+        quad_verts.nbytes,
+        quad_verts.ctypes.data_as(ctypes.c_void_p),
+        gl.GL_STATIC_DRAW,
+    )
+
+    stride = 4 * 4  # 4 floats * 4 bytes each = 16 bytes per vertex
+    pos_loc = gl.glGetAttribLocation(shader_prog.id, b"position")
+    gl.glEnableVertexAttribArray(pos_loc)
+    gl.glVertexAttribPointer(pos_loc, 2, gl.GL_FLOAT, gl.GL_FALSE, stride, ctypes.c_void_p(0))
+
+    tc_loc = gl.glGetAttribLocation(shader_prog.id, b"texcoord")
+    gl.glEnableVertexAttribArray(tc_loc)
+    gl.glVertexAttribPointer(tc_loc, 2, gl.GL_FLOAT, gl.GL_FALSE, stride, ctypes.c_void_p(8))
+
+    gl.glBindVertexArray(0)
+
+    # Empty texture (storage only; filled each frame from the PBO)
+    tex = ctypes.c_uint(0)
+    gl.glGenTextures(1, ctypes.byref(tex))
+    gl.glBindTexture(gl.GL_TEXTURE_2D, tex.value)
+    gl.glTexParameteri(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_MIN_FILTER, gl.GL_LINEAR)
+    gl.glTexParameteri(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_MAG_FILTER, gl.GL_LINEAR)
+    gl.glTexImage2D(
+        gl.GL_TEXTURE_2D,
+        0,
+        gl.GL_RGBA8,
+        width,
+        height,
+        0,
+        gl.GL_RGBA,
+        gl.GL_UNSIGNED_BYTE,
+        None,
+    )
+
+    return shader_prog, vao.value, tex.value
+
+
+def create_pixel_buffer(gl, width, height):
+    """Allocate the Pixel Buffer Object (PBO) shared between CUDA and OpenGL.
+
+    OpenGL sources texture uploads from this GPU-side buffer; once the same
+    buffer is registered with CUDA, a kernel can write pixels straight into it.
+
+    Returns (pbo_gl_name, size_in_bytes), sized for RGBA at 1 byte per channel.
+    """
+    target = gl.GL_PIXEL_UNPACK_BUFFER
+    name = ctypes.c_uint(0)
+    gl.glGenBuffers(1, ctypes.byref(name))
+    gl.glBindBuffer(target, name.value)
+    size_in_bytes = width * height * 4
+    gl.glBufferData(target, size_in_bytes, None, gl.GL_DYNAMIC_DRAW)
+    gl.glBindBuffer(target, 0)
+    return name.value, size_in_bytes
+
+
+def copy_pbo_to_texture(gl, pbo_id, tex_id, width, height):
+    """Copy pixel data from the PBO into the GL texture (GPU-to-GPU).
+
+    The PBO is unbound from GL_PIXEL_UNPACK_BUFFER again before returning.
+    """
+    unpack = gl.GL_PIXEL_UNPACK_BUFFER
+    target = gl.GL_TEXTURE_2D
+    level = 0
+    x_offset, y_offset = 0, 0
+
+    gl.glBindBuffer(unpack, pbo_id)
+    gl.glBindTexture(target, tex_id)
+    # Passing None as the pixel pointer makes GL read from the currently bound
+    # PBO instead of from CPU memory.
+    gl.glTexSubImage2D(target, level, x_offset, y_offset, width, height, gl.GL_RGBA, gl.GL_UNSIGNED_BYTE, None)
+    gl.glBindBuffer(unpack, 0)
+
+
+def draw_fullscreen_quad(gl, shader_prog, vao_id, tex_id):
+    """Draw `tex_id` on the fullscreen quad, then unbind the VAO and the program."""
+    gl.glUseProgram(shader_prog.id)
+    gl.glBindTexture(gl.GL_TEXTURE_2D, tex_id)
+    gl.glBindVertexArray(vao_id)
+    gl.glDrawArrays(gl.GL_TRIANGLES, 0, 6)  # 6 vertices = the quad's two triangles
+    gl.glBindVertexArray(0)
+    gl.glUseProgram(0)
+
+
+def make_state_arrays():
+    """Allocate the two `float2` ping-pong arrays that hold the nearest-seed map."""
+    # Two FLOAT32 channels per texel form the float2 (seed x, seed y) pair;
+    # is_surface_load_store=True lets each array back a SurfaceObject as well
+    # as a TextureObject.
+    descriptor = {
+        "shape": (WIDTH, HEIGHT),
+        "format": ArrayFormat.FLOAT32,
+        "num_channels": 2,
+        "is_surface_load_store": True,
+    }
+
+    arr_a = CUDAArray.from_descriptor(**descriptor)
+    arr_b = CUDAArray.from_descriptor(**descriptor)
+    return arr_a, arr_b
+
+
+def make_texture(arr):
+    """Wrap `arr` in a TextureObject set up for POINT + BORDER + non-normalized reads.
+
+    API MAP:
+      FilterMode.POINT            -> exact texel reads (JFA needs no interpolation)
+      AddressMode.BORDER          -> off-grid neighbor fetches return border_color
+      border_color (sentinel)     -> a "no seed" value the kernel ignores, instead
+                                     of CLAMP-replicating a real edge seed
+
+    Filtering: JFA needs the exact seed coordinate stored at an integer neighbor
+    offset, so POINT filtering (no interpolation) is used.
+
+    Addressing: BORDER is paired with a border_color equal to the "no seed"
+    sentinel used for empty cells (x = -1). A neighbor lookup that falls off
+    the grid therefore yields the sentinel and the kernel skips it. CLAMP would
+    instead replicate the edge texel's seed, which can pull a border pixel
+    toward a seed that is not actually its nearest one.
+
+    Coordinates: BORDER works with non-normalized coordinates (only WRAP and
+    MIRROR require normalized ones), so the kernels keep their integer-style
+    `(nx + 0.5)` sampling.
+
+    border_color is a 4-tuple because the descriptor always carries four
+    channels; a float2 read consumes channels 0-1, so (-1, -1) lands in
+    (.x, .y) and the trailing (0, 0) is unused.
+    """
+    resource = ResourceDescriptor.from_array(arr)
+    sampling = TextureDescriptor(
+        filter_mode=FilterMode.POINT,
+        address_mode=AddressMode.BORDER,
+        border_color=(-1.0, -1.0, 0.0, 0.0),
+        normalized_coords=False,
+        read_mode=ReadMode.ELEMENT_TYPE,
+    )
+    return TextureObject.from_descriptor(resource=resource, texture_descriptor=sampling)
+
+
+def make_seeds(count):
+    """Return random orbit parameters for MAX_SEEDS seeds, `count` of them active."""
+    rng = np.random.default_rng()
+
+    def draw(low, high):
+        """Sample MAX_SEEDS float32 values uniformly from [low, high)."""
+        return rng.uniform(low, high, MAX_SEEDS).astype(np.float32)
+
+    return {
+        "base_x": draw(0.2, 0.8) * WIDTH,
+        "base_y": draw(0.2, 0.8) * HEIGHT,
+        "radius": draw(0.05, 0.25) * min(WIDTH, HEIGHT),
+        "phase": draw(0.0, 2.0 * math.pi),
+        "speed": draw(0.3, 1.2),
+        "count": count,
+    }
+
+
+def seed_positions(seeds, t):
+    """Return (xs, ys): where each active seed sits at time `t`.
+
+    Every seed travels around a circle centered on its base position, so the
+    Voronoi diagram animates smoothly. The float32 results are clamped to
+    [0, WIDTH - 1] x [0, HEIGHT - 1].
+    """
+    active = slice(seeds["count"])
+    angle = seeds["phase"][active] + seeds["speed"][active] * t
+    radius = seeds["radius"][active]
+    xs = np.clip(seeds["base_x"][active] + radius * np.cos(angle), 0.0, WIDTH - 1.0)
+    ys = np.clip(seeds["base_y"][active] + radius * np.sin(angle), 0.0, HEIGHT - 1.0)
+    return xs.astype(np.float32), ys.astype(np.float32)
+
+
+def run_jfa(stream, kernels, configs, seeds, t, tex_a, tex_b, surf_a, surf_b):
+    """Recompute the nearest-seed map for the seed positions at time `t`.
+
+    arr_a is reset to the sentinel and seeded through surf_a, after which the
+    step loop alternates between reading tex_a / writing surf_b and reading
+    tex_b / writing surf_a.
+
+    Returns the TextureObject over the array written by the last pass; that
+    array holds the final nearest-seed map for colorize.
+    """
+    dims = (np.int32(WIDTH), np.int32(HEIGHT))
+
+    # 1. Reset every cell of arr_a to the sentinel (-1, -1).
+    launch(
+        stream,
+        configs["seed_clear"],
+        kernels["seed_clear"],
+        np.uint64(surf_a.handle),
+        *dims,
+    )
+
+    # 2. Write each seed's own coordinate into arr_a, one 1-thread launch per seed.
+    xs, ys = seed_positions(seeds, t)
+    for sx, sy in zip(xs, ys):
+        launch(
+            stream,
+            configs["seed_splat"],
+            kernels["seed_splat"],
+            np.uint64(surf_a.handle),
+            *dims,
+            np.float32(sx),
+            np.float32(sy),
+        )
+
+    # 3. JFA passes. Even passes read arr_a and write arr_b; odd passes do the
+    #    reverse. Entries are (texture read, surface written, texture of result).
+    schedule = (
+        (tex_a, surf_b, tex_b),
+        (tex_b, surf_a, tex_a),
+    )
+    final_tex = tex_a  # if no pass runs, arr_a already holds the result
+    for index, step in enumerate(jfa_steps(WIDTH, HEIGHT)):
+        read_tex, write_surf, final_tex = schedule[index % 2]
+        launch(
+            stream,
+            configs["jfa_step"],
+            kernels["jfa_step"],
+            np.uint64(read_tex.handle),
+            np.uint64(write_surf.handle),
+            *dims,
+            np.int32(step),
+        )
+    return final_tex
+
+
+# ================================== main() ==================================
+
+
+def main():
+    """Set up CUDA and OpenGL, register the pyglet handlers, and run the render loop."""
+    # --- Step 1: Set up CUDA (compile kernels, create stream) ---
+    dev, stream, kernels, configs = setup_cuda()
+
+    # --- Step 2: Open a window ---
+    window, gl, pyglet = create_window()
+
+    # --- Step 3: Create GL resources for drawing a texture to screen ---
+    shader_prog, quad_vao, tex_id = create_display_resources(gl, WIDTH, HEIGHT)
+
+    # --- Step 4: Create the Pixel Buffer Object (PBO) ---
+    pbo_id, _ = create_pixel_buffer(gl, WIDTH, HEIGHT)
+
+    # --- Step 5: Register the PBO with CUDA ---
+    resource = GraphicsResource.from_gl_buffer(pbo_id, flags="write_discard")
+
+    # --- Step 6: Allocate the two ping-pong nearest-seed map arrays: `float2` ---
+    #     (seed x, seed y), is_surface_load_store=True so SurfaceObjects can bind.
+    arr_a, arr_b = make_state_arrays()
+
+    # --- Step 7: Pre-create the four bindless handles (once, kept alive) ---
+    tex_a = make_texture(arr_a)
+    tex_b = make_texture(arr_b)
+    surf_a = SurfaceObject.from_array(arr_a)
+    surf_b = SurfaceObject.from_array(arr_b)
+
+    # --- Step 8: Initialize seeds and view state ---
+    state = {"mode": MODE_VORONOI, "seeds": make_seeds(DEFAULT_SEEDS)}
+
+    # --- Step 9: Render loop ---
+    start_time = time.monotonic()
+    frame_count = 0
+    fps_time = start_time
+
+    @window.event
+    def on_key_press(symbol, _modifiers):
+        key = pyglet.window.key
+        if symbol == key.ESCAPE:
+            window.close()
+            return
+        if symbol == key.M:
+            state["mode"] = MODE_METABALL if state["mode"] == MODE_VORONOI else MODE_VORONOI
+            return
+        if symbol == key.R:
+            state["seeds"] = make_seeds(state["seeds"]["count"])
+            return
+        if symbol in (key.PLUS, key.EQUAL, key.NUM_ADD):
+            new_count = min(MAX_SEEDS, state["seeds"]["count"] + 1)
+            if new_count != state["seeds"]["count"]:
+                state["seeds"] = make_seeds(new_count)
+            return
+        if symbol in (key.MINUS, key.NUM_SUBTRACT):
+            new_count = max(MIN_SEEDS, state["seeds"]["count"] - 1)
+            if new_count != state["seeds"]["count"]:
+                state["seeds"] = make_seeds(new_count)
+            return
+
+    @window.event
+    def on_draw():
+        nonlocal frame_count, fps_time
+
+        window.clear()
+        t = time.monotonic() - start_time
+
+        # (a) Run the full Jump Flood Algorithm for the current seed positions.
+        #     final_tex is the TextureObject over the array written last.
+        final_tex = run_jfa(stream, kernels, configs, state["seeds"], t, tex_a, tex_b, surf_a, surf_b)
+
+        # (b) Colorize the nearest-seed map into the OpenGL PBO.
+        with resource.map(stream=stream) as buf:
+            launch(
+                stream,
+                configs["colorize"],
+                kernels["colorize"],
+                np.uint64(final_tex.handle),
+                buf.handle,
+                np.int32(WIDTH),
+                np.int32(HEIGHT),
+                np.int32(state["mode"]),
+                np.float32(t),
+            )
+        # Unmap happens automatically when the `with` block exits.
+
+        # (c) Tell OpenGL to copy the PBO contents into our texture.
+        copy_pbo_to_texture(gl, pbo_id, tex_id, WIDTH, HEIGHT)
+
+        # (d) Draw the texture to the screen.
+        draw_fullscreen_quad(gl, shader_prog, quad_vao, tex_id)
+
+        # FPS counter (window title is refreshed once at least 1 s has elapsed)
+        frame_count += 1
+        now = time.monotonic()
+        if now - fps_time >= 1.0:
+            fps = frame_count / (now - fps_time)
+            label = MODE_LABELS[state["mode"]]
+            window.set_caption(
+                "cuda.core JFA Voronoi"
+                " | TextureObject[POINT|BORDER|border_color] float2 + SurfaceObject"
+                f" | mode={label} | {state['seeds']['count']} seeds"
+                f" | {WIDTH}x{HEIGHT} | {fps:.0f} FPS"
+            )
+            frame_count = 0
+            fps_time = now
+
+    @window.event
+    def on_close():
+        # Release the GL registration, then textures/surfaces before their arrays, stream last.
+        resource.close()
+        tex_a.close()
+        tex_b.close()
+        surf_a.close()
+        surf_b.close()
+        arr_a.close()
+        arr_b.close()
+        stream.close()
+
+    pyglet.app.run(interval=0)
+
+
+# ======================== GPU code (CUDA + GLSL) ============================
+#
+# These source strings are kept at the bottom of the file so they don't
+# distract from the Python logic above. KERNEL_SOURCE contains four CUDA C++
+# kernels:
+#
+#   * seed_clear -- fills the map with the sentinel (-1, -1) via surface writes.
+#   * seed_splat -- writes one seed's own coordinate into the cell it occupies.
+#   * jfa_step   -- reads the previous map via a POINT-filtered, BORDER-addressed
+#                   TextureObject at +/- step offsets and writes the refined
+#                   nearest-seed map via a SurfaceObject. Off-grid fetches return
+#                   the sentinel border_color. Coordinates are non-normalized.
+#   * colorize   -- reads the final nearest-seed map and writes RGBA bytes into
+#                   the OpenGL PBO, either as smooth neon Voronoi cells with
+#                   glowing borders (mode 0) or glowing metaballs (mode 1).
+#
+# VERTEX_SHADER_SOURCE / FRAGMENT_SHADER_SOURCE are GLSL that draw a texture on
+# a fullscreen rectangle. Nothing interesting.
+# ============================================================================
+
+KERNEL_SOURCE = r"""
+// The nearest-seed map is a float2 per texel: (.x, .y) = coordinate of the
+// nearest known seed, or the sentinel (-1, -1) for "none yet". With POINT
+// filtering + non-normalized coords, texel (ix, iy) is read at
+// tex2D<float2>(tex, ix + 0.5f, iy + 0.5f). The texture is BORDER-addressed
+// with border_color == the sentinel, so a fetch with an out-of-range coord
+// also returns (-1, -1) and is rejected by is_seed() -- the same path as an
+// empty interior cell.
+
+#define SENTINEL_X (-1.0f)
+
+__device__ __forceinline__ bool is_seed(float2 s) {
+    // Any non-negative x marks a valid stored seed coordinate.
+    return s.x >= 0.0f;
+}
+
+// Fully-saturated HSV->RGB, hue/value driven by hash, returns vivid neon RGB.
+__device__ __forceinline__ void hsv_to_rgb(float hue, float sat, float val,
+                                           float* r, float* g, float* b) {
+    hue -= floorf(hue);            // wrap hue into [0, 1)
+    float h6 = hue * 6.0f;
+    float c = val * sat;
+    float x = c * (1.0f - fabsf(fmodf(h6, 2.0f) - 1.0f));
+    float m = val - c;
+    float rr, gg, bb;
+    if (h6 < 1.0f)      { rr = c; gg = x; bb = 0.0f; }
+    else if (h6 < 2.0f) { rr = x; gg = c; bb = 0.0f; }
+    else if (h6 < 3.0f) { rr = 0.0f; gg = c; bb = x; }
+    else if (h6 < 4.0f) { rr = 0.0f; gg = x; bb = c; }
+    else if (h6 < 5.0f) { rr = x; gg = 0.0f; bb = c; }
+    else                { rr = c; gg = 0.0f; bb = x; }
+    *r = rr + m; *g = gg + m; *b = bb + m;
+}
+
+// Hash a seed coordinate into a smooth, vivid per-cell neon color. The hash
+// drives a hue around the full color wheel; saturation/value stay high so
+// neighboring cells read as distinct saturated hues rather than muddy bytes.
+__device__ __forceinline__ void seed_color(float sx, float sy,
+                                           float* r, float* g, float* b) {
+    unsigned int h = (unsigned int)(sx + 0.5f) * 374761393u +
+                     (unsigned int)(sy + 0.5f) * 668265263u;
+    h = (h ^ (h >> 13)) * 1274126177u;
+    h = h ^ (h >> 16);
+    float hue = (h & 0xffffu) / 65535.0f;
+    // A little value jitter from the high bits keeps equal-hue cells separable.
+    float val = 0.85f + 0.15f * (((h >> 16) & 0xffu) / 255.0f);
+    hsv_to_rgb(hue, 0.92f, val, r, g, b);
+}
+
+extern "C"
+__global__
+void seed_clear(cudaSurfaceObject_t surf, int width, int height) {
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    if (x >= width || y >= height) return;
+    // float2 is 8 bytes; surf2Dwrite takes the x offset in BYTES.
+    surf2Dwrite(make_float2(SENTINEL_X, SENTINEL_X), surf,
+                x * (int)sizeof(float2), y);
+}
+
+extern "C"
+__global__
+void seed_splat(cudaSurfaceObject_t surf, int width, int height,
+                float sx, float sy) {
+    // Single-thread launch: write this seed's own coordinate into its cell.
+    int ix = (int)(sx + 0.5f);
+    int iy = (int)(sy + 0.5f);
+    if (ix < 0) ix = 0;
+    if (ix >= width) ix = width - 1;
+    if (iy < 0) iy = 0;
+    if (iy >= height) iy = height - 1;
+    surf2Dwrite(make_float2(sx, sy), surf, ix * (int)sizeof(float2), iy);
+}
+
+extern "C"
+__global__
+void jfa_step(cudaTextureObject_t tex, cudaSurfaceObject_t surf,
+              int width, int height, int step) {
+    const int x = blockIdx.x * blockDim.x + threadIdx.x;
+    const int y = blockIdx.y * blockDim.y + threadIdx.y;
+    if (x >= width || y >= height) return;
+
+    // This pixel's own position; candidate seeds are ranked by squared
+    // distance to it.
+    const float self_x = (float)x;
+    const float self_y = (float)y;
+
+    // Running winner. Starts as the "no seed" sentinel with a huge distance
+    // (~FLT_MAX), so a real seed at any realistic distance beats it.
+    float2 winner = make_float2(SENTINEL_X, SENTINEL_X);
+    float winner_d2 = 3.0e38f;
+
+    // Visit the 3x3 stencil of offsets {-step, 0, +step}, which includes the
+    // pixel itself. Neighbor coordinates are intentionally NOT clamped: an
+    // off-grid fetch is meant to reach the texture's BORDER addressing and
+    // come back as the (-1, -1) border color, which is_seed() rejects just
+    // like an empty interior cell. Clamping would instead hand back the edge
+    // texel's real seed, which could wrongly win for a border pixel.
+    #pragma unroll
+    for (int oy = -1; oy <= 1; ++oy) {
+        #pragma unroll
+        for (int ox = -1; ox <= 1; ++ox) {
+            const float u = (float)(x + ox * step) + 0.5f;
+            const float v = (float)(y + oy * step) + 0.5f;
+
+            const float2 cand = tex2D<float2>(tex, u, v);
+            if (!is_seed(cand)) continue;
+            const float off_x = cand.x - self_x;
+            const float off_y = cand.y - self_y;
+            const float cand_d2 = off_x * off_x + off_y * off_y;
+            if (cand_d2 < winner_d2) {
+                winner_d2 = cand_d2;
+                winner = cand;
+            }
+        }
+    }
+
+    surf2Dwrite(winner, surf, x * (int)sizeof(float2), y);
+}
+
+extern "C"
+__global__
+void colorize(cudaTextureObject_t tex, unsigned char* output,
+              int width, int height, int mode, float t) {
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    if (x >= width || y >= height) return;
+
+    float2 c = tex2D<float2>(tex, (float)x + 0.5f, (float)y + 0.5f);  // value stored for this pixel
+
+    float r = 0.0f, g = 0.0f, b = 0.0f;  // stays black when is_seed(c) is false
+
+    if (is_seed(c)) {
+        float dx = c.x - (float)x;
+        float dy = c.y - (float)y;
+        float dist = sqrtf(dx * dx + dy * dy);  // distance from this pixel to the stored seed
+
+        if (mode == 0) {
+            // --- Voronoi cells: smooth neon color + glowing cell borders. ---
+            seed_color(c.x, c.y, &r, &g, &b);
+
+            // Border proximity: count how many 8-neighbors belong to a different
+            // cell. A pixel deep inside a cell sees 0; a pixel right on the edge
+            // sees several. We use this as a smooth edge factor rather than a
+            // hard on/off so borders read as a luminous seam, not a jagged line.
+            int diff = 0;
+            const int ox[8] = {-1, 1, 0, 0, -1, -1, 1, 1};
+            const int oy[8] = {0, 0, -1, 1, -1, 1, -1, 1};
+            #pragma unroll
+            for (int i = 0; i < 8; ++i) {
+                int nx = x + ox[i];
+                int ny = y + oy[i];
+                if (nx < 0) nx = 0;  // clamp to the grid, so an edge pixel re-reads an in-grid texel
+                if (nx >= width) nx = width - 1;
+                if (ny < 0) ny = 0;
+                if (ny >= height) ny = height - 1;
+                float2 n = tex2D<float2>(tex, (float)nx + 0.5f, (float)ny + 0.5f);
+                if (is_seed(n) && (n.x != c.x || n.y != c.y)) {
+                    ++diff;
+                }
+            }
+
+            // Smooth interior shading: gentle radial falloff from the cell seed
+            // for a soft volumetric look, slowly breathing in time.
+            float shade = 1.0f / (1.0f + 0.0006f * dist * dist);
+            float pulse = 0.92f + 0.08f * sinf(1.5f * t + 0.02f * dist);
+            shade = (0.55f + 0.45f * shade) * pulse;
+            r *= shade; g *= shade; b *= shade;
+
+            if (diff > 0) {
+                // edge in [0,1]: stronger the more neighbors disagree.
+                float edge = (float)diff / 8.0f;
+                edge = edge * edge;  // bias toward the true seam
+                // Darken the base color toward the seam, then add a bright neon
+                // rim on top so cell boundaries glow instead of just going dark.
+                float dark = 1.0f - 0.85f * edge;
+                r *= dark; g *= dark; b *= dark;
+                float rim = edge * (0.65f + 0.35f * sinf(2.5f * t));
+                r += rim; g += rim * 0.9f; b += rim;
+            }
+        } else {
+            // --- Metaballs: glowing neon falloff from the nearest seed. ---
+            // Brightness peaks at the seed and decays smoothly with distance.
+            float glow = 1.0f / (1.0f + 0.0018f * dist * dist);
+            // A couple of animated isoline ripples add a layered plasma pulse.
+            float ripple = 0.5f + 0.5f * sinf(0.13f * dist - 3.0f * t);
+            float ripple2 = 0.5f + 0.5f * sinf(0.05f * dist + 1.7f * t);
+            float intensity = glow * (0.55f + 0.30f * ripple + 0.15f * ripple2);
+            // A soft core bloom keeps seed centers reading as hot points.
+            float core = 1.0f / (1.0f + 0.02f * dist * dist);
+            intensity += 0.5f * core;
+
+            // Hue sweeps with distance + time so blobs shimmer through the neon
+            // spectrum; value tracks intensity so falloff still fades to black.
+            float hue = 0.6f + 0.0015f * dist + 0.05f * t;
+            float val = intensity;
+            if (val > 1.0f) val = 1.0f;
+            hsv_to_rgb(hue, 0.85f, val, &r, &g, &b);
+            // Lift toward white at the very brightest cores for a hot-tip look.
+            float hot = intensity - 1.0f;  // only the part of intensity above 1 whitens
+            if (hot > 0.0f) {
+                if (hot > 1.0f) hot = 1.0f;
+                r += hot * (1.0f - r);
+                g += hot * (1.0f - g);
+                b += hot * (1.0f - b);
+            }
+        }
+    }
+
+    // Clamp to [0, 1], then pack as 4 bytes per pixel with alpha fixed at 255.
+    if (r < 0.0f) r = 0.0f; if (r > 1.0f) r = 1.0f;
+    if (g < 0.0f) g = 0.0f; if (g > 1.0f) g = 1.0f;
+    if (b < 0.0f) b = 0.0f; if (b > 1.0f) b = 1.0f;
+
+    int idx = (y * width + x) * 4;  // row-major byte offset of this pixel
+    output[idx + 0] = (unsigned char)(r * 255.0f);
+    output[idx + 1] = (unsigned char)(g * 255.0f);
+    output[idx + 2] = (unsigned char)(b * 255.0f);
+    output[idx + 3] = 255;
+}
+"""
+
+# GLSL shaders -- these just display a texture on a fullscreen rectangle.
+# Nothing CUDA-specific here.
+
+VERTEX_SHADER_SOURCE = """#version 330 core
+in vec2 position;
+in vec2 texcoord;
+out vec2 v_texcoord;
+void main() {
+    gl_Position = vec4(position, 0.0, 1.0);
+    v_texcoord = texcoord;
+}
+"""
+
+FRAGMENT_SHADER_SOURCE = """#version 330 core
+in vec2 v_texcoord;
+out vec4 fragColor;
+uniform sampler2D tex;
+void main() {
+    fragColor = texture(tex, v_texcoord);
+}
+"""
+
+
+if __name__ == "__main__":
+    main()
diff --git a/cuda_core/examples/gl_interop_particles.py b/cuda_core/examples/gl_interop_particles.py
new file mode 100644
index 00000000000..c5dd06e3697
--- /dev/null
+++ b/cuda_core/examples/gl_interop_particles.py
@@ -0,0 +1,688 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# ################################################################################
+#
+# This example demonstrates cuda.core.GraphicsResource VBO interop together with
+# CUDAArray, SurfaceObject, and TextureObject. One million points (by default)
+# flow through an animated curl-noise velocity field. CUDA writes particle
+# positions directly into an OpenGL Vertex Buffer Object (VBO), and OpenGL draws
+# that same buffer as a glowing additive point cloud -- no PBO, no fullscreen
+# quad, no pixel copy. Requires pyglet.
+#
+# ################################################################################
+
+# What this example teaches
+# =========================
+# - How to register an OpenGL VBO (GL_ARRAY_BUFFER) with CUDA using
+#   `GraphicsResource.from_gl_buffer(vbo_id, flags="none")` and treat the mapped
+#   `buf.handle` as a device pointer to a particle array that CUDA both reads and
+#   writes in place. This is the standout difference from every other interop
+#   example here: those copy CUDA output into a PBO, upload it to a texture, and
+#   draw a fullscreen quad. This one renders geometry straight out of the buffer
+#   CUDA just wrote.
+# - How to bake a smooth, periodic scalar potential into a 2D CUDAArray once (via
+#   a SurfaceObject write kernel), then bind that array as a LINEAR + WRAP
+#   normalized TextureObject and derive a divergence-free curl-noise velocity
+#   field from finite differences of texture samples.
+# - How to draw GL_POINTS directly from a CUDA-written VBO with additive blending
+#   and shader-controlled point size for a luminous, flowing look.
+#
+# How it works
+# ============
+# We allocate one VBO holding N particles. Each particle is 4 floats:
+#
+#     [x, y, age, speed]   (stride = 16 bytes)
+#
+#   - x, y   : position in the [0, 1] x [0, 1] domain. The vertex shader maps
+#              this to clip space with `pos * 2 - 1`. Keeping a single [0, 1]
+#              domain means the kernel can sample the velocity texture with
+#              normalized coordinates directly -- no scaling bugs.
+#   - age    : seconds since this particle last (re)spawned. Drives color and
+#              alpha; resets to 0 on respawn.
+#   - speed  : normalized flow magnitude in [0, 1] at the particle's location
+#              (the kernel maps gradient steepness through tanh). Drives the
+#              color ramp so fast jets glow hotter than calm eddies.
+#
+# The GL vertex attributes read from the same buffer:
+#   - "position" : 2 floats at offset 0
+#   - "attribs"  : 2 floats (age, speed) at offset 8
+#
+# The CUDA kernel `advance_particles` indexes the buffer as `float4*` so its
+# layout agrees with the host init array and the GL attribute pointers above.
+#
+#   VBO INTEROP (one buffer, CUDA writes -> OpenGL draws)
+#   ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#   +-------------------+   map(stream)    +---------------------+
+#   |   OpenGL VBO      | ---------------> |  advance_particles  |
+#   | float4 per point  |                  |  (curl-noise flow)  |
+#   | [x, y, age, speed]| <--------------- |  reads+writes pts   |
+#   +-------------------+   unmap          +---------------------+
+#           |
+#           |  glDrawArrays(GL_POINTS)   (after unmap; GL cannot read a
+#           v                             buffer while it is mapped to CUDA)
+#       glowing point cloud on screen
+#
+# The velocity field is a curl of a baked scalar potential P(u, v):
+#
+#     velocity = ( dP/dv, -dP/du )
+#
+# Taking the curl of a scalar potential yields a divergence-free field, so
+# particles swirl without piling up or thinning out. The potential is baked once
+# into a single-channel float CUDAArray as a sum of periodic sinusoids, then
+# sampled with LINEAR + WRAP + normalized coordinates. A time uniform scrolls the
+# sample coordinates so the whole field slowly drifts and animates.
+#
+# Why flags="none" (not "write_discard")?
+# ---------------------------------------
+# The PBO examples register with "write_discard" because they overwrite every
+# pixel each frame and never read the old contents. Here the kernel READS each
+# particle's current position before writing the advanced one, so we must NOT
+# tell CUDA the prior contents are garbage. We use "none".
+#
+# Single-channel surf2Dwrite byte offset
+# --------------------------------------
+# The potential array is single-channel `float` (4 bytes). `surf2Dwrite` takes
+# the x coordinate in BYTES, so the offset is `x * sizeof(float)` = `x * 4`.
+# (Contrast the float2 reaction-diffusion example, which uses `x * 8`.)
+#
+# What you should see
+# ===================
+# Luminous filaments of points swirling through an animated flow field, colored
+# blue -> cyan -> white by speed and faded by age. Press R to respawn all
+# particles, +/- to slow down / speed up the flow, and Escape to exit. The window
+# title shows the particle count and FPS.
+#
+
+# /// script
+# dependencies = ["cuda_bindings", "cuda_core>0.6.0", "pyglet"]
+# ///
+
+import ctypes
+import sys
+import time
+
+import numpy as np
+
+from cuda.core import (
+    AddressMode,
+    ArrayFormat,
+    CUDAArray,
+    Device,
+    FilterMode,
+    GraphicsResource,
+    LaunchConfig,
+    Program,
+    ProgramOptions,
+    ReadMode,
+    ResourceDescriptor,
+    SurfaceObject,
+    TextureDescriptor,
+    TextureObject,
+    launch,
+)
+
+# ---------------------------------------------------------------------------
+# Simulation parameters (feel free to change these)
+# ---------------------------------------------------------------------------
+WIDTH = 900
+HEIGHT = 900
+N_PARTICLES = 1_000_000  # number of points in the cloud
+FLOATS_PER_PARTICLE = 4  # [x, y, age, speed]
+POTENTIAL_DIM = 256  # resolution of the baked potential texture (square)
+DT = 1.0 / 60.0  # simulation time step per frame (seconds)
+BASE_SPEED = 0.15  # base flow speed (domain units per second)
+SPEED_STEP = 1.25  # multiplier applied by +/-
+MAX_AGE = 4.0  # seconds before a particle respawns
+POINT_SIZE = 2.4  # rendered point size in pixels
+
+
+# ============================= Helper functions =============================
+#
+# The functions below set up CUDA and OpenGL. If you're here to learn about VBO
+# interop, skip ahead to main() -- the interesting part is there. These helpers
+# exist so that main() reads like a short story instead of a wall of
+# boilerplate.
+# ============================================================================
+
+
+def setup_cuda():
+    """Prepare the GPU side: return (device, stream, kernels, configs).
+
+    `kernels` and `configs` are dicts keyed by "bake", "init" and "advance".
+    Exits the process with status 1 when the GPU's compute capability major
+    version is below 3.
+    """
+    dev = Device(0)
+    dev.set_current()
+
+    # Bindless surface objects (cuSurfObjectCreate), which SurfaceObject is
+    # built on, need compute capability >= 3.0.
+    cc = dev.compute_capability
+    if cc.major < 3:
+        print(
+            "This example requires a GPU with compute capability >= 3.0 for "
+            f"bindless surface objects. Found sm_{cc.major}{cc.minor}.",
+            file=sys.stderr,
+        )
+        sys.exit(1)
+
+    stream = dev.create_stream()
+
+    # Build as C++ so the templated tex2D<float> overload resolves.
+    entry_points = {
+        "bake": "bake_potential",
+        "init": "init_particles",
+        "advance": "advance_particles",
+    }
+    options = ProgramOptions(std="c++17", arch=f"sm_{dev.arch}")
+    module = Program(KERNEL_SOURCE, code_type="c++", options=options).compile(
+        "cubin", name_expressions=tuple(entry_points.values())
+    )
+    kernels = {key: module.get_kernel(name) for key, name in entry_points.items()}
+
+    # bake covers POTENTIAL_DIM x POTENTIAL_DIM texels with 16x16 blocks;
+    # init/advance cover N_PARTICLES with 1D blocks of 256 threads. Grid
+    # sizes are rounded up so a partial last block is still launched.
+    tile = 16
+    tiles_per_side = (POTENTIAL_DIM + tile - 1) // tile
+    threads = 256
+    particle_blocks = (N_PARTICLES + threads - 1) // threads
+    shapes = {
+        "bake": ((tiles_per_side, tiles_per_side, 1), (tile, tile, 1)),
+        "init": ((particle_blocks, 1, 1), (threads, 1, 1)),
+        "advance": ((particle_blocks, 1, 1), (threads, 1, 1)),
+    }
+    configs = {key: LaunchConfig(grid=grid, block=block) for key, (grid, block) in shapes.items()}
+
+    return dev, stream, kernels, configs
+
+
+def create_window():
+    """Open the pyglet window and return (window, gl_module, pyglet).
+
+    pyglet is imported here rather than at module import time; if the import
+    fails, an install hint is printed to stderr and the process exits with
+    status 1.
+    """
+    try:
+        import pyglet
+        from pyglet.gl import gl as gl_module
+    except ImportError:
+        hint = "This example requires pyglet >= 2.0.\nInstall it with:  pip install pyglet"
+        print(hint, file=sys.stderr)
+        sys.exit(1)
+
+    # Same base title that the FPS counter in main() extends once per second.
+    title = "cuda.core VBO interop - Curl-Noise Particle Flow"
+    window = pyglet.window.Window(WIDTH, HEIGHT, caption=title, vsync=False)
+    return window, gl_module, pyglet
+
+
+def create_particle_vbo(gl, shader_prog):
+    """Allocate the particle VBO plus a VAO describing its vertex layout.
+
+    The buffer stores N_PARTICLES records of 4 float32 values,
+    [x, y, age, speed]. It is pre-filled from a fixed-seed NumPy generator
+    (x and y uniform, age staggered up to MAX_AGE, speed 0) so it holds valid
+    data before any kernel has run. The VAO and the array-buffer binding are
+    both unbound again before returning.
+
+    Returns (vbo_gl_name, vao_gl_name).
+    """
+    # This host layout has to agree with the kernel's float4 view and with the
+    # attribute pointers set up below.
+    rng = np.random.default_rng(12345)
+    init = np.empty((N_PARTICLES, FLOATS_PER_PARTICLE), dtype=np.float32)
+    for column in (0, 1):  # x, then y
+        init[:, column] = rng.random(N_PARTICLES, dtype=np.float32)
+    init[:, 2] = rng.random(N_PARTICLES, dtype=np.float32) * MAX_AGE  # staggered age
+    init[:, 3] = 0.0  # speed
+    init = np.ascontiguousarray(init)
+
+    vao_name = ctypes.c_uint(0)
+    gl.glGenVertexArrays(1, ctypes.byref(vao_name))
+    gl.glBindVertexArray(vao_name.value)
+
+    vbo_name = ctypes.c_uint(0)
+    gl.glGenBuffers(1, ctypes.byref(vbo_name))
+    gl.glBindBuffer(gl.GL_ARRAY_BUFFER, vbo_name.value)
+    # GL_DYNAMIC_DRAW: CUDA rewrites this buffer every frame.
+    host_ptr = init.ctypes.data_as(ctypes.c_void_p)
+    gl.glBufferData(gl.GL_ARRAY_BUFFER, init.nbytes, host_ptr, gl.GL_DYNAMIC_DRAW)
+
+    stride = FLOATS_PER_PARTICLE * init.itemsize  # 4 floats * 4 bytes = 16 bytes
+
+    def bind_pair(attribute, byte_offset):
+        """Enable `attribute` as 2 floats read at `byte_offset` in each record."""
+        location = gl.glGetAttribLocation(shader_prog.id, attribute)
+        gl.glEnableVertexAttribArray(location)
+        gl.glVertexAttribPointer(location, 2, gl.GL_FLOAT, gl.GL_FALSE, stride, ctypes.c_void_p(byte_offset))
+
+    bind_pair(b"position", 0)  # x, y
+    bind_pair(b"attribs", 8)  # age, speed
+
+    # Leave no VAO / array buffer bound for the caller.
+    gl.glBindVertexArray(0)
+    gl.glBindBuffer(gl.GL_ARRAY_BUFFER, 0)
+
+    return vbo_name.value, vao_name.value
+
+
+def create_shader(gl):
+    """Build the point-cloud shader program; the caller must keep it alive."""
+    from pyglet.graphics.shader import Shader, ShaderProgram
+
+    stages = (Shader(VERTEX_SHADER_SOURCE, "vertex"), Shader(FRAGMENT_SHADER_SOURCE, "fragment"))
+    shader_prog = ShaderProgram(*stages)
+
+    # Additive blending (SRC_ALPHA, ONE) makes overlapping points accumulate
+    # into glow. GL_PROGRAM_POINT_SIZE hands point sizing to the vertex shader
+    # (off by default in the core profile); depth testing is switched off.
+    gl.glEnable(gl.GL_BLEND)
+    gl.glBlendFunc(gl.GL_SRC_ALPHA, gl.GL_ONE)
+    gl.glEnable(gl.GL_PROGRAM_POINT_SIZE)
+    gl.glDisable(gl.GL_DEPTH_TEST)
+
+    return shader_prog
+
+
+def make_potential_array():
+    """Allocate the square, single-channel FLOAT32 CUDAArray for the potential."""
+    # is_surface_load_store=True: the array is written once through a
+    # SurfaceObject, then sampled through a TextureObject.
+    side = POTENTIAL_DIM
+    descriptor = {
+        "shape": (side, side),
+        "format": ArrayFormat.FLOAT32,
+        "num_channels": 1,
+        "is_surface_load_store": True,
+    }
+    return CUDAArray.from_descriptor(**descriptor)
+
+
+def make_potential_texture(arr):
+    """Wrap `arr` in a TextureObject: WRAP addressing, LINEAR filter, normalized coords."""
+    source = ResourceDescriptor.from_array(arr)
+    # WRAP addressing only works with normalized coordinates.
+    sampling = TextureDescriptor(
+        address_mode=AddressMode.WRAP,
+        filter_mode=FilterMode.LINEAR,
+        read_mode=ReadMode.ELEMENT_TYPE,
+        normalized_coords=True,
+    )
+    return TextureObject.from_descriptor(resource=source, texture_descriptor=sampling)
+
+
+def reset_particles(stream, kernels, configs, resource, seed):
+    """Re-seed all N_PARTICLES particles in place with the init_particles kernel.
+
+    The VBO is mapped on `stream` only for the duration of the launch, the same
+    way the per-frame advance does it, so nothing is re-uploaded from the host.
+    OpenGL must leave the buffer alone while it is mapped.
+
+    `seed` is forwarded to the kernel as a uint32, where it salts the
+    per-particle hash.
+    """
+    with resource.map(stream=stream) as buf:
+        config, kernel = configs["init"], kernels["init"]
+        # Argument order mirrors init_particles(particles, n, seed, max_age).
+        count = np.int32(N_PARTICLES)
+        salt = np.uint32(seed)
+        lifetime = np.float32(MAX_AGE)
+        launch(stream, config, kernel, buf.handle, count, salt, lifetime)
+
+
+# ================================== main() ==================================
+
+
+def main():
+    """Set up CUDA, the window and the interop objects, then run the pyglet event loop."""
+    # --- Step 1: Set up CUDA (compile kernels, create stream) ---
+    dev, stream, kernels, configs = setup_cuda()
+
+    # --- Step 2: Open a window ---
+    window, gl, pyglet = create_window()
+
+    # --- Step 3: Build the point-cloud shader and enable additive blending ---
+    shader_prog = create_shader(gl)
+
+    # --- Step 4: Create the particle VBO + VAO (the buffer CUDA writes into) ---
+    vbo_id, vao_id = create_particle_vbo(gl, shader_prog)
+
+    # =======================================================================
+    # API MAP -- the four cuda.core interop objects this example hinges on
+    # =======================================================================
+    #   GraphicsResource.from_gl_buffer(VBO)
+    #       Registers a GL VBO (NOT a PBO) so CUDA writes vertex positions,
+    #       OpenGL then draws directly -- zero copy. The mapped buf.handle is a
+    #       raw device pointer into the same float4 array OpenGL renders from.
+    #   CUDAArray (single-channel float, is_surface_load_store=True)
+    #       The backing storage for the baked scalar potential.
+    #   SurfaceObject.from_array(pot_arr)
+    #       Write view used ONCE at startup to bake the potential into the array.
+    #   TextureObject (LINEAR + WRAP + normalized, 1ch)
+    #       Read view: LINEAR+WRAP+normalized lets the kernel read the baked
+    #       potential's gradient with smooth, tileable sampling -- the curl of
+    #       that gradient is the divergence-free velocity field.
+    # The texture handle is created once, kept alive, and wrapped in np.uint64
+    # at launch; buf.handle is passed raw.
+    # =======================================================================
+
+    # --- Step 5: Register the VBO with CUDA ---
+    #     flags="none": the kernel reads each particle before writing it back,
+    #     so we must NOT discard the prior contents (that's why this is not
+    #     "write_discard" like the PBO examples).
+    resource = GraphicsResource.from_gl_buffer(vbo_id, flags="none")
+
+    # --- Step 6: Allocate + bake the curl-noise potential, bind it as a texture ---
+    pot_arr = make_potential_array()
+    pot_surf = SurfaceObject.from_array(pot_arr)  # created once, kept alive
+    pot_tex = make_potential_texture(pot_arr)  # created once, kept alive
+
+    # Bake the scalar potential once via the SurfaceObject.
+    launch(
+        stream,
+        configs["bake"],
+        kernels["bake"],
+        np.uint64(pot_surf.handle),
+        np.int32(POTENTIAL_DIM),
+        np.int32(POTENTIAL_DIM),
+    )
+
+    # --- Step 7: Seed the particles into the VBO ---
+    # `state` is a dict so the nested event handlers below can mutate it.
+    state = {"seed": 1, "speed": BASE_SPEED, "t": 0.0}
+    reset_particles(stream, kernels, configs, resource, state["seed"])
+
+    # --- Step 8: Render loop ---
+    start_time = time.monotonic()
+    frame_count = 0
+    fps_time = start_time
+
+    @window.event
+    def on_key_press(symbol, _modifiers):
+        key = pyglet.window.key
+        if symbol == key.ESCAPE:
+            window.close()  # NOTE(review): confirm the on_close cleanup below still runs on this path
+            return
+        if symbol == key.R:
+            state["seed"] += 1  # new seed -> init_particles hashes to a new layout
+            reset_particles(stream, kernels, configs, resource, state["seed"])
+            return
+        if symbol in (key.PLUS, key.NUM_ADD, key.EQUAL):
+            state["speed"] *= SPEED_STEP
+            return
+        if symbol in (key.MINUS, key.NUM_SUBTRACT):
+            state["speed"] /= SPEED_STEP
+            return
+
+    @window.event
+    def on_draw():
+        nonlocal frame_count, fps_time
+
+        # Black background so additive accumulation reads as glow.
+        window.clear()
+
+        state["t"] += DT  # fixed step per drawn frame, independent of wall-clock time
+
+        # (a) Advance particles. The map brackets ONLY the CUDA launch -- OpenGL
+        #     cannot read the buffer while it is mapped to CUDA.
+        with resource.map(stream=stream) as buf:
+            launch(
+                stream,
+                configs["advance"],
+                kernels["advance"],
+                buf.handle,  # raw device pointer to the float4 particle array
+                np.uint64(pot_tex.handle),
+                np.int32(N_PARTICLES),
+                np.float32(DT),
+                np.float32(state["speed"]),
+                np.float32(state["t"]),
+                np.float32(MAX_AGE),
+                np.uint32(state["seed"]),
+            )
+        # Leaving the `with` block unmaps the buffer; only then may OpenGL draw from it.
+
+        # (b) Draw the particles straight from the VBO as GL_POINTS.
+        gl.glUseProgram(shader_prog.id)
+        max_age_loc = gl.glGetUniformLocation(shader_prog.id, b"max_age")  # looked up every frame
+        gl.glUniform1f(max_age_loc, MAX_AGE)
+        psize_loc = gl.glGetUniformLocation(shader_prog.id, b"point_size")  # looked up every frame
+        gl.glUniform1f(psize_loc, POINT_SIZE)
+        gl.glBindVertexArray(vao_id)
+        gl.glDrawArrays(gl.GL_POINTS, 0, N_PARTICLES)
+        gl.glBindVertexArray(0)
+        gl.glUseProgram(0)
+
+        # FPS counter (shown in window title)
+        frame_count += 1
+        now = time.monotonic()
+        if now - fps_time >= 1.0:
+            fps = frame_count / (now - fps_time)
+            window.set_caption(
+                "cuda.core VBO interop - Curl-Noise Particle Flow"
+                f" ({N_PARTICLES:,} points, {fps:.0f} FPS,"
+                f" speed x{state['speed'] / BASE_SPEED:.2f})"
+                " | GraphicsResource(VBO) + TextureObject[LINEAR|WRAP|norm|1ch]"
+            )
+            frame_count = 0
+            fps_time = now
+
+    @window.event
+    def on_close():
+        # Release the VBO registration, then texture/surface before their array, stream last.
+        resource.close()
+        pot_tex.close()
+        pot_surf.close()
+        pot_arr.close()
+        stream.close()
+
+    pyglet.app.run(interval=0)
+
+
+# ======================== GPU code (CUDA + GLSL) ============================
+#
+# These source strings are kept at the bottom of the file so they don't distract
+# from the Python logic above.
+#
+#   - KERNEL_SOURCE contains three CUDA C++ kernels:
+#       * bake_potential    -- writes a smooth, periodic scalar potential into a
+#                              single-channel float surface (once at startup).
+#       * init_particles    -- (re)spawns every particle to a pseudo-random
+#                              position with a staggered age. Operates on the
+#                              mapped VBO as a float4 array.
+#       * advance_particles -- reads each particle from the mapped VBO, samples
+#                              the potential texture, computes a divergence-free
+#                              curl velocity, integrates the position, handles
+#                              age/out-of-domain respawn, and writes the particle back.
+#
+#   - VERTEX_SHADER_SOURCE / FRAGMENT_SHADER_SOURCE draw GL_POINTS from the VBO
+#     with a soft round sprite colored by speed and faded by age.
+#
+# ============================================================================
+
+KERNEL_SOURCE = r"""
+// ---- shared helpers --------------------------------------------------------
+
+// Deterministic integer mixer (xor-shift and multiply rounds) -> float in [0, 1).
+__device__ __forceinline__ float hash01(unsigned int h) {
+    h = (h ^ (h >> 16)) * 0x7feb352du;
+    h = (h ^ (h >> 15)) * 0x846ca68bu;
+    h = h ^ (h >> 16);
+    return (float)(h & 0x00ffffffu) / 16777216.0f;  // low 24 bits / 2^24
+}
+
+__device__ __forceinline__ unsigned int seed_of(unsigned int idx, unsigned int salt) {
+    return 1u + idx * 747796405u + salt * 2891336453u;  // unsigned math wraps mod 2^32
+}
+
+// ---- bake the scalar potential ---------------------------------------------
+//
+// A sum of periodic sinusoids over the unit square. Using full 2*pi*k periods
+// makes the field seamless under WRAP addressing -- no visible edge.
+extern "C"
+__global__
+void bake_potential(cudaSurfaceObject_t surf, int width, int height) {
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    if (x >= width || y >= height) return;  // threads past the array edge do nothing
+
+    float u = (x + 0.5f) / (float)width;   // texel center, normalized to (0, 1)
+    float v = (y + 0.5f) / (float)height;  // texel center, normalized to (0, 1)
+    const float TWO_PI = 6.2831853f;
+
+    float p = 0.0f;  // sum of six sine waves, each with integer (u, v) frequencies
+    p += 1.00f * sinf(TWO_PI * (1.0f * u + 0.0f * v) + 0.3f);
+    p += 0.70f * sinf(TWO_PI * (0.0f * u + 1.0f * v) + 1.7f);
+    p += 0.55f * sinf(TWO_PI * (1.0f * u + 1.0f * v) + 2.1f);
+    p += 0.45f * sinf(TWO_PI * (2.0f * u - 1.0f * v) + 0.9f);
+    p += 0.30f * sinf(TWO_PI * (-1.0f * u + 2.0f * v) + 4.2f);
+    p += 0.25f * sinf(TWO_PI * (3.0f * u + 2.0f * v) + 5.5f);
+
+    // Single-channel float surface: x offset is in BYTES = x * sizeof(float).
+    surf2Dwrite(p, surf, x * (int)sizeof(float), y);
+}
+
+// ---- (re)spawn particles ---------------------------------------------------
+//
+// The VBO is a flat array of float4 [x, y, age, speed] per particle.
+extern "C"
+__global__
+void init_particles(float4* particles, int n,
+                    unsigned int seed, float max_age) {
+    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    if (tid >= n) return;
+
+    // Per-(particle, seed) hash state; distinct offsets give separate draws.
+    const unsigned int base = seed_of((unsigned int)tid, seed);
+    // x, y land in [0, 1); age is spread over [0, max_age) so that later
+    // respawns don't pulse in lockstep. The speed slot starts at 0.
+    particles[tid] = make_float4(hash01(base + 11u), hash01(base + 53u),
+                                 hash01(base + 97u) * max_age, 0.0f);
+}
+
+// ---- advance particles through the curl-noise field ------------------------
+extern "C"
+__global__
+void advance_particles(float4* particles,
+                       cudaTextureObject_t pot,
+                       int n, float dt, float speed,
+                       float t, float max_age,
+                       unsigned int seed) {
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i >= n) return;
+
+    float4 p = particles[i];  // [x, y, age, speed]; the old speed (p.w) is not read
+    float x = p.x;
+    float y = p.y;
+    float age = p.z;
+
+    // Scroll the sample coordinates slowly with time so the field animates.
+    float scroll = 0.03f * t;
+    float su = x + scroll;
+    float sv = y - 0.5f * scroll;
+
+    // Curl of a scalar potential P is (dP/dv, -dP/du): divergence-free flow.
+    // Estimate the gradient by central differences of texture samples. The
+    // texture is LINEAR + WRAP + normalized, so wrapped reads are seamless.
+    const float eps = 1.0f / 256.0f;  // one texel at the default POTENTIAL_DIM of 256
+    float p_up = tex2D<float>(pot, su, sv + eps);
+    float p_dn = tex2D<float>(pot, su, sv - eps);
+    float p_rt = tex2D<float>(pot, su + eps, sv);
+    float p_lt = tex2D<float>(pot, su - eps, sv);
+
+    float dP_dv = (p_up - p_dn) / (2.0f * eps);
+    float dP_du = (p_rt - p_lt) / (2.0f * eps);
+
+    // Curl direction, then bound the magnitude. The raw analytic gradient of
+    // the summed sinusoids runs ~0..20, which (times speed) would whip every
+    // particle across the domain in well under a second and saturate the color
+    // ramp. We split it: `dir` is the flow direction, and `flow` maps the
+    // gradient steepness through tanh into [0, 1] so the field has slow eddies
+    // and fast jets. The displacement is `speed * flow` domain-units/sec, so
+    // `speed` is a true unit-per-second knob and `flow` drives the color ramp.
+    float gx = dP_dv;
+    float gy = -dP_du;
+    float grad = sqrtf(gx * gx + gy * gy) + 1e-6f;  // +1e-6 keeps the divisions below finite
+    float flow = tanhf(grad * 0.12f);  // 0 in calm regions, ->1 in steep jets
+    float vx = speed * flow * (gx / grad);
+    float vy = speed * flow * (gy / grad);
+
+    // Store `flow` (the normalized speed in [0, 1]) as the color driver.
+    float vmag = flow;
+
+    // Integrate position.
+    x += vx * dt;
+    y += vy * dt;
+    age += dt;
+
+    // Respawn on age expiry or if a particle drifts out of the unit domain.
+    bool respawn = (age >= max_age) || x < 0.0f || x > 1.0f || y < 0.0f || y > 1.0f;
+    if (respawn) {
+        // Jitter the seed by frame-ish state so respawns spread out over time.
+        unsigned int s = seed_of((unsigned int)i, seed + (unsigned int)(t * 60.0f));
+        x = hash01(s + 11u);
+        y = hash01(s + 53u);
+        age = 0.0f;
+        vmag = 0.0f;
+    }
+
+    particles[i] = make_float4(x, y, age, vmag);
+}
+"""
+
+# GLSL shaders -- draw GL_POINTS from the VBO. Position maps [0,1] -> clip space;
+# color ramps blue -> cyan -> white by speed and fades with age. The fragment
+# shader makes each point a soft round sprite for the glow.
+
+VERTEX_SHADER_SOURCE = """#version 330 core
+in vec2 position;   // x, y in [0, 1]
+in vec2 attribs;    // age, speed
+out float v_age;
+out float v_speed;
+uniform float max_age;
+uniform float point_size;
+void main() {
+    gl_Position = vec4(position * 2.0 - 1.0, 0.0, 1.0);
+    v_age = clamp(attribs.x / max_age, 0.0, 1.0);
+    v_speed = attribs.y;
+    // Subtle size-by-speed: fast jets render a touch larger so filaments read
+    // as brighter, structured streaks. Reuses the existing speed attribute --
+    // no struct change. Calm points keep the base size; never shrinks below it.
+    gl_PointSize = point_size * (1.0 + 0.3 * clamp(v_speed, 0.0, 1.0));
+}
+"""
+
+FRAGMENT_SHADER_SOURCE = """#version 330 core
+in float v_age;
+in float v_speed;
+out vec4 fragColor;
+void main() {
+    // Soft round sprite: fade toward the edge of the point.
+    vec2 d = gl_PointCoord - vec2(0.5);
+    float r = length(d) * 2.0;
+    float falloff = clamp(1.0 - r, 0.0, 1.0);
+    falloff *= falloff;
+
+    // Speed ramp: blue -> cyan -> white. v_speed is the normalized flow
+    // magnitude in [0, 1] (see advance_particles), so it spans the ramp.
+    float s = clamp(v_speed, 0.0, 1.0);
+    vec3 cool = vec3(0.12, 0.40, 1.00);   // lifted enough that slow points still glow
+    vec3 mid  = vec3(0.22, 0.85, 1.15);
+    vec3 hot  = vec3(1.15, 1.15, 1.20);   // slightly >1 so only the densest cores clip
+    vec3 color = (s < 0.5)
+        ? mix(cool, mid, s * 2.0)
+        : mix(mid, hot, (s - 0.5) * 2.0);
+
+    // Fade in just after spawn and out near end of life.
+    float life = (1.0 - v_age) * smoothstep(0.0, 0.08, v_age);
+    float alpha = falloff * life * 0.7;   // density carries the glow; trim so cores don't fully clip
+
+    fragColor = vec4(color, alpha);
+}
+"""
+
+
+if __name__ == "__main__":
+    main()
diff --git a/cuda_core/examples/gl_interop_physarum.py b/cuda_core/examples/gl_interop_physarum.py
new file mode 100644
index 00000000000..99972635b14
--- /dev/null
+++ b/cuda_core/examples/gl_interop_physarum.py
@@ -0,0 +1,889 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# ################################################################################
+#
+# This example demonstrates cuda.core.CUDAArray, TextureObject, and SurfaceObject
+# together with a plain device Buffer and GraphicsResource for CUDA/OpenGL
+# interop. A large population of "slime mold" (Physarum) agents crawls over a
+# single-channel float trail map: each agent senses the trail ahead via a
+# TextureObject (LINEAR + WRAP sampling), steers toward the strongest scent,
+# steps forward, and deposits pheromone through a SurfaceObject. A separate
+# diffuse/decay pass blurs and fades the trail (ping-ponged between two CUDA
+# arrays), and a colorize pass writes a neon palette straight into an OpenGL
+# PBO. The result is emergent, self-organizing vein/network patterns. Requires
+# pyglet.
+#
+# ################################################################################
+
+# What this example teaches
+# =========================
+# - How to combine a plain device Buffer (per-agent state) with CUDAArray-backed
+#   TextureObject/SurfaceObject pairs in a single simulation, all on the GPU.
+# - How to allocate a single-channel float CUDAArray with
+#   `is_surface_load_store=True` so the same memory can be read as a
+#   TextureObject (LINEAR + WRAP + normalized) and written as a SurfaceObject.
+# - How to initialize a device Buffer from host data without a third-party array
+#   library: stage through a host-accessible pinned Buffer, fill it via NumPy,
+#   then `copy_from` into the device Buffer.
+#
+# How it works
+# ============
+# Physarum is an agent-based transport-network model. Every agent stores
+# (x, y, heading) and, once per frame:
+#
+#   1. Samples the trail at three sensors (left / center / right of its heading,
+#      a fixed sensor distance ahead) using tex2D<float> LINEAR sampling.
+#   2. Rotates toward whichever sensor reads strongest (with a little random
+#      jitter from a per-agent xorshift RNG seeded by index + frame).
+#   3. Steps forward by a fixed speed and wraps around the toroidal edges.
+#   4. Deposits a constant amount of pheromone into the trail via surf2Dwrite.
+#      Concurrent agents may race on the same texel -- that is acceptable and
+#      even characteristic of the model.
+#
+# Then two grid-parallel passes finish the frame:
+#
+#   diffuse_decay : box-blur the trail (tex2D LINEAR neighbor taps) and multiply
+#                   by a decay factor < 1. Reads the current array, writes the
+#                   other, then we swap (ping-pong).
+#   colorize      : color the trail by local gradient direction (hue) modulated
+#                   by intensity, with a ridge boost + bloom halo, into the PBO.
+#
+#   PING-PONG (two single-channel float arrays)
+#   ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#   move_agents reads + deposits into the CURRENT array (tex + surf of same arr).
+#   diffuse_decay reads CURRENT (tex) -> writes OTHER (surf) -> swap.
+#   colorize reads the new CURRENT (tex) -> OpenGL PBO.
+#
+# Why LINEAR + WRAP + normalized coords?
+# --------------------------------------
+# Addressing modes WRAP and MIRROR are only supported with normalized
+# coordinates. WRAP makes the world a torus so agents and diffusion seamlessly
+# cross the edges. LINEAR filtering is essentially free on the hardware and
+# gives the agents smooth sub-texel gradient sensing. The grid passes sample at
+# texel centers `(x + 0.5) / W`, so +/-1/W offsets land on neighbor centers.
+#
+# Channel byte width in surf2Dwrite
+# ---------------------------------
+# `surf2Dwrite` takes the x coordinate in BYTES, not elements. The trail is a
+# single-channel `float` surface, so the x offset is `x * sizeof(float)` = `x*4`.
+# (Contrast a `float2` surface, which would need `x*8`.) Getting this wrong
+# silently corrupts every Nth column.
+#
+# Per-agent state lives in a plain device Buffer
+# ----------------------------------------------
+# Agents are stored as a flat float32 array of length 3*N laid out as
+# [x0, y0, h0, x1, y1, h1, ...]. We allocate it once with `dev.allocate` and
+# pass the Buffer object straight to `launch` (matching saxpy.py / memory_ops.py,
+# which pass Buffer objects directly rather than a raw pointer int).
+#
+# What you should see
+# ===================
+# A window of glowing neon filaments that grow, branch, and reorganize into a
+# living transport network. Press 1/2/3 to switch behavior presets (different
+# sensor geometry and turn speed give different morphologies), R to reseed the
+# agents and clear the trail, and Escape to exit. The title shows the preset,
+# agent count, and FPS.
+#
+
+# /// script
+# dependencies = ["cuda_bindings", "cuda_core>0.6.0", "pyglet"]
+# ///
+
+import ctypes
+import sys
+import time
+
+import numpy as np
+
+from cuda.core import (
+    AddressMode,
+    ArrayFormat,
+    CUDAArray,
+    Device,
+    FilterMode,
+    GraphicsResource,
+    LaunchConfig,
+    LegacyPinnedMemoryResource,
+    Program,
+    ProgramOptions,
+    ReadMode,
+    ResourceDescriptor,
+    SurfaceObject,
+    TextureDescriptor,
+    TextureObject,
+    launch,
+)
+
+# ---------------------------------------------------------------------------
+# Simulation parameters (feel free to change these)
+# ---------------------------------------------------------------------------
+WIDTH = 1024
+HEIGHT = 1024
+N_AGENTS = 1 << 21  # ~2.1 million agents
+DEPOSIT = 0.2  # pheromone added to the trail per agent per frame; kept small
+#              so the additive deposit builds up over several frames instead
+#              of immediately saturating at the kernel's 1.0 clamp
+
+# Named presets, keyed by the digit key that selects them. Tuple layout:
+# (sensor_angle_rad, sensor_distance_px, turn_speed_rad, move_speed_px, decay, label).
+PRESETS = {
+    "1": (0.40, 9.0, 0.40, 1.0, 0.92, "veins"),
+    "2": (0.80, 16.0, 0.25, 1.0, 0.90, "webs"),
+    "3": (1.20, 5.0, 0.65, 1.5, 0.95, "swarm"),
+}
+DEFAULT_PRESET = "1"
+
+
+# ============================= Helper functions =============================
+#
+# The functions below set up CUDA and OpenGL. If you're here to learn about
+# CUDAArray/TextureObject/SurfaceObject/Buffer, skip ahead to main() -- the
+# interesting part is there. These helpers exist so that main() reads like a
+# short story instead of a wall of boilerplate.
+# ============================================================================
+
+
+def setup_cuda():
+    """Select device 0, compile the three kernels, and build their launch configs.
+
+    Returns (device, stream, kernels, configs). `kernels` and `configs` are dicts
+    keyed "move" / "diffuse" / "colorize"; the move pass is 1D over agents and the
+    other two share one 2D config over pixels. Exits with status 1 if the GPU's
+    compute capability is below 3.0.
+    """
+    dev = Device(0)
+    dev.set_current()
+
+    # Surface load/store has existed since SM 2.0, but the bindless surface
+    # objects used here (cuSurfObjectCreate) need SM 3.0 or newer.
+    cc = dev.compute_capability
+    if cc.major < 3:
+        print(
+            "This example requires a GPU with compute capability >= 3.0 for "
+            f"bindless surface objects. Found sm_{cc.major}{cc.minor}.",
+            file=sys.stderr,
+        )
+        sys.exit(1)
+
+    stream = dev.create_stream()
+
+    # Compile as C++ so the templated tex2D<float> overload resolves.
+    entry_points = {"move": "move_agents", "diffuse": "diffuse_decay", "colorize": "colorize"}
+    program = Program(
+        KERNEL_SOURCE,
+        code_type="c++",
+        options=ProgramOptions(std="c++17", arch=f"sm_{dev.arch}"),
+    )
+    module = program.compile("cubin", name_expressions=tuple(entry_points.values()))
+    kernels = {key: module.get_kernel(symbol) for key, symbol in entry_points.items()}
+
+    def blocks_needed(extent, block_dim):
+        """Number of blocks of `block_dim` threads needed to cover `extent` items."""
+        return (extent + block_dim - 1) // block_dim
+
+    # One thread per agent, in a 1D grid.
+    agent_block = (256, 1, 1)
+    agent_grid = (blocks_needed(N_AGENTS, agent_block[0]), 1, 1)
+
+    # One thread per pixel, in a 2D grid; diffuse and colorize share it.
+    pixel_block = (16, 16, 1)
+    pixel_grid = (blocks_needed(WIDTH, pixel_block[0]), blocks_needed(HEIGHT, pixel_block[1]), 1)
+    pixel_config = LaunchConfig(grid=pixel_grid, block=pixel_block)
+
+    configs = {
+        "move": LaunchConfig(grid=agent_grid, block=agent_block),
+        "diffuse": pixel_config,
+        "colorize": pixel_config,
+    }
+    return dev, stream, kernels, configs
+
+
+def create_window():
+    """Open the pyglet window and return (window, gl_module, pyglet); exit(1) if pyglet is missing."""
+    try:
+        import pyglet  # imported here so a missing pyglet yields the message below
+        from pyglet.gl import gl as _gl
+    except ImportError:
+        print(
+            "This example requires pyglet >= 2.0.\nInstall it with:  pip install pyglet",
+            file=sys.stderr,
+        )
+        sys.exit(1)
+
+    window = pyglet.window.Window(
+        WIDTH,
+        HEIGHT,
+        caption="cuda.core CUDAArray/Texture/Surface/Buffer - Physarum",
+        vsync=False,
+    )
+    return window, _gl, pyglet
+
+
+def create_display_resources(gl, width, height):
+    """Build the GL objects used to draw one texture across the whole window.
+
+    Creates a shader program, a vertex array describing a fullscreen quad, and
+    an RGBA8 texture of `width` x `height` that is allocated without initial
+    data. Nothing here involves CUDA -- it is ordinary OpenGL setup for a
+    textured quad.
+
+    `gl` is the GL module returned by create_window(); `width` and `height`
+    are the texture size in pixels.
+
+    Returns (shader_program, vertex_array_id, texture_id). shader_program is a
+    pyglet ShaderProgram object and must be kept alive by the caller.
+    """
+    from pyglet.graphics.shader import Shader, ShaderProgram
+
+    # Pass-through shaders: the vertex stage forwards the texture coordinate
+    # and the fragment stage samples the texture with it.
+    shader_prog = ShaderProgram(
+        Shader(VERTEX_SHADER_SOURCE, "vertex"),
+        Shader(FRAGMENT_SHADER_SOURCE, "fragment"),
+    )
+
+    # Fullscreen quad: two triangles, one vertex per row. Positions cover the
+    # clip-space square [-1, 1]^2; texture coordinates cover [0, 1]^2.
+    floats_per_vertex = 4
+    quad_verts = np.array(
+        [
+            # x,  y, s, t   (position + texture coordinate)
+            [-1, -1, 0, 0],
+            [1, -1, 1, 0],
+            [1, 1, 1, 1],
+            [-1, -1, 0, 0],
+            [1, 1, 1, 1],
+            [-1, 1, 0, 1],
+        ],
+        dtype=np.float32,
+    )
+
+    # The vertex array object records the attribute layout configured below.
+    vao = ctypes.c_uint(0)
+    gl.glGenVertexArrays(1, ctypes.byref(vao))
+    gl.glBindVertexArray(vao.value)
+
+    # Upload the quad once; it never changes, hence GL_STATIC_DRAW.
+    vbo = ctypes.c_uint(0)
+    gl.glGenBuffers(1, ctypes.byref(vbo))
+    gl.glBindBuffer(gl.GL_ARRAY_BUFFER, vbo.value)
+    gl.glBufferData(
+        gl.GL_ARRAY_BUFFER,
+        quad_verts.nbytes,
+        quad_verts.ctypes.data_as(ctypes.c_void_p),
+        gl.GL_STATIC_DRAW,
+    )
+
+    # Interleaved layout: each vertex is (x, y, s, t) as four float32 values,
+    # so the stride is 16 bytes and the texture coordinate starts at byte 8.
+    stride = floats_per_vertex * quad_verts.itemsize
+    texcoord_offset = 2 * quad_verts.itemsize
+    attribute_layout = (
+        (b"position", 0),
+        (b"texcoord", texcoord_offset),
+    )
+    for attr_name, byte_offset in attribute_layout:
+        location = gl.glGetAttribLocation(shader_prog.id, attr_name)
+        gl.glEnableVertexAttribArray(location)
+        gl.glVertexAttribPointer(location, 2, gl.GL_FLOAT, gl.GL_FALSE, stride, ctypes.c_void_p(byte_offset))
+
+    # Unbind the VAO so later GL calls cannot modify its recorded state.
+    gl.glBindVertexArray(0)
+
+    # Texture storage only (data pointer is None); each frame's pixels are
+    # copied in from the PBO later.
+    tex = ctypes.c_uint(0)
+    gl.glGenTextures(1, ctypes.byref(tex))
+    gl.glBindTexture(gl.GL_TEXTURE_2D, tex.value)
+    # No mipmaps are allocated, so the minification filter must be a
+    # non-mipmap mode (LINEAR) for the texture to be complete.
+    for filter_param in (gl.GL_TEXTURE_MIN_FILTER, gl.GL_TEXTURE_MAG_FILTER):
+        gl.glTexParameteri(gl.GL_TEXTURE_2D, filter_param, gl.GL_LINEAR)
+    gl.glTexImage2D(
+        gl.GL_TEXTURE_2D,
+        0,  # mip level
+        gl.GL_RGBA8,  # internal (storage) format
+        width,
+        height,
+        0,  # border, must be 0
+        gl.GL_RGBA,
+        gl.GL_UNSIGNED_BYTE,
+        None,
+    )
+
+    return shader_prog, vao.value, tex.value
+
+
+def create_pixel_buffer(gl, width, height):
+    """Allocate the Pixel Buffer Object (PBO) that CUDA and OpenGL share.
+
+    OpenGL can source texture uploads from a PBO; once that same buffer is
+    registered with CUDA, a kernel can write pixels directly into it.
+
+    Returns (pbo_gl_name, size_in_bytes).
+    """
+    target = gl.GL_PIXEL_UNPACK_BUFFER
+    size_in_bytes = 4 * width * height  # one byte for each of R, G, B, A
+    pbo_name = ctypes.c_uint(0)
+    gl.glGenBuffers(1, ctypes.byref(pbo_name))
+    gl.glBindBuffer(target, pbo_name.value)
+    gl.glBufferData(target, size_in_bytes, None, gl.GL_DYNAMIC_DRAW)
+    gl.glBindBuffer(target, 0)
+    return pbo_name.value, size_in_bytes
+
+
+def copy_pbo_to_texture(gl, pbo_id, tex_id, width, height):
+    """Upload the PBO's pixels into the GL texture (GPU-to-GPU, no host copy)."""
+    gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, pbo_id)
+    gl.glBindTexture(gl.GL_TEXTURE_2D, tex_id)
+    gl.glTexSubImage2D(
+        gl.GL_TEXTURE_2D,
+        0,  # mip level
+        0,  # x offset
+        0,  # y offset
+        width,
+        height,
+        gl.GL_RGBA,
+        gl.GL_UNSIGNED_BYTE,
+        None,  # None = read from the currently bound PBO, not from CPU
+    )
+    gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, 0)  # unbind so later uploads are unaffected
+
+
+def draw_fullscreen_quad(gl, shader_prog, vao_id, tex_id):
+    """Draw `tex_id` over the whole window using the quad in `vao_id`, then unbind."""
+    gl.glUseProgram(shader_prog.id)
+    gl.glBindTexture(gl.GL_TEXTURE_2D, tex_id)
+    gl.glBindVertexArray(vao_id)
+    gl.glDrawArrays(gl.GL_TRIANGLES, 0, 6)  # 6 vertices = the quad's two triangles
+    gl.glBindVertexArray(0)
+    gl.glUseProgram(0)
+
+
+def make_trail_arrays():
+    """Create the ping-pong pair of single-channel float32 trail arrays.
+
+    Both are allocated identically with `is_surface_load_store=True`.
+    Returns (arr_a, arr_b).
+    """
+    descriptor = {
+        "shape": (WIDTH, HEIGHT),
+        "format": ArrayFormat.FLOAT32,
+        "num_channels": 1,
+        "is_surface_load_store": True,
+    }
+    arr_a = CUDAArray.from_descriptor(**descriptor)
+    arr_b = CUDAArray.from_descriptor(**descriptor)
+    return arr_a, arr_b
+
+
+def make_texture(arr):
+    """Wrap `arr` in a TextureObject that samples LINEAR + WRAP with normalized coords."""
+    sampling = TextureDescriptor(
+        address_mode=AddressMode.WRAP,
+        filter_mode=FilterMode.LINEAR,
+        read_mode=ReadMode.ELEMENT_TYPE,
+        # Normalized coordinates are mandatory for the WRAP (and MIRROR) address modes.
+        normalized_coords=True,
+    )
+    source = ResourceDescriptor.from_array(arr)
+    return TextureObject.from_descriptor(resource=source, texture_descriptor=sampling)
+
+
+def fill_agent_host(host_view, seed):
+    """Populate a host float32 view of length 3*N with freshly seeded agents.
+
+    The view is interpreted as N rows of (x, y, heading): x is drawn from
+    [0, W), y from [0, H), and heading from [0, 2*pi), in that order, using a
+    NumPy Generator seeded with `seed`. Writes in place; returns nothing.
+    """
+    rng = np.random.default_rng(seed)
+    columns = host_view.reshape(N_AGENTS, 3).T  # rows of the transpose: x, y, heading
+    for column, upper in zip(columns, (WIDTH, HEIGHT, 2.0 * np.pi)):
+        column[:] = rng.uniform(0.0, upper, size=N_AGENTS)
+
+
+def reseed_agents(stream, device_agents, pinned_agents, host_view, seed):
+    """Refill the host staging view, copy it into the device agent Buffer, and sync.
+
+    No reallocation: the sync stops a later refill from racing a still-pending copy."""
+    fill_agent_host(host_view, seed)
+    device_agents.copy_from(pinned_agents, stream=stream)
+    stream.sync()  # staging memory may be rewritten as soon as this returns
+
+
+def clear_trail(stream, arr_a, arr_b, zeros):
+    """Reset both trail arrays to zero from the host `zeros` image. CUDAArray.copy_from
+    accepts a buffer-protocol host object directly (unlike Buffer.copy_from)."""
+    for trail in (arr_a, arr_b):
+        trail.copy_from(zeros, stream=stream)
+
+
+# ================================== main() ==================================
+
+
+def main():
+    """Set up CUDA and OpenGL, then run the Physarum render loop until the window closes."""
+    # --- Step 1: Set up CUDA (compile kernels, create stream) ---
+    dev, stream, kernels, configs = setup_cuda()
+
+    # --- Step 2: Open a window ---
+    window, gl, pyglet = create_window()
+
+    # --- Step 3: Create GL resources for drawing a texture to screen ---
+    #     (Standard OpenGL boilerplate -- not CUDA-specific.)
+    shader_prog, quad_vao, tex_id = create_display_resources(gl, WIDTH, HEIGHT)
+
+    # --- Step 4: Create the Pixel Buffer Object (PBO) ---
+    #     GPU memory owned by OpenGL: CUDA writes into it, OpenGL reads from it.
+    pbo_id, _ = create_pixel_buffer(gl, WIDTH, HEIGHT)
+
+    # --- Step 5: Register the PBO with CUDA ---
+    resource = GraphicsResource.from_gl_buffer(pbo_id, flags="write_discard")
+
+    # --- Step 6: Allocate the two ping-pong trail Arrays ---
+    #     Single-channel float with is_surface_load_store=True so they can be
+    #     bound as SurfaceObjects.
+    #
+    #   API MAP -- the cuda.core objects that drive this simulation:
+    #     * device Buffer (dev.allocate) holds raw agent state alongside the
+    #       array/texture/surface stack.
+    #     * TextureObject LINEAR+WRAP+normalized -> smooth, toroidal SENSE of the
+    #       pheromone field.
+    #     * SurfaceObject -> typed DEPOSIT writes into the same CUDAArray sensed
+    #       as a texture (is_surface_load_store=True).
+    arr_a, arr_b = make_trail_arrays()
+
+    # --- Step 7: Pre-create the four bindless handles (once, kept alive) ---
+    tex_a = make_texture(arr_a)
+    tex_b = make_texture(arr_b)
+    surf_a = SurfaceObject.from_array(arr_a)
+    surf_b = SurfaceObject.from_array(arr_b)
+
+    # --- Step 8: Allocate per-agent state in a plain device Buffer ---
+    #     Flat float32 [x, y, heading] * N. We stage host data through a
+    #     host-accessible pinned Buffer, then copy it into the device Buffer.
+    #     Both buffers are allocated once and reused on reseed.
+    agent_floats = 3 * N_AGENTS
+    agent_bytes = agent_floats * 4
+    device_agents = dev.allocate(agent_bytes, stream=stream)
+    pinned_mr = LegacyPinnedMemoryResource()
+    pinned_agents = pinned_mr.allocate(agent_bytes)
+    host_view = np.from_dlpack(pinned_agents).view(np.float32)
+
+    # Host-side zero image reused to clear the trail arrays.
+    zeros = np.zeros((WIDTH, HEIGHT), dtype=np.float32)
+
+    # --- Step 9: Seed initial agents + clear the trail ---
+    state = {"current": "a", "preset": DEFAULT_PRESET, "seed": 0, "frame": 0}
+    reseed_agents(stream, device_agents, pinned_agents, host_view, seed=state["seed"])
+    clear_trail(stream, arr_a, arr_b, zeros)
+    stream.sync()  # ensure the seed copy finishes before the first launch reads it
+
+    # --- Step 10: Render loop ---
+    start_time = time.monotonic()
+    frame_count = 0
+    fps_time = start_time
+
+    def current_tex_surf():
+        """Return (tex, surf) for the CURRENT trail array (read + deposit)."""
+        if state["current"] == "a":
+            return tex_a, surf_a
+        return tex_b, surf_b
+
+    def diffuse_read_write():
+        """Return (tex_read_current, surf_write_other, next_current)."""
+        if state["current"] == "a":
+            return tex_a, surf_b, "b"
+        return tex_b, surf_a, "a"
+
+    @window.event
+    def on_key_press(symbol, _modifiers):
+        key = pyglet.window.key
+        if symbol == key.ESCAPE:
+            window.close()
+            return
+        if symbol == key.R:
+            state["seed"] += 1
+            state["frame"] = 0
+            reseed_agents(stream, device_agents, pinned_agents, host_view, seed=state["seed"])
+            clear_trail(stream, arr_a, arr_b, zeros)
+            state["current"] = "a"
+            return
+        for digit_key, name in (
+            (key._1, "1"),
+            (key._2, "2"),
+            (key._3, "3"),
+        ):
+            if symbol == digit_key:
+                state["preset"] = name
+                return
+
+    @window.event
+    def on_draw():
+        nonlocal frame_count, fps_time
+
+        window.clear()
+        sensor_angle, sensor_dist, turn_speed, move_speed, decay, _label = PRESETS[state["preset"]]
+
+        # (a) Move + deposit: 1D over agents. Reads and deposits into the
+        #     CURRENT array (tex + surf of the same array).
+        tex_cur, surf_cur = current_tex_surf()
+        launch(
+            stream,
+            configs["move"],
+            kernels["move"],
+            device_agents,
+            np.int32(N_AGENTS),
+            np.uint64(tex_cur.handle),
+            np.uint64(surf_cur.handle),
+            np.int32(WIDTH),
+            np.int32(HEIGHT),
+            np.float32(sensor_angle),
+            np.float32(sensor_dist),
+            np.float32(turn_speed),
+            np.float32(move_speed),
+            np.float32(DEPOSIT),
+            np.uint32(state["frame"]),
+        )
+
+        # (b) Diffuse + decay: 2D over pixels. Reads CURRENT, writes OTHER, swap.
+        tex_read, surf_write, next_current = diffuse_read_write()
+        launch(
+            stream,
+            configs["diffuse"],
+            kernels["diffuse"],
+            np.uint64(tex_read.handle),
+            np.uint64(surf_write.handle),
+            np.int32(WIDTH),
+            np.int32(HEIGHT),
+            np.float32(decay),
+        )
+        state["current"] = next_current
+
+        # (c) Colorize the latest trail into the OpenGL PBO.
+        tex_show = tex_a if state["current"] == "a" else tex_b
+        with resource.map(stream=stream) as buf:
+            launch(
+                stream,
+                configs["colorize"],
+                kernels["colorize"],
+                np.uint64(tex_show.handle),
+                buf.handle,
+                np.int32(WIDTH),
+                np.int32(HEIGHT),
+            )
+        # Unmap happens automatically when the `with` block exits.
+
+        # (d) Tell OpenGL to copy the PBO contents into our texture.
+        copy_pbo_to_texture(gl, pbo_id, tex_id, WIDTH, HEIGHT)
+
+        # (e) Draw the texture to the screen.
+        draw_fullscreen_quad(gl, shader_prog, quad_vao, tex_id)
+
+        state["frame"] += 1
+
+        # FPS counter (shown in window title)
+        frame_count += 1
+        now = time.monotonic()
+        if now - fps_time >= 1.0:
+            fps = frame_count / (now - fps_time)
+            label = PRESETS[state["preset"]][5]
+            window.set_caption(
+                "cuda.core CUDAArray/Texture/Surface/Buffer - Physarum"
+                f" [{label}] ({WIDTH}x{HEIGHT}, {N_AGENTS:,} agents, {fps:.0f} FPS)"
+                " | Buffer(agents) + TextureObject[LINEAR|WRAP|norm] sense"
+                " + SurfaceObject deposit"
+            )
+            frame_count = 0
+            fps_time = now
+
+    @window.event
+    def on_close():
+        # Release what we opened: textures/surfaces before their arrays, stream last.
+        resource.close()
+        tex_a.close()
+        tex_b.close()
+        surf_a.close()
+        surf_b.close()
+        arr_a.close()
+        arr_b.close()
+        pinned_agents.close()
+        device_agents.close(stream)
+        stream.close()
+
+    pyglet.app.run(interval=0)
+
+
+# ======================== GPU code (CUDA + GLSL) ============================
+#
+# These source strings are kept at the bottom of the file so they don't
+# distract from the Python logic above.
+#
+#   - KERNEL_SOURCE contains three CUDA C++ kernels:
+#       * move_agents   -- 1 thread per agent: senses the trail at three points
+#                          via tex2D<float> (LINEAR + WRAP), rotates toward the
+#                          strongest, steps forward with toroidal wrap, and
+#                          deposits pheromone via surf2Dwrite (x offset in BYTES).
+#       * diffuse_decay -- box-blur the trail via tex2D LINEAR neighbor taps and
+#                          multiply by a decay factor < 1; ping-pong write.
+#       * colorize      -- color the trail by the local gradient DIRECTION (hue
+#                          via HSV) modulated by intensity, with a ridge boost
+#                          and a wider-tap bloom halo for glowing veins, into
+#                          RGBA bytes in the PBO.
+#
+#   - VERTEX_SHADER_SOURCE / FRAGMENT_SHADER_SOURCE are GLSL. They draw the
+#     texture onto a rectangle covering the entire window. Nothing interesting.
+#
+# ============================================================================
+
+KERNEL_SOURCE = r"""
+// Per-agent xorshift32 RNG: cheap, good enough for turn jitter. Seeded per
+// agent and per frame so the sequence differs every step.
+__device__ __forceinline__ unsigned int xorshift32(unsigned int s) {
+    s ^= s << 13;
+    s ^= s >> 17;
+    s ^= s << 5;
+    return s;
+}
+
+extern "C"
+__global__
+void move_agents(float* agents,
+                 int n_agents,
+                 cudaTextureObject_t tex,
+                 cudaSurfaceObject_t surf,
+                 int width, int height,
+                 float sensor_angle,
+                 float sensor_dist,
+                 float turn_speed,
+                 float move_speed,
+                 float deposit,
+                 unsigned int frame) {
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i >= n_agents) return;
+
+    int base = i * 3;
+    float x = agents[base + 0];
+    float y = agents[base + 1];
+    float heading = agents[base + 2];
+
+    float inv_w = 1.0f / (float)width;
+    float inv_h = 1.0f / (float)height;
+
+    // Sample the trail at center / left / right of the heading. Normalized
+    // coords (+0.5 texel center) are required for WRAP addressing.
+    float ca = heading;
+    float la = heading - sensor_angle;
+    float ra = heading + sensor_angle;
+
+    float cx = x + cosf(ca) * sensor_dist;
+    float cy = y + sinf(ca) * sensor_dist;
+    float lx = x + cosf(la) * sensor_dist;
+    float ly = y + sinf(la) * sensor_dist;
+    float rx = x + cosf(ra) * sensor_dist;
+    float ry = y + sinf(ra) * sensor_dist;
+
+    float sc = tex2D<float>(tex, (cx + 0.5f) * inv_w, (cy + 0.5f) * inv_h);
+    float sl = tex2D<float>(tex, (lx + 0.5f) * inv_w, (ly + 0.5f) * inv_h);
+    float sr = tex2D<float>(tex, (rx + 0.5f) * inv_w, (ry + 0.5f) * inv_h);
+
+    // Per-agent jitter in [0, 1).
+    unsigned int rng = xorshift32(((unsigned int)i + 1u) * 2654435761u + frame * 40503u);
+    float jitter = (rng & 0xffffffu) / (float)0x1000000;
+
+    // Steer toward the strongest sensor; random turn when ahead is ambiguous.
+    if (sc > sl && sc > sr) {
+        // keep going straight
+    } else if (sc < sl && sc < sr) {
+        // both sides better than center: turn randomly left or right
+        heading += (jitter < 0.5f ? -turn_speed : turn_speed);
+    } else if (sl > sr) {
+        heading -= turn_speed;
+    } else if (sr > sl) {
+        heading += turn_speed;
+    } else {
+        // tie: small random wiggle
+        heading += (jitter - 0.5f) * turn_speed;
+    }
+
+    // Step forward and wrap around the toroidal world.
+    x += cosf(heading) * move_speed;
+    y += sinf(heading) * move_speed;
+
+    float fw = (float)width;
+    float fh = (float)height;
+    if (x < 0.0f) x += fw;
+    if (x >= fw) x -= fw;
+    if (y < 0.0f) y += fh;
+    if (y >= fh) y -= fh;
+
+    agents[base + 0] = x;
+    agents[base + 1] = y;
+    agents[base + 2] = heading;
+
+    // Deposit pheromone at the new integer cell. surf2Dwrite x offset is in
+    // BYTES: single-channel float => x * sizeof(float). Concurrent agents may
+    // race on the same texel; that is acceptable for Physarum.
+    int ix = (int)x;
+    int iy = (int)y;
+    if (ix < 0) ix = 0; else if (ix >= width) ix = width - 1;
+    if (iy < 0) iy = 0; else if (iy >= height) iy = height - 1;
+
+    float prev = surf2Dread<float>(surf, ix * (int)sizeof(float), iy);
+    float val = prev + deposit;
+    if (val > 1.0f) val = 1.0f;
+    surf2Dwrite(val, surf, ix * (int)sizeof(float), iy);
+}
+
+extern "C"
+__global__
+void diffuse_decay(cudaTextureObject_t tex,
+                   cudaSurfaceObject_t surf,
+                   int width, int height,
+                   float decay) {
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    if (x >= width || y >= height) return;
+
+    float inv_w = 1.0f / (float)width;
+    float inv_h = 1.0f / (float)height;
+    float cx = (x + 0.5f) * inv_w;
+    float cy = (y + 0.5f) * inv_h;
+
+    // 3x3 box blur via LINEAR neighbor taps; WRAP gives toroidal edges.
+    float sum = 0.0f;
+    for (int dy = -1; dy <= 1; ++dy) {
+        for (int dx = -1; dx <= 1; ++dx) {
+            sum += tex2D<float>(tex, cx + dx * inv_w, cy + dy * inv_h);
+        }
+    }
+    float blurred = sum * (1.0f / 9.0f);
+
+    float out = blurred * decay;
+    if (out < 0.0f) out = 0.0f;
+    if (out > 1.0f) out = 1.0f;
+
+    surf2Dwrite(out, surf, x * (int)sizeof(float), y);
+}
+
+// HSV -> RGB (all components in [0, 1]). Standard six-sector conversion; used
+// by colorize to turn the local trail-gradient direction into a hue.
+__device__ __forceinline__ void hsv2rgb(float h, float s, float v,
+                                        float* r, float* g, float* b) {
+    h -= floorf(h);          // wrap hue into [0, 1)
+    float hp = h * 6.0f;
+    int sector = (int)hp;
+    float f = hp - (float)sector;
+    float p = v * (1.0f - s);
+    float q = v * (1.0f - s * f);
+    float t = v * (1.0f - s * (1.0f - f));
+    switch (sector % 6) {
+        case 0:  *r = v; *g = t; *b = p; break;
+        case 1:  *r = q; *g = v; *b = p; break;
+        case 2:  *r = p; *g = v; *b = t; break;
+        case 3:  *r = p; *g = q; *b = v; break;
+        case 4:  *r = t; *g = p; *b = v; break;
+        default: *r = v; *g = p; *b = q; break;
+    }
+}
+
+extern "C"
+__global__
+void colorize(cudaTextureObject_t tex,
+              unsigned char* output,
+              int width, int height) {
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    if (x >= width || y >= height) return;
+
+    float inv_w = 1.0f / (float)width;
+    float inv_h = 1.0f / (float)height;
+    float cx = (x + 0.5f) * inv_w;
+    float cy = (y + 0.5f) * inv_h;
+
+    float v = tex2D<float>(tex, cx, cy);
+    if (v < 0.0f) v = 0.0f;
+    if (v > 1.0f) v = 1.0f;
+
+    // Local trail gradient from LINEAR+WRAP neighbor taps (toroidal, no edge
+    // special-casing). Its direction sets the HUE so the network is colored by
+    // the orientation of the veins instead of a single intensity ramp.
+    float l = tex2D<float>(tex, cx - inv_w, cy);
+    float rgt = tex2D<float>(tex, cx + inv_w, cy);
+    float dn = tex2D<float>(tex, cx, cy - inv_h);
+    float up = tex2D<float>(tex, cx, cy + inv_h);
+    float gx = rgt - l;
+    float gy = up - dn;
+    float hue = atan2f(gy, gx) * (0.1591549f) + 0.5f;  // atan2/(2*pi) + 0.5 -> [0,1)
+
+    // Soft glow/bloom: a wider ring of taps lifts a luminous halo around the
+    // veins so they read as glowing rather than flat. Still WRAP-sampled.
+    float bloom = 0.0f;
+    bloom += tex2D<float>(tex, cx - 2.0f * inv_w, cy);
+    bloom += tex2D<float>(tex, cx + 2.0f * inv_w, cy);
+    bloom += tex2D<float>(tex, cx, cy - 2.0f * inv_h);
+    bloom += tex2D<float>(tex, cx, cy + 2.0f * inv_h);
+    bloom += l + rgt + dn + up;
+    bloom *= 0.125f;  // average of the 8 surrounding taps
+
+    // Intensity stays the dominant brightness driver so the reticular structure
+    // survives; gradient magnitude sharpens ridges into bright luminous veins.
+    float grad_mag = sqrtf(gx * gx + gy * gy);
+    float ridge = grad_mag * 6.0f;
+    if (ridge > 1.0f) ridge = 1.0f;
+
+    // Saturation eases toward white on the brightest ridges (neon -> white-hot).
+    float sat = 1.0f - 0.45f * v;
+
+    // Brightness: core intensity (gamma-lifted) + ridge boost + bloom halo.
+    float val = sqrtf(v) + 0.55f * ridge + 0.45f * bloom;
+    if (val > 1.0f) val = 1.0f;
+
+    float r, g, b;
+    hsv2rgb(hue, sat, val, &r, &g, &b);
+
+    // Lift the floor toward a deep blue-violet so empty space is not pure black,
+    // giving the glow something to bleed into.
+    r += 0.02f;
+    g += 0.0f;
+    b += 0.06f;
+    if (r > 1.0f) r = 1.0f;
+    if (g > 1.0f) g = 1.0f;
+    if (b > 1.0f) b = 1.0f;
+
+    int idx = (y * width + x) * 4;
+    output[idx + 0] = (unsigned char)(r * 255.0f);
+    output[idx + 1] = (unsigned char)(g * 255.0f);
+    output[idx + 2] = (unsigned char)(b * 255.0f);
+    output[idx + 3] = 255;
+}
+"""
+
+# GLSL shaders -- these just display a texture on a fullscreen rectangle.
+# Nothing CUDA-specific here.
+
+VERTEX_SHADER_SOURCE = """#version 330 core
+in vec2 position;
+in vec2 texcoord;
+out vec2 v_texcoord;
+void main() {
+    gl_Position = vec4(position, 0.0, 1.0);
+    v_texcoord = texcoord;
+}
+"""
+
+FRAGMENT_SHADER_SOURCE = """#version 330 core
+in vec2 v_texcoord;
+out vec4 fragColor;
+uniform sampler2D tex;
+void main() {
+    fragColor = texture(tex, v_texcoord);
+}
+"""
+
+
+if __name__ == "__main__":
+    main()
diff --git a/cuda_core/tests/example_tests/test_basic_examples.py b/cuda_core/tests/example_tests/test_basic_examples.py
index d385c26dd6c..75910b87894 100644
--- a/cuda_core/tests/example_tests/test_basic_examples.py
+++ b/cuda_core/tests/example_tests/test_basic_examples.py
@@ -83,12 +83,19 @@ def has_recent_memory_pool_support() -> bool:
 SYSTEM_REQUIREMENTS = {
     "memory_pool_resources.py": has_recent_memory_pool_support,
     "gl_interop_plasma.py": has_display,
+    "gl_interop_bloom.py": has_display,
+    "gl_interop_caustics.py": has_display,
+    "gl_interop_clouds.py": has_display,
     "gl_interop_fire.py": has_display,
+    "gl_interop_fluid.py": has_display,
     "gl_interop_image_show.py": has_display,
+    "gl_interop_jfa_voronoi.py": has_display,
     "gl_interop_lenia.py": has_display,
     "gl_interop_mandelbrot.py": has_display,
     "gl_interop_mipmap_lod.py": has_display,
     "gl_interop_ocean.py": has_display,
+    "gl_interop_particles.py": has_display,
+    "gl_interop_physarum.py": has_display,
     "gl_interop_reaction_diffusion.py": has_display,
     "gl_interop_sdf_volume.py": has_display,
     "gl_interop_texture_filter.py": has_display,

From 088115b886ca8f8920899fecf1ceda41109e0843 Mon Sep 17 00:00:00 2001
From: Rob Parolin <rparolin@nvidia.com>
Date: Thu, 11 Jun 2026 14:57:47 -0700
Subject: [PATCH 15/17] cuda.core: dedup texture/array validation; fix
 docstring + address_mode type

- Extract shared _validate_format_channels / _validate_array_shape helpers in
  _array.pyx; adopt them in CUDAArray, MipmappedArray, and the texture
  from_linear/from_pitch2d factories (removes 4x num_channels, 4x format, and
  2x shape duplicate validators).
- ResourceDescriptor docstring now lists from_mipmapped_array (was 3 of 4
  factories).
- TextureDescriptor.address_mode annotated AddressMode | tuple[AddressMode, ...]
  instead of object (CLAUDE.md: avoid Any).
- Regenerate .pyi stubs.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 cuda_core/cuda/core/_array.pyi           | 10 +++++-
 cuda_core/cuda/core/_array.pyx           | 40 +++++++++++++++---------
 cuda_core/cuda/core/_mipmapped_array.pyx | 18 ++---------
 cuda_core/cuda/core/_texture.pyi         |  4 ++-
 cuda_core/cuda/core/_texture.pyx         | 16 ++++------
 5 files changed, 47 insertions(+), 41 deletions(-)

diff --git a/cuda_core/cuda/core/_array.pyi b/cuda_core/cuda/core/_array.pyi
index 7dcf4ad5a0a..61ec023a4b0 100644
--- a/cuda_core/cuda/core/_array.pyi
+++ b/cuda_core/cuda/core/_array.pyi
@@ -145,4 +145,12 @@ class CUDAArray:
 
     def __repr__(self):
         ...
-_FORMAT_ELEM_SIZE = {int(ArrayFormat.UINT8): 1, int(ArrayFormat.INT8): 1, int(ArrayFormat.UINT16): 2, int(ArrayFormat.INT16): 2, int(ArrayFormat.FLOAT16): 2, int(ArrayFormat.UINT32): 4, int(ArrayFormat.INT32): 4, int(ArrayFormat.FLOAT32): 4}
\ No newline at end of file
+_FORMAT_ELEM_SIZE = {int(ArrayFormat.UINT8): 1, int(ArrayFormat.INT8): 1, int(ArrayFormat.UINT16): 2, int(ArrayFormat.INT16): 2, int(ArrayFormat.FLOAT16): 2, int(ArrayFormat.UINT32): 4, int(ArrayFormat.INT32): 4, int(ArrayFormat.FLOAT32): 4}
+
+def _validate_format_channels(format, num_channels):
+    """Validate the ``(format, num_channels)`` pair shared by the array,
+    mipmap, and texture factories. Raises on an invalid combination."""
+
+def _validate_array_shape(shape):
+    """Coerce ``shape`` to a tuple of ints and validate rank (1-3) and that
+    every extent is >= 1. Returns the normalized tuple."""
\ No newline at end of file
diff --git a/cuda_core/cuda/core/_array.pyx b/cuda_core/cuda/core/_array.pyx
index 851f8cb9bf0..a669962572a 100644
--- a/cuda_core/cuda/core/_array.pyx
+++ b/cuda_core/cuda/core/_array.pyx
@@ -48,6 +48,30 @@ _FORMAT_ELEM_SIZE = {
 }
 
 
+def _validate_format_channels(format, num_channels):
+    """Validate the ``(format, num_channels)`` pair shared by the array,
+    mipmap, and texture factories. Raises on an invalid combination."""
+    if not isinstance(format, ArrayFormat):
+        raise TypeError(f"format must be an ArrayFormat, got {type(format).__name__}")
+    if isinstance(num_channels, bool) or num_channels not in (1, 2, 4):
+        raise ValueError(f"num_channels must be 1, 2, or 4, got {num_channels!r}")
+
+
+def _validate_array_shape(shape):
+    """Coerce ``shape`` to a tuple of ints and validate rank (1-3) and that
+    every extent is >= 1. Returns the normalized tuple."""
+    try:
+        shape_t = tuple(int(s) for s in shape)
+    except TypeError as e:
+        raise TypeError(f"shape must be a tuple of ints, got {type(shape).__name__}") from e
+    if not 1 <= len(shape_t) <= 3:
+        raise ValueError(f"shape rank must be 1, 2, or 3, got {len(shape_t)}")
+    for i, dim in enumerate(shape_t):
+        if dim < 1:
+            raise ValueError(f"shape[{i}] must be >= 1, got {dim}")
+    return shape_t
+
+
 cdef void _fill_array_endpoint(
     cydriver.CUDA_MEMCPY3D* p, CUDAArray arr, bint is_src
 ) noexcept:
@@ -238,20 +262,8 @@ cdef class CUDAArray:
         -------
         CUDAArray
         """
-        if not isinstance(format, ArrayFormat):
-            raise TypeError(f"format must be an ArrayFormat, got {type(format).__name__}")
-        if isinstance(num_channels, bool) or num_channels not in (1, 2, 4):
-            raise ValueError(f"num_channels must be 1, 2, or 4, got {num_channels!r}")
-
-        try:
-            shape_t = tuple(int(s) for s in shape)
-        except TypeError as e:
-            raise TypeError(f"shape must be a tuple of ints, got {type(shape).__name__}") from e
-        if not 1 <= len(shape_t) <= 3:
-            raise ValueError(f"shape rank must be 1, 2, or 3, got {len(shape_t)}")
-        for i, dim in enumerate(shape_t):
-            if dim < 1:
-                raise ValueError(f"shape[{i}] must be >= 1, got {dim}")
+        _validate_format_channels(format, num_channels)
+        shape_t = _validate_array_shape(shape)
 
         cdef CUDAArray self = cls.__new__(cls)
         self._owning = True
diff --git a/cuda_core/cuda/core/_mipmapped_array.pyx b/cuda_core/cuda/core/_mipmapped_array.pyx
index cbf6d70732c..f3108852d75 100644
--- a/cuda_core/cuda/core/_mipmapped_array.pyx
+++ b/cuda_core/cuda/core/_mipmapped_array.pyx
@@ -9,7 +9,7 @@ from libc.string cimport memset
 
 from cuda.bindings cimport cydriver
 from cuda.core._array cimport CUDAArray
-from cuda.core._array import ArrayFormat
+from cuda.core._array import ArrayFormat, _validate_array_shape, _validate_format_channels
 from cuda.core._utils.cuda_utils cimport (
     HANDLE_RETURN,
     _get_current_context_ptr,
@@ -64,20 +64,8 @@ cdef class MipmappedArray:
         -------
         MipmappedArray
         """
-        if not isinstance(format, ArrayFormat):
-            raise TypeError(f"format must be an ArrayFormat, got {type(format).__name__}")
-        if isinstance(num_channels, bool) or num_channels not in (1, 2, 4):
-            raise ValueError(f"num_channels must be 1, 2, or 4, got {num_channels!r}")
-
-        try:
-            shape_t = tuple(int(s) for s in shape)
-        except TypeError as e:
-            raise TypeError(f"shape must be a tuple of ints, got {type(shape).__name__}") from e
-        if not 1 <= len(shape_t) <= 3:
-            raise ValueError(f"shape rank must be 1, 2, or 3, got {len(shape_t)}")
-        for i, dim in enumerate(shape_t):
-            if dim < 1:
-                raise ValueError(f"shape[{i}] must be >= 1, got {dim}")
+        _validate_format_channels(format, num_channels)
+        shape_t = _validate_array_shape(shape)
 
         levels = int(num_levels)
         if levels < 1:
diff --git a/cuda_core/cuda/core/_texture.pyi b/cuda_core/cuda/core/_texture.pyi
index 4f8543a00d0..132a40273c3 100644
--- a/cuda_core/cuda/core/_texture.pyi
+++ b/cuda_core/cuda/core/_texture.pyi
@@ -39,6 +39,8 @@ class ResourceDescriptor:
 
     - :meth:`from_array` wraps a :class:`CUDAArray` (works for both
       :class:`TextureObject` and :class:`SurfaceObject`).
+    - :meth:`from_mipmapped_array` wraps a :class:`MipmappedArray` for mipmapped
+      sampling (texture only, not surface).
     - :meth:`from_linear` wraps a :class:`Buffer` as a typed 1D fetch. Texture
       objects built from a linear resource do not support filtering,
       normalized coordinates, or addressing modes.
@@ -181,7 +183,7 @@ class TextureDescriptor:
         4-tuple used when ``address_mode`` includes ``BORDER``; ``None`` means
         zero.
     """
-    address_mode: object = AddressMode.CLAMP
+    address_mode: AddressMode | tuple[AddressMode, ...] = AddressMode.CLAMP
     filter_mode: FilterMode = FilterMode.POINT
     read_mode: ReadMode = ReadMode.ELEMENT_TYPE
     normalized_coords: bool = False
diff --git a/cuda_core/cuda/core/_texture.pyx b/cuda_core/cuda/core/_texture.pyx
index 7b1e7301c98..1c24f694145 100644
--- a/cuda_core/cuda/core/_texture.pyx
+++ b/cuda_core/cuda/core/_texture.pyx
@@ -9,7 +9,7 @@ from libc.string cimport memset
 
 from cuda.bindings cimport cydriver
 from cuda.core._array cimport CUDAArray
-from cuda.core._array import ArrayFormat, _FORMAT_ELEM_SIZE
+from cuda.core._array import ArrayFormat, _FORMAT_ELEM_SIZE, _validate_format_channels
 from cuda.core._memory._buffer cimport Buffer
 from cuda.core._mipmapped_array cimport MipmappedArray
 from cuda.core._mipmapped_array import MipmappedArray as _PyMipmappedArray
@@ -65,6 +65,8 @@ class ResourceDescriptor:
 
     - :meth:`from_array` wraps a :class:`CUDAArray` (works for both
       :class:`TextureObject` and :class:`SurfaceObject`).
+    - :meth:`from_mipmapped_array` wraps a :class:`MipmappedArray` for mipmapped
+      sampling (texture only, not surface).
     - :meth:`from_linear` wraps a :class:`Buffer` as a typed 1D fetch. Texture
       objects built from a linear resource do not support filtering,
       normalized coordinates, or addressing modes.
@@ -154,10 +156,7 @@ class ResourceDescriptor:
         """
         if not isinstance(buffer, Buffer):
             raise TypeError(f"buffer must be a Buffer, got {type(buffer).__name__}")
-        if not isinstance(format, ArrayFormat):
-            raise TypeError(f"format must be an ArrayFormat, got {type(format).__name__}")
-        if isinstance(num_channels, bool) or num_channels not in (1, 2, 4):
-            raise ValueError(f"num_channels must be 1, 2, or 4, got {num_channels!r}")
+        _validate_format_channels(format, num_channels)
 
         buf_size = int(buffer.size)
         elem = _FORMAT_ELEM_SIZE[int(format)] * int(num_channels)
@@ -216,10 +215,7 @@ class ResourceDescriptor:
         """
         if not isinstance(buffer, Buffer):
             raise TypeError(f"buffer must be a Buffer, got {type(buffer).__name__}")
-        if not isinstance(format, ArrayFormat):
-            raise TypeError(f"format must be an ArrayFormat, got {type(format).__name__}")
-        if isinstance(num_channels, bool) or num_channels not in (1, 2, 4):
-            raise ValueError(f"num_channels must be 1, 2, or 4, got {num_channels!r}")
+        _validate_format_channels(format, num_channels)
 
         w = int(width)
         h = int(height)
@@ -337,7 +333,7 @@ class TextureDescriptor:
         zero.
     """
 
-    address_mode: object = AddressMode.CLAMP
+    address_mode: AddressMode | tuple[AddressMode, ...] = AddressMode.CLAMP
     filter_mode: FilterMode = FilterMode.POINT
     read_mode: ReadMode = ReadMode.ELEMENT_TYPE
     normalized_coords: bool = False

From cf9441cc677c8f02234caed988f40d8ff4aa37d0 Mon Sep 17 00:00:00 2001
From: Rob Parolin <rparolin@nvidia.com>
Date: Thu, 11 Jun 2026 15:06:50 -0700
Subject: [PATCH 16/17] cuda.core: remove dead _context field and orphaned
 helper

The _context field (raw intptr_t) was captured in CUDAArray, MipmappedArray,
SurfaceObject, and TextureObject constructors but never read. It could not be
safely used to gate destruction (it is not a refcounted context-handle ref like
Stream._h_context), so it was pure dead state plus an extra cuCtxGetCurrent per
construction.

- Drop the _context slot from all four .pxd files (and the now-unused intptr_t
  cimports).
- Drop the self._context assignments and the _get_current_context_ptr cimport
  from all four .pyx files.
- Remove the now-orphaned _get_current_context_ptr helper from cuda_utils
  (.pyx + .pxd); _tensor_map.pyx keeps its own local copy and is unaffected.
  _get_current_device_id stays (still used for the .device property).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 cuda_core/cuda/core/_array.pxd            |  2 --
 cuda_core/cuda/core/_array.pyx            |  3 ---
 cuda_core/cuda/core/_mipmapped_array.pxd  |  2 --
 cuda_core/cuda/core/_mipmapped_array.pyx  |  2 --
 cuda_core/cuda/core/_surface.pxd          |  2 --
 cuda_core/cuda/core/_surface.pyx          |  2 --
 cuda_core/cuda/core/_texture.pxd          |  2 --
 cuda_core/cuda/core/_texture.pyx          |  2 --
 cuda_core/cuda/core/_utils/cuda_utils.pxd |  7 +++----
 cuda_core/cuda/core/_utils/cuda_utils.pyx | 13 -------------
 10 files changed, 3 insertions(+), 34 deletions(-)

diff --git a/cuda_core/cuda/core/_array.pxd b/cuda_core/cuda/core/_array.pxd
index 25069a81eb9..461204e7f56 100644
--- a/cuda_core/cuda/core/_array.pxd
+++ b/cuda_core/cuda/core/_array.pxd
@@ -2,7 +2,6 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-from libc.stdint cimport intptr_t
 from cuda.bindings cimport cydriver
 
 
@@ -14,7 +13,6 @@ cdef class CUDAArray:
         cydriver.CUarray_format _format
         unsigned int _num_channels   # 1, 2, or 4
         int _device_id
-        intptr_t _context
         bint _owning
         bint _surface_load_store
         # Optional strong reference to a parent owner (e.g. a MipmappedArray
diff --git a/cuda_core/cuda/core/_array.pyx b/cuda_core/cuda/core/_array.pyx
index a669962572a..66420ffc471 100644
--- a/cuda_core/cuda/core/_array.pyx
+++ b/cuda_core/cuda/core/_array.pyx
@@ -13,7 +13,6 @@ from cuda.core._memory._buffer cimport Buffer
 from cuda.core._stream cimport Stream
 from cuda.core._utils.cuda_utils cimport (
     HANDLE_RETURN,
-    _get_current_context_ptr,
     _get_current_device_id,
 )
 
@@ -271,7 +270,6 @@ cdef class CUDAArray:
         self._format = <cydriver.CUarray_format><int>format
         self._num_channels = num_channels
         self._surface_load_store = bool(is_surface_load_store)
-        self._context = _get_current_context_ptr()
         self._device_id = _get_current_device_id()
         self._parent_ref = None
 
@@ -318,7 +316,6 @@ cdef class CUDAArray:
         cdef CUDAArray self = cls.__new__(cls)
         self._handle = <cydriver.CUarray><void*>handle
         self._owning = owning
-        self._context = _get_current_context_ptr()
         self._device_id = _get_current_device_id() if device_id is None else int(device_id)
         self._parent_ref = None
 
diff --git a/cuda_core/cuda/core/_mipmapped_array.pxd b/cuda_core/cuda/core/_mipmapped_array.pxd
index 52afc1968cc..4feebd10c79 100644
--- a/cuda_core/cuda/core/_mipmapped_array.pxd
+++ b/cuda_core/cuda/core/_mipmapped_array.pxd
@@ -2,7 +2,6 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-from libc.stdint cimport intptr_t
 from cuda.bindings cimport cydriver
 
 
@@ -15,7 +14,6 @@ cdef class MipmappedArray:
         unsigned int _num_channels   # 1, 2, or 4
         unsigned int _num_levels
         int _device_id
-        intptr_t _context
         bint _owning
         bint _surface_load_store
 
diff --git a/cuda_core/cuda/core/_mipmapped_array.pyx b/cuda_core/cuda/core/_mipmapped_array.pyx
index f3108852d75..a7ecd29b9d1 100644
--- a/cuda_core/cuda/core/_mipmapped_array.pyx
+++ b/cuda_core/cuda/core/_mipmapped_array.pyx
@@ -12,7 +12,6 @@ from cuda.core._array cimport CUDAArray
 from cuda.core._array import ArrayFormat, _validate_array_shape, _validate_format_channels
 from cuda.core._utils.cuda_utils cimport (
     HANDLE_RETURN,
-    _get_current_context_ptr,
     _get_current_device_id,
 )
 
@@ -78,7 +77,6 @@ cdef class MipmappedArray:
         self._num_channels = num_channels
         self._num_levels = <unsigned int>levels
         self._surface_load_store = bool(is_surface_load_store)
-        self._context = _get_current_context_ptr()
         self._device_id = _get_current_device_id()
 
         cdef cydriver.CUarray_format c_format = <cydriver.CUarray_format><int>format
diff --git a/cuda_core/cuda/core/_surface.pxd b/cuda_core/cuda/core/_surface.pxd
index dd8548e0a36..13a075eb4a3 100644
--- a/cuda_core/cuda/core/_surface.pxd
+++ b/cuda_core/cuda/core/_surface.pxd
@@ -2,7 +2,6 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-from libc.stdint cimport intptr_t
 from cuda.bindings cimport cydriver
 
 
@@ -12,6 +11,5 @@ cdef class SurfaceObject:
         cydriver.CUsurfObject _handle
         object _source_ref      # keep backing CUDAArray alive
         int _device_id
-        intptr_t _context
 
     cpdef close(self)
diff --git a/cuda_core/cuda/core/_surface.pyx b/cuda_core/cuda/core/_surface.pyx
index 383f99f0218..87e80e99ef0 100644
--- a/cuda_core/cuda/core/_surface.pyx
+++ b/cuda_core/cuda/core/_surface.pyx
@@ -12,7 +12,6 @@ from cuda.core._array cimport CUDAArray
 from cuda.core._texture import ResourceDescriptor
 from cuda.core._utils.cuda_utils cimport (
     HANDLE_RETURN,
-    _get_current_context_ptr,
     _get_current_device_id,
 )
 
@@ -84,7 +83,6 @@ cdef class SurfaceObject:
 
         cdef SurfaceObject self = cls.__new__(cls)
         self._source_ref = resource
-        self._context = _get_current_context_ptr()
         self._device_id = _get_current_device_id()
 
         with nogil:
diff --git a/cuda_core/cuda/core/_texture.pxd b/cuda_core/cuda/core/_texture.pxd
index 40725cfe40d..5a1fd84b9ad 100644
--- a/cuda_core/cuda/core/_texture.pxd
+++ b/cuda_core/cuda/core/_texture.pxd
@@ -2,7 +2,6 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-from libc.stdint cimport intptr_t
 from cuda.bindings cimport cydriver
 
 
@@ -13,6 +12,5 @@ cdef class TextureObject:
         object _source_ref      # keep backing CUDAArray (or other resource) alive
         object _texture_desc    # original TextureDescriptor for introspection
         int _device_id
-        intptr_t _context
 
     cpdef close(self)
diff --git a/cuda_core/cuda/core/_texture.pyx b/cuda_core/cuda/core/_texture.pyx
index 1c24f694145..6ea8ad805ad 100644
--- a/cuda_core/cuda/core/_texture.pyx
+++ b/cuda_core/cuda/core/_texture.pyx
@@ -15,7 +15,6 @@ from cuda.core._mipmapped_array cimport MipmappedArray
 from cuda.core._mipmapped_array import MipmappedArray as _PyMipmappedArray
 from cuda.core._utils.cuda_utils cimport (
     HANDLE_RETURN,
-    _get_current_context_ptr,
     _get_current_device_id,
 )
 
@@ -514,7 +513,6 @@ cdef class TextureObject:
         cdef TextureObject self = cls.__new__(cls)
         self._source_ref = resource
         self._texture_desc = texture_descriptor
-        self._context = _get_current_context_ptr()
         self._device_id = _get_current_device_id()
 
         with nogil:
diff --git a/cuda_core/cuda/core/_utils/cuda_utils.pxd b/cuda_core/cuda/core/_utils/cuda_utils.pxd
index a8115aaf3f9..11e464e6381 100644
--- a/cuda_core/cuda/core/_utils/cuda_utils.pxd
+++ b/cuda_core/cuda/core/_utils/cuda_utils.pxd
@@ -4,7 +4,7 @@
 
 cimport cpython
 from cpython.object cimport PyObject
-from libc.stdint cimport int64_t, int32_t, intptr_t, uint8_t, uint16_t, uint32_t
+from libc.stdint cimport int64_t, int32_t, uint8_t, uint16_t, uint32_t
 
 from cuda.bindings cimport cydriver, cynvrtc, cynvvm, cynvjitlink
 
@@ -25,9 +25,8 @@ cdef int HANDLE_RETURN_NVJITLINK(
     cynvjitlink.nvJitLinkHandle handle, cynvjitlink.nvJitLinkResult err) except?-1 nogil
 
 
-# Helpers for retrieving the current CUDA context and device. Raise if no
-# active context is bound to the calling thread.
-cdef intptr_t _get_current_context_ptr() except? 0
+# Helper for retrieving the current CUDA device. Raises if no active context
+# is bound to the calling thread.
 cdef int _get_current_device_id() except? -1
 
 
diff --git a/cuda_core/cuda/core/_utils/cuda_utils.pyx b/cuda_core/cuda/core/_utils/cuda_utils.pyx
index f39f9a252b5..318d4466bee 100644
--- a/cuda_core/cuda/core/_utils/cuda_utils.pyx
+++ b/cuda_core/cuda/core/_utils/cuda_utils.pyx
@@ -69,19 +69,6 @@ cdef int HANDLE_RETURN(cydriver.CUresult err) except?-1 nogil:
     return 0
 
 
-cdef intptr_t _get_current_context_ptr() except? 0:
-    """Return the current thread's bound CUcontext as an intptr_t.
-
-    Raises ``RuntimeError`` if no context is current.
-    """
-    cdef cydriver.CUcontext ctx
-    with nogil:
-        HANDLE_RETURN(cydriver.cuCtxGetCurrent(&ctx))
-    if ctx == NULL:
-        raise RuntimeError("an active CUDA context is required")
-    return <intptr_t>ctx
-
-
 cdef int _get_current_device_id() except? -1:
     """Return the current thread's bound CUdevice ordinal."""
     cdef cydriver.CUdevice dev

From d8c2db6369b275f178fc61c87d10d58a5f68ad7d Mon Sep 17 00:00:00 2001
From: Rob Parolin <rparolin@nvidia.com>
Date: Thu, 11 Jun 2026 15:27:22 -0700
Subject: [PATCH 17/17] examples: rework gl_interop_caustics (pool floor, Jacobian caustics)

---
 cuda_core/examples/gl_interop_caustics.py | 217 ++++++++++------------
 1 file changed, 101 insertions(+), 116 deletions(-)

diff --git a/cuda_core/examples/gl_interop_caustics.py b/cuda_core/examples/gl_interop_caustics.py
index 35de14394d7..5fe57e256f0 100644
--- a/cuda_core/examples/gl_interop_caustics.py
+++ b/cuda_core/examples/gl_interop_caustics.py
@@ -5,13 +5,13 @@
 # ################################################################################
 #
 # This example demonstrates cuda.core.CUDAArray, TextureObject, and
-# GraphicsResource for CUDA/OpenGL interop. A vivid procedural background image
-# is uploaded once into a 2D CUDAArray and bound as a TextureObject sampled with
+# GraphicsResource for CUDA/OpenGL interop. A tiled pool-floor image is uploaded
+# once into a 2D CUDAArray and bound as a TextureObject sampled with
 # FilterMode.LINEAR + AddressMode.MIRROR + normalized coordinates. Each frame a
 # `render_water` kernel evaluates an animated water surface analytically, refracts
-# the view ray through it to perturb the background lookup UVs, adds shimmering
-# caustic highlights, and writes RGBA8 straight into an OpenGL PBO. The effect is
-# "looking through a sunlit pool". Requires pyglet.
+# the background lookup UVs through it, and overlays a bright caustic network
+# computed from where the refraction focuses, writing RGBA8 straight into an
+# OpenGL PBO. The effect is "looking down at a sunlit pool". Requires pyglet.
 #
 # ################################################################################
 
@@ -58,10 +58,10 @@
 #        surface gradient) -- a cheap 2D approximation of bending the view ray.
 #     3. Sample the background TextureObject at the perturbed UV (LINEAR +
 #        MIRROR keeps it smooth and well-defined outside [0, 1]).
-#     4. Caustics: brightness focuses where wavefronts converge. Approximate
-#        with a sharpened power of the surface curvature (Laplacian), adding
-#        bright cyan/white highlights. Add a depth tint (deeper = bluer) and a
-#        specular sparkle from the normal versus a fixed light direction.
+#     4. Caustics: the refraction map (u,v)->(su,sv) focuses light where its
+#        Jacobian determinant approaches zero. We light a thin band around that
+#        det->0 curve to draw the bright, interconnected caustic network, then
+#        add a depth tint (deeper = bluer) and faint specular glints.
 #     5. Tonemap and write RGBA8 into the OpenGL PBO. No PCIe traffic per frame.
 #
 # Why MIRROR (not WRAP or CLAMP)?
@@ -74,11 +74,11 @@
 #
 # What you should see
 # ===================
-# A colorful tiled background rippling as if seen through moving water, with
-# bright caustic highlights skittering across it. Press +/- to change the
-# refraction/ripple strength, click anywhere to spawn an expanding circular
+# A tiled aqua pool floor seen through gently moving water, overlaid with a
+# bright, shifting network of caustic light filaments. Press +/- to change the
+# water/refraction strength, click anywhere to spawn an expanding circular
 # ripple at the cursor, and Escape to exit. The title shows FPS and the current
-# ripple strength.
+# strength.
 #
 
 # /// script
@@ -138,47 +138,42 @@
 
 
 def make_background_image(size):
-    """Build a (size, size, 4) uint8 RGBA background designed to show refraction.
+    """Build a (size, size, 4) uint8 RGBA swimming-pool floor: aqua tiles + grout.
 
     Layout convention: CUDAArray.from_descriptor takes shape=(WIDTH, HEIGHT), so
     the host buffer fed to copy_from must be H rows of W elements (row-major),
     i.e. host.shape == (HEIGHT, WIDTH, 4). Here the image is square so the two
     agree, but the (y, x) indexing below is the load-bearing part.
 
-    The pattern is deliberately vivid and high-frequency -- a grid of saturated
-    hues with concentric rings -- so even small refraction offsets are obvious.
+    A calm tiled pool floor (low-saturation aqua tiles with slightly darker
+    grout and gentle per-tile variation) is the right backdrop for caustics: it
+    gives the refraction something legible to warp without itself looking busy,
+    so the bright caustic network drawn on top reads as light on water rather
+    than a clash of colors.
     """
     ys, xs = np.mgrid[0:size, 0:size].astype(np.float32)
     u = xs / size
     v = ys / size
 
-    # Saturated, smoothly varying hues across the plane (a cheap HSV-ish wheel).
-    r = 0.5 + 0.5 * np.sin(u * 6.2831853 * 2.0 + 0.0)
-    g = 0.5 + 0.5 * np.sin(v * 6.2831853 * 2.0 + 2.0944)
-    b = 0.5 + 0.5 * np.sin((u + v) * 6.2831853 * 2.0 + 4.1888)
-
-    # Bright grid lines so the warp is legible.
-    cells = 8.0
-    gx = np.abs(((u * cells) % 1.0) - 0.5)
-    gy = np.abs(((v * cells) % 1.0) - 0.5)
-    grid = np.maximum(gx, gy)
-    grid_line = (grid > 0.42).astype(np.float32)
-    r = r * (1.0 - grid_line) + 1.0 * grid_line
-    g = g * (1.0 - grid_line) + 1.0 * grid_line
-    b = b * (1.0 - grid_line) + 1.0 * grid_line
-
-    # A couple of concentric rings centered on the image to add curvature cues.
-    cx, cy = 0.5, 0.5
-    dist = np.sqrt((u - cx) ** 2 + (v - cy) ** 2)
-    rings = 0.5 + 0.5 * np.sin(dist * 6.2831853 * 10.0)
-    r = np.clip(r * 0.75 + rings * 0.25, 0.0, 1.0)
-    g = np.clip(g * 0.75 + rings * 0.20, 0.0, 1.0)
-    b = np.clip(b * 0.85 + rings * 0.15, 0.0, 1.0)
-
+    cells = 6.0
+    # Normalized distance from each tile's center (0 at center, 1 at the grout line).
+    ex = np.abs(((u * cells) % 1.0) - 0.5) * 2.0
+    ey = np.abs(((v * cells) % 1.0) - 0.5) * 2.0
+    edge = np.maximum(ex, ey)
+    grout = np.clip((edge - 0.82) / 0.18, 0.0, 1.0)  # smooth grout band
+
+    # Subtle per-tile brightness variation (cheap hash on the tile index).
+    ti = np.floor(u * cells) + np.floor(v * cells) * 31.0
+    var = (np.sin(ti * 12.9898) * 43758.5453) % 1.0
+    shade = 0.92 + 0.08 * var
+
+    # Aqua tile body and a darker teal grout, blended by the grout band.
+    tile = np.array([0.30, 0.66, 0.74], dtype=np.float32)
+    mortar = np.array([0.12, 0.34, 0.42], dtype=np.float32)
     img = np.zeros((size, size, 4), dtype=np.uint8)
-    img[:, :, 0] = (r * 255.0).astype(np.uint8)
-    img[:, :, 1] = (g * 255.0).astype(np.uint8)
-    img[:, :, 2] = (b * 255.0).astype(np.uint8)
+    for c in range(3):
+        col = (tile[c] * shade) * (1.0 - grout) + mortar[c] * grout
+        img[:, :, c] = (np.clip(col, 0.0, 1.0) * 255.0).astype(np.uint8)
     img[:, :, 3] = 255
     return img
 
@@ -610,93 +605,83 @@ def on_close():
     float u = (x + 0.5f) / (float)width;
     float v = 1.0f - (y + 0.5f) / (float)height;
 
-    // Sample the water height field on a small stencil to get the surface
-    // gradient (slope -> refraction) and Laplacian (curvature -> caustics).
+    // Sample the water height field on a 3x3 stencil to get the surface
+    // gradient (slope -> refraction) and the full Hessian (the second
+    // derivatives that drive the caustic network).
     const float eps = 1.5f / (float)width;
     float hc = water_height(u, v, t, rip_x, rip_y, rip_age, ripple_lifetime);
     float hl = water_height(u - eps, v, t, rip_x, rip_y, rip_age, ripple_lifetime);
     float hr = water_height(u + eps, v, t, rip_x, rip_y, rip_age, ripple_lifetime);
     float hd = water_height(u, v - eps, t, rip_x, rip_y, rip_age, ripple_lifetime);
     float hu = water_height(u, v + eps, t, rip_x, rip_y, rip_age, ripple_lifetime);
-
-    float gx = (hr - hl) / (2.0f * eps);   // d(height)/du
-    float gy = (hu - hd) / (2.0f * eps);   // d(height)/dv
-    // Discrete Laplacian (curvature). Divide by eps^2 so it is a true second
-    // derivative -- without this the finite-difference sum is ~Laplacian*eps^2
-    // (tiny), and the caustic term below would collapse to zero.
-    float lap = (hl + hr + hd + hu - 4.0f * hc) / (eps * eps);
-
-    // 2D refraction approximation: bend the background lookup by the surface
-    // slope, scaled by the user `strength`. Small factor keeps it gentle.
-    float refract = 0.015f * strength;
+    float hlu = water_height(u - eps, v + eps, t, rip_x, rip_y, rip_age, ripple_lifetime);
+    float hru = water_height(u + eps, v + eps, t, rip_x, rip_y, rip_age, ripple_lifetime);
+    float hld = water_height(u - eps, v - eps, t, rip_x, rip_y, rip_age, ripple_lifetime);
+    float hrd = water_height(u + eps, v - eps, t, rip_x, rip_y, rip_age, ripple_lifetime);
+
+    float inv2e = 1.0f / (2.0f * eps);
+    float inve2 = 1.0f / (eps * eps);
+    float gx = (hr - hl) * inv2e;            // d(height)/du
+    float gy = (hu - hd) * inv2e;            // d(height)/dv
+    float hxx = (hr - 2.0f * hc + hl) * inve2;
+    float hyy = (hu - 2.0f * hc + hd) * inve2;
+    float hxy = (hru - hrd - hlu + hld) * (0.25f * inve2);
+
+    // 2D refraction: bend the background lookup by the surface slope, kept
+    // small so the pool floor warps gently instead of tearing apart. Because
+    // the texture was bound with srgb=True the sample is already in LINEAR
+    // light, so the lighting/tonemap below is physically sensible and we only
+    // re-encode to sRGB at the very end. MIRROR keeps (su, sv) outside [0,1]
+    // smooth instead of a clamped streak or a wrap seam.
+    float refract = 0.010f * strength;
     float su = u - refract * gx;
     float sv = v - refract * gy;
-
-    // Sample the background. LINEAR + MIRROR + normalized coords means the
-    // perturbed (su, sv) can leave [0, 1] and still return a smooth, mirrored
-    // pixel rather than a clamped streak or a hard seam. Because the texture was
-    // bound with srgb=True, each channel is already decoded to LINEAR light
-    // here -- so all the lighting/tonemap math below is physically sensible and
-    // we only re-encode to sRGB at the very end.
-    //
-    // Chromatic dispersion: water bends short (blue) wavelengths more than long
-    // (red) ones, so we sample R/G/B at slightly different refraction offsets.
-    // This gives caustic edges and warped grid lines faint rainbow fringes.
-    float disp = 0.30f * refract;                // dispersion spread, in UV
-    float base_r = tex2D<float4>(bg, su - disp * gx, sv - disp * gy).x;
-    float base_b = tex2D<float4>(bg, su + disp * gx, sv + disp * gy).z;
-    float4 base = tex2D<float4>(bg, su, sv);   // green keeps the unsplit UV
-    base.x = base_r;
-    base.z = base_b;
-
-    // Surface normal from the gradient (z component points out of the water).
-    float nx = -gx;
-    float ny = -gy;
-    float nz = 1.0f;
+    float4 base = tex2D<float4>(bg, su, sv);
+
+    // Caustics from the refraction map's area compression. The displacement
+    // (u,v) -> (su,sv) has Jacobian J = [[1 - r*hxx, -r*hxy], [-r*hxy,
+    // 1 - r*hyy]]. Where det(J) -> 0 neighbouring rays converge onto the same
+    // spot and light piles up; 1/|det| is the brightness of that focus. This
+    // is what produces the real, interconnected, animated caustic web -- not a
+    // generic glow. `rs` is a small lens strength tuned to the wave curvature.
+    float rs = 0.012f * (0.5f + 0.5f * strength);
+    float a = 1.0f - rs * hxx;
+    float dd = 1.0f - rs * hyy;
+    float bxy = rs * hxy;
+    float det = a * dd - bxy * bxy;
+    // The caustic is the thin CURVE where det -> 0 (rays focus to a line). We
+    // light up only a narrow band around it and square the ramp so the result
+    // is crisp bright filaments over the visible tiles, not broad foggy blobs.
+    // Two bands -- a tight bright core plus a fainter halo -- give the lines a
+    // little glow without fattening them.
+    float ad = fabsf(det);
+    float core = 1.0f - fminf(ad / 0.06f, 1.0f);
+    float halo = 1.0f - fminf(ad / 0.30f, 1.0f);
+    float caustic = core * core * 1.7f + halo * halo * 0.25f;
+    if (caustic > 2.0f) caustic = 2.0f;
+
+    // Surface normal from the gradient (z points out of the water).
+    float nx = -gx, ny = -gy, nz = 1.0f;
     float ninv = rsqrtf(nx * nx + ny * ny + nz * nz);
     nx *= ninv; ny *= ninv; nz *= ninv;
 
-    // Caustics: light focuses where the wavefront converges (negative
-    // curvature). Raise a sharpened function of the curvature to a power to get
-    // tight bright filaments, then add as a cyan/white highlight.
-    // The wave-sum Laplacian peaks around O(150-200), so this factor lands
-    // `focus` near O(1) at a converging wavefront.
-    float focus = -lap * 0.005f;
-    if (focus < 0.0f) focus = 0.0f;
-    float caustic = focus * focus * focus;       // sharpen into thin filaments
-    caustic *= (0.6f + 0.8f * strength);
-    if (caustic > 1.5f) caustic = 1.5f;
-
-    // Specular sparkle: normal vs a fixed light direction.
-    float lx = 0.4f, ly = 0.5f, lz = 0.768f;     // normalized-ish light dir
+    // Faint specular glints off the wavelets.
+    float lx = 0.3f, ly = 0.4f, lz = 0.866f;
     float spec = nx * lx + ny * ly + nz * lz;
     if (spec < 0.0f) spec = 0.0f;
-    spec = powf(spec, 48.0f);
-
-    // Animated light shafts / god-rays: angled bright bands that drift and
-    // breathe over time, as if sunlight were cutting down through the water.
-    // Built purely from (u, v, t) -- no extra launch args. The shafts are
-    // gated by the surface slope so they ripple with the waves and the water
-    // curvature concentrates them into bright filaments where the wavefront
-    // focuses, reinforcing the caustics.
-    float shaft_dir = u * 7.5f + v * 3.0f;       // angled across the screen
-    float shafts = 0.5f + 0.5f * sinf(shaft_dir + t * 0.7f + 1.5f * gx);
-    shafts *= 0.5f + 0.5f * sinf(shaft_dir * 0.37f - t * 0.4f);
-    shafts = powf(shafts, 3.0f);                 // crush into thin shafts
-    float godray = shafts * (0.18f + 0.45f * focus);
-
-    // Depth tint: deeper troughs read bluer/darker, crests slightly brighter.
-    float depth = 0.5f + 0.5f * hc;              // ~[0, 1]
-    float tint_r = 0.85f + 0.15f * depth;
-    float tint_g = 0.92f + 0.08f * depth;
-    float tint_b = 1.05f - 0.10f * depth;
-
-    // Composite in LINEAR light. Caustics get a faint warm/cool split and the
-    // god-rays a sunlit warm bias so the bright filaments read as light, not
-    // just blown-out white.
-    float cr = base.x * tint_r + caustic * 0.95f + spec * 0.9f + godray * 1.10f;
-    float cg = base.y * tint_g + caustic * 1.00f + spec * 0.9f + godray * 1.00f;
-    float cb = base.z * tint_b + caustic * 1.05f + spec * 1.0f + godray * 0.80f;
+    spec = powf(spec, 60.0f) * 0.5f;
+
+    // Water tint: a gentle blue-green cast, slightly deeper in the troughs.
+    float depth = 0.5f + 0.5f * hc;
+    float tint_r = 0.80f + 0.08f * depth;
+    float tint_g = 0.98f + 0.04f * depth;
+    float tint_b = 1.10f - 0.06f * depth;
+
+    // Composite in LINEAR light: tinted pool floor + the white caustic web
+    // (a touch cooler in blue so it reads as sunlight through water) + glints.
+    float cr = base.x * tint_r + caustic * 0.90f + spec;
+    float cg = base.y * tint_g + caustic * 0.97f + spec;
+    float cb = base.z * tint_b + caustic * 1.00f + spec;
 
     // Simple Reinhard tonemap so highlights roll off instead of clipping hard.
     cr = cr / (1.0f + cr);