From d55cd43e4d1a15f7ddd1d70e455b437bb6b90129 Mon Sep 17 00:00:00 2001
From: Sebastian Berg <sebastianb@nvidia.com>
Date: Tue, 2 Jun 2026 18:10:38 +0200
Subject: [PATCH 01/17] Fix some threading issues (some free-threading related)

This fixes a few threading issues, but we may want to discuss some
details still.
* The GraphNode cleanup order is an important fix. Another thread may
  end up with the same pointer (but new object) as soon as we clean it
  up.  So we have to remove it from the cache before cleaning it up.
* Use of atomics: I think this is needed, but for this one place
  an atomic seemed more reasonable.  (However, this is hard to test, and
  if it can fail at all then, IIUC, only on ARM.)
* The critical sections should be pretty safe.  I am not sure they
  will all ensure that the object is always the _identity_ but I am
  pretty sure it protects from worse races.
  (Testing did find this for MemPool.attributes, not others yet.
  Testing with thread-sanitizer might flush out some...)
* The split mutex: This is thread-unsafe.  But I am honestly not
  sure if that isn't just expected, or whether the mutex is good
  but it should also be safe from within CUDA.
* Use of `setdefault` cached pattern is largely just normalizing.  Without
  the `return dict.setdefault` a different instance may be returned on
  different threads (or a cache entry replaced).
  For the `cyGraphMemoryResource` that triggered a test with pytest-run-parallel
  although that doesn't mean it is problematic as such.
  `cuda-pathfinder` uses functools.cache, but usually for strings;
  the one we may want to look at is `load_nvidia_dynamic_lib`.

Signed-off-by: Sebastian Berg <sebastianb@nvidia.com>
---
 cuda_core/cuda/core/_device.pyx                 | 17 ++++++++++-------
 cuda_core/cuda/core/_device_resources.pxd       |  3 +++
 cuda_core/cuda/core/_device_resources.pyx       |  9 +++++----
 cuda_core/cuda/core/_memory/_buffer.pxd         | 15 ++++++++-------
 cuda_core/cuda/core/_memory/_buffer.pyx         | 11 ++++++-----
 .../core/_memory/_graph_memory_resource.pyx     | 14 +++++++++++---
 cuda_core/cuda/core/_memory/_memory_pool.pyx    |  2 ++
 cuda_core/cuda/core/_memoryview.pyx             |  6 +++++-
 cuda_core/cuda/core/_module.pyx                 |  6 +++++-
 cuda_core/cuda/core/graph/_graph_node.pyx       |  8 ++++----
 10 files changed, 59 insertions(+), 32 deletions(-)

diff --git a/cuda_core/cuda/core/_device.pyx b/cuda_core/cuda/core/_device.pyx
index da6972f3727..451ca25ddaa 100644
--- a/cuda_core/cuda/core/_device.pyx
+++ b/cuda_core/cuda/core/_device.pyx
@@ -85,9 +85,12 @@ cdef class DeviceProperties:
 
     cdef inline int _get_cached_attribute(self, attr, default=0) except? -2:
         """Retrieve the attribute value, using cache if applicable."""
-        if attr not in self._cache:
-            self._cache[attr] = self._get_attribute(attr, default)
-        return self._cache[attr]
+        cached = self._cache.get(attr)
+        if cached is not None:
+            return cached
+        cdef int value = self._get_attribute(attr, default)
+        self._cache[attr] = value  # setdefault not needed for ints
+        return value
 
     @property
     def max_threads_per_block(self) -> int:
@@ -1131,11 +1134,11 @@ class Device:
     def compute_capability(self) -> ComputeCapability:
         """Return a named tuple with 2 fields: major and minor."""
         cdef DeviceProperties prop = self.properties
-        if "compute_capability" in prop._cache:
-            return prop._cache["compute_capability"]
+        cached = prop._cache.get("compute_capability")
+        if cached is not None:
+            return cached
         cc = ComputeCapability(prop.compute_capability_major, prop.compute_capability_minor)
-        prop._cache["compute_capability"] = cc
-        return cc
+        return prop._cache.setdefault("compute_capability", cc)
 
     @property
     def arch(self) -> str:
diff --git a/cuda_core/cuda/core/_device_resources.pxd b/cuda_core/cuda/core/_device_resources.pxd
index d618c24cf10..98f91ab4733 100644
--- a/cuda_core/cuda/core/_device_resources.pxd
+++ b/cuda_core/cuda/core/_device_resources.pxd
@@ -2,6 +2,8 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
+cimport cython
+
 from cuda.bindings cimport cydriver
 from cuda.core._resource_handles cimport ContextHandle, GreenCtxHandle
 
@@ -15,6 +17,7 @@ cdef class SMResource:
         unsigned int _flags
         bint _is_usable
         object __weakref__
+        cython.pymutex _split_mutex
 
     @staticmethod
     cdef SMResource _from_dev_resource(cydriver.CUdevResource res, int device_id)
diff --git a/cuda_core/cuda/core/_device_resources.pyx b/cuda_core/cuda/core/_device_resources.pyx
index ecd9e00bf05..bafc462c936 100644
--- a/cuda_core/cuda/core/_device_resources.pyx
+++ b/cuda_core/cuda/core/_device_resources.pyx
@@ -498,10 +498,11 @@ cdef class SMResource:
         )
         _resolve_group_count(opts)
         _check_green_ctx_support()
-        if _can_use_structured_sm_split():
-            return _split_with_general_api(self, opts, dry_run)
-        # SplitByCount requires the same 12.4+ as green ctx support (already checked above)
-        return _split_with_count_api(self, opts, dry_run)
+        with self._split_mutex:
+            if _can_use_structured_sm_split():
+                return _split_with_general_api(self, opts, dry_run)
+            # SplitByCount requires the same 12.4+ as green ctx support (already checked above)
+            return _split_with_count_api(self, opts, dry_run)
 
 
 cdef class WorkqueueResource:
diff --git a/cuda_core/cuda/core/_memory/_buffer.pxd b/cuda_core/cuda/core/_memory/_buffer.pxd
index 98c4b50db31..f744e756579 100644
--- a/cuda_core/cuda/core/_memory/_buffer.pxd
+++ b/cuda_core/cuda/core/_memory/_buffer.pxd
@@ -3,6 +3,7 @@
 # SPDX-License-Identifier: Apache-2.0
 
 from libc.stdint cimport uintptr_t
+from libcpp.atomic cimport atomic as std_atomic, memory_order_acquire, memory_order_release
 
 from cuda.bindings cimport cydriver
 from cuda.core._resource_handles cimport DevicePtrHandle
@@ -18,13 +19,13 @@ cdef struct _MemAttrs:
 
 cdef class Buffer:
     cdef:
-        DevicePtrHandle _h_ptr
-        MemoryResource  _memory_resource
-        object          _ipc_data
-        object          _owner
-        _MemAttrs       _mem_attrs
-        bint            _mem_attrs_inited
-        object          __weakref__
+        DevicePtrHandle   _h_ptr
+        MemoryResource    _memory_resource
+        object            _ipc_data
+        object            _owner
+        _MemAttrs         _mem_attrs
+        std_atomic[bool]  _mem_attrs_inited
+        object            __weakref__
     cdef public:
         # Python code in _memory/_virtual_memory_resource.py needs to update
         # this value, though it is technically private.
diff --git a/cuda_core/cuda/core/_memory/_buffer.pyx b/cuda_core/cuda/core/_memory/_buffer.pyx
index 88f9054385a..d9cf350b358 100644
--- a/cuda_core/cuda/core/_memory/_buffer.pyx
+++ b/cuda_core/cuda/core/_memory/_buffer.pyx
@@ -96,7 +96,7 @@ cdef class Buffer:
         self._memory_resource = None
         self._ipc_data = None
         self._owner = None
-        self._mem_attrs_inited = False
+        self._mem_attrs_inited.store(False)
 
     def __init__(self, *args, **kwargs) -> None:
         raise RuntimeError("Buffer objects cannot be instantiated directly. "
@@ -126,7 +126,7 @@ cdef class Buffer:
         self._memory_resource = mr
         self._ipc_data = IPCDataForBuffer(ipc_descriptor, True) if ipc_descriptor is not None else None
         self._owner = owner
-        self._mem_attrs_inited = False
+        self._mem_attrs_inited.store(False)
         return self
 
     @staticmethod
@@ -191,6 +191,7 @@ cdef class Buffer:
         return _ipc.Buffer_from_ipc_descriptor(cls, mr, ipc_descriptor, stream)
 
     @property
+    @cython.critical_section
     def ipc_descriptor(self) -> IPCBufferDescriptor:
         """Descriptor for sharing this buffer with other processes."""
         if self._ipc_data is None:
@@ -447,9 +448,9 @@ cdef class Buffer:
 # ------------------------------
 cdef inline void _init_mem_attrs(Buffer self):
     """Initialize memory attributes by querying the pointer."""
-    if not self._mem_attrs_inited:
+    if not self._mem_attrs_inited.load(memory_order_acquire):
         _query_memory_attrs(self._mem_attrs, as_cu(self._h_ptr))
-        self._mem_attrs_inited = True
+        self._mem_attrs_inited.store(True, memory_order_release)
 
 
 cdef inline int _query_memory_attrs(
@@ -597,7 +598,7 @@ cdef Buffer Buffer_from_deviceptr_handle(
     buf._memory_resource = mr
     buf._ipc_data = IPCDataForBuffer(ipc_descriptor, True) if ipc_descriptor is not None else None
     buf._owner = None
-    buf._mem_attrs_inited = False
+    buf._mem_attrs_inited.store(False)
     return buf
 
 
diff --git a/cuda_core/cuda/core/_memory/_graph_memory_resource.pyx b/cuda_core/cuda/core/_memory/_graph_memory_resource.pyx
index 479322ab017..e845a47b080 100644
--- a/cuda_core/cuda/core/_memory/_graph_memory_resource.pyx
+++ b/cuda_core/cuda/core/_memory/_graph_memory_resource.pyx
@@ -18,7 +18,6 @@ from cuda.core._resource_handles cimport (
 from cuda.core._stream cimport Stream_accept, Stream
 from cuda.core._utils.cuda_utils cimport HANDLE_RETURN
 
-from functools import cache
 from typing import TYPE_CHECKING
 
 if TYPE_CHECKING:
@@ -161,6 +160,8 @@ cdef class cyGraphMemoryResource(MemoryResource):
         return False
 
 
+cdef dict _mem_resource_cache = {}
+
 class GraphMemoryResource(cyGraphMemoryResource):
     """
     A memory resource for memory related to graphs.
@@ -185,9 +186,16 @@ class GraphMemoryResource(cyGraphMemoryResource):
         return cls._create(c_device_id)
 
     @classmethod
-    @cache
     def _create(cls, int device_id) -> GraphMemoryResource:
-        return cyGraphMemoryResource.__new__(cls, device_id)
+        # We use a dict for now because functools.cache is currently less
+        # thread-safe; see also: https://github.com/python/cpython/issues/150708
+        res = _mem_resource_cache.get(device_id)
+        if res is not None:
+            return res
+
+        # create new instance, but in case of a race may return another:
+        new = cyGraphMemoryResource.__new__(cls, device_id)
+        return _mem_resource_cache.setdefault(device_id, new)
 
 
 # Raise an exception if the given stream is capturing.
diff --git a/cuda_core/cuda/core/_memory/_memory_pool.pyx b/cuda_core/cuda/core/_memory/_memory_pool.pyx
index c6276f0f3de..ddcac2d6063 100644
--- a/cuda_core/cuda/core/_memory/_memory_pool.pyx
+++ b/cuda_core/cuda/core/_memory/_memory_pool.pyx
@@ -4,6 +4,7 @@
 
 from __future__ import annotations
 
+cimport cython
 from libc.limits cimport ULLONG_MAX
 from libc.stdint cimport uintptr_t
 from libc.string cimport memset
@@ -177,6 +178,7 @@ cdef class _MemPool(MemoryResource):
         _MP_deallocate(self, <uintptr_t>ptr, size, s)
 
     @property
+    @cython.critical_section
     def attributes(self) -> _MemPoolAttributes:
         """Memory pool attributes."""
         if self._attributes is None:
diff --git a/cuda_core/cuda/core/_memoryview.pyx b/cuda_core/cuda/core/_memoryview.pyx
index c65107ae273..260980c1daf 100644
--- a/cuda_core/cuda/core/_memoryview.pyx
+++ b/cuda_core/cuda/core/_memoryview.pyx
@@ -4,6 +4,7 @@
 
 from __future__ import annotations
 
+cimport cython
 from ._dlpack cimport *
 from ._dlpack import classify_dl_device
 from libc.stdint cimport intptr_t
@@ -80,7 +81,7 @@ cdef inline bint _is_torch_tensor(object obj):
     cdef str mod = tp.__module__ or ""
     cdef bint result = mod.startswith("torch") and hasattr(obj, "data_ptr") \
         and _torch_version_check()
-    _torch_type_cache[tp] = result
+    _torch_type_cache[tp] = result  # setdefault not needed for bools
     return result
 
 
@@ -539,6 +540,7 @@ cdef class StridedMemoryView:
               + f"                  readonly={self.readonly},\n"
               + f"                  exporting_obj={get_simple_repr(self.exporting_obj)})")
 
+    @cython.critical_section
     cdef inline _StridedLayout get_layout(self):
         if self._layout is None:
             if self.dl_tensor:
@@ -549,6 +551,7 @@ cdef class StridedMemoryView:
                 raise ValueError("Cannot infer layout from the exporting object")
         return self._layout
 
+    @cython.critical_section
     cdef inline object get_buffer(self):
         """
         Returns Buffer instance with the underlying data.
@@ -562,6 +565,7 @@ cdef class StridedMemoryView:
                 self._buffer = Buffer.from_handle(self.ptr, 0, owner=self.exporting_obj)
         return self._buffer
 
+    @cython.critical_section
     cdef inline object get_dtype(self):
         if self._dtype is None:
             if self.dl_tensor != NULL:
diff --git a/cuda_core/cuda/core/_module.pyx b/cuda_core/cuda/core/_module.pyx
index 5cb1b7f0059..91c8ad43895 100644
--- a/cuda_core/cuda/core/_module.pyx
+++ b/cuda_core/cuda/core/_module.pyx
@@ -4,6 +4,7 @@
 
 from __future__ import annotations
 
+cimport cython
 from libc.stddef cimport size_t
 
 from collections import namedtuple
@@ -83,7 +84,7 @@ cdef class KernelAttributes:
         cdef int result
         with nogil:
             HANDLE_RETURN(cydriver.cuKernelGetAttribute(&result, attribute, as_cu(self._h_kernel), device_id))
-        self._cache[cache_key] = result
+        self._cache[cache_key] = result  # setdefault not needed for ints
         return result
 
     def __getitem__(self, device: Device | int) -> KernelAttributes:
@@ -454,6 +455,7 @@ cdef class Kernel:
         return ker
 
     @property
+    @cython.critical_section
     def attributes(self) -> KernelAttributes:
         """Get the read-only attributes of this kernel."""
         if self._attributes is None:
@@ -501,6 +503,7 @@ cdef class Kernel:
         return param_info
 
     @property
+    @cython.critical_section
     def occupancy(self) -> KernelOccupancy:
         """Get the occupancy information for launching this kernel."""
         if self._occupancy is None:
@@ -742,6 +745,7 @@ cdef class ObjectCode:
 
     # TODO: do we want to unload in a finalizer? Probably not..
 
+    @cython.critical_section
     cdef int _lazy_load_module(self) except -1:
         if self._h_library:
             return 0
diff --git a/cuda_core/cuda/core/graph/_graph_node.pyx b/cuda_core/cuda/core/graph/_graph_node.pyx
index f627edf9bb2..d3d684aff3e 100644
--- a/cuda_core/cuda/core/graph/_graph_node.pyx
+++ b/cuda_core/cuda/core/graph/_graph_node.pyx
@@ -78,8 +78,7 @@ _node_registry: weakref.WeakValueDictionary[int, GraphNode] = weakref.WeakValueD
 
 
 cdef inline GraphNode _registered(GraphNode n):
-    _node_registry[<uintptr_t>n._h_node.get()] = n
-    return n
+    return _node_registry.setdefault(<uintptr_t>n._h_node.get(), n)
 
 
 cdef class GraphNode:
@@ -162,10 +161,11 @@ cdef class GraphNode:
         cdef cydriver.CUgraphNode node = as_cu(self._h_node)
         if node == NULL:
             return
-        with nogil:
-            HANDLE_RETURN(cydriver.cuGraphDestroyNode(node))
+
         _node_registry.pop(<uintptr_t>self._h_node.get(), None)
         invalidate_graph_node(self._h_node)
+        with nogil:
+            HANDLE_RETURN(cydriver.cuGraphDestroyNode(node))
 
     @property
     def pred(self) -> AdjacencySetProxy:

From 0311c2b9d9adbb749411c9e60e31a85125eaeac3 Mon Sep 17 00:00:00 2001
From: Sebastian Berg <sebastianb@nvidia.com>
Date: Tue, 2 Jun 2026 19:11:36 +0200
Subject: [PATCH 02/17] Use C++ bool for atomic

Signed-off-by: Sebastian Berg <sebastianb@nvidia.com>
---
 cuda_core/cuda/core/_memory/_buffer.pxd | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/cuda_core/cuda/core/_memory/_buffer.pxd b/cuda_core/cuda/core/_memory/_buffer.pxd
index f744e756579..83dcd4f68c2 100644
--- a/cuda_core/cuda/core/_memory/_buffer.pxd
+++ b/cuda_core/cuda/core/_memory/_buffer.pxd
@@ -3,6 +3,7 @@
 # SPDX-License-Identifier: Apache-2.0
 
 from libc.stdint cimport uintptr_t
+from libcpp cimport bool as cpp_bool
 from libcpp.atomic cimport atomic as std_atomic, memory_order_acquire, memory_order_release
 
 from cuda.bindings cimport cydriver
@@ -19,13 +20,13 @@ cdef struct _MemAttrs:
 
 cdef class Buffer:
     cdef:
-        DevicePtrHandle   _h_ptr
-        MemoryResource    _memory_resource
-        object            _ipc_data
-        object            _owner
-        _MemAttrs         _mem_attrs
-        std_atomic[bool]  _mem_attrs_inited
-        object            __weakref__
+        DevicePtrHandle       _h_ptr
+        MemoryResource        _memory_resource
+        object                _ipc_data
+        object                _owner
+        _MemAttrs             _mem_attrs
+        std_atomic[cpp_bool]  _mem_attrs_inited
+        object                __weakref__
     cdef public:
         # Python code in _memory/_virtual_memory_resource.py needs to update
         # this value, though it is technically private.

From ef66665830ae30454694e9d505c5329c72e1c98b Mon Sep 17 00:00:00 2001
From: Sebastian Berg <sebastianb@nvidia.com>
Date: Wed, 10 Jun 2026 22:15:03 +0200
Subject: [PATCH 03/17] Forgot to commit a critical section on this branch

Signed-off-by: Sebastian Berg <sebastianb@nvidia.com>
---
 cuda_core/cuda/core/_memory/_buffer.pyx | 1 +
 1 file changed, 1 insertion(+)

diff --git a/cuda_core/cuda/core/_memory/_buffer.pyx b/cuda_core/cuda/core/_memory/_buffer.pyx
index d9cf350b358..00359c1f0bf 100644
--- a/cuda_core/cuda/core/_memory/_buffer.pyx
+++ b/cuda_core/cuda/core/_memory/_buffer.pyx
@@ -446,6 +446,7 @@ cdef class Buffer:
 
 # Memory Attribute Query Helpers
 # ------------------------------
+@cython.critical_section
 cdef inline void _init_mem_attrs(Buffer self):
     """Initialize memory attributes by querying the pointer."""
     if not self._mem_attrs_inited.load(memory_order_acquire):

From c98ca0e116b4d3db875e384872f0c0fcb0430a76 Mon Sep 17 00:00:00 2001
From: Sebastian Berg <sebastianb@nvidia.com>
Date: Wed, 10 Jun 2026 21:18:17 +0200
Subject: [PATCH 04/17] DEV: run_tests.sh uses pytest-run-parallel and install
 it on 3.14t and 3.15t

Signed-off-by: Sebastian Berg <sebastianb@nvidia.com>
---
 cuda_bindings/pyproject.toml   |  1 +
 cuda_core/pyproject.toml       |  1 +
 cuda_pathfinder/pyproject.toml |  2 ++
 scripts/run_tests.sh           | 35 +++++++++++++++++++++++++++++++---
 4 files changed, 36 insertions(+), 3 deletions(-)

diff --git a/cuda_bindings/pyproject.toml b/cuda_bindings/pyproject.toml
index d40984d6be4..8e4d481fa9a 100644
--- a/cuda_bindings/pyproject.toml
+++ b/cuda_bindings/pyproject.toml
@@ -52,6 +52,7 @@ test = [
     "pytest>=6.2.4",
     "pytest-benchmark>=3.4.1",
     "pytest-repeat",
+    "pytest-run-parallel; python_version >= '3.14'",  # markers cannot select free-threaded builds; '3.14t' never matches python_version
     "pytest-randomly",
     "pyglet>=2.1.9",
 ]
diff --git a/cuda_core/pyproject.toml b/cuda_core/pyproject.toml
index 94424d8488e..cedb725ffb1 100644
--- a/cuda_core/pyproject.toml
+++ b/cuda_core/pyproject.toml
@@ -61,6 +61,7 @@ cu13 = ["cuda-bindings[all]==13.*", "cuda-toolkit==13.*"]
 test = [
     "cython>=3.2,<3.3", "setuptools", "pytest>=6.2.4", "pytest-benchmark",
     "pytest-randomly", "pytest-repeat", "pytest-rerunfailures", "pytest-timeout",
+    "pytest-run-parallel; python_version >= '3.14'",  # markers cannot select free-threaded builds; '3.14t' never matches python_version
     "cloudpickle", "psutil",
     # TODO: remove the Python 3.15 guard once 3.15 is officially supported
     "cffi; python_version < '3.15'",
diff --git a/cuda_pathfinder/pyproject.toml b/cuda_pathfinder/pyproject.toml
index dfbef9dd18d..c6bb9d98320 100644
--- a/cuda_pathfinder/pyproject.toml
+++ b/cuda_pathfinder/pyproject.toml
@@ -15,6 +15,7 @@ test = [
     "pytest>=6.2.4",
     "pytest-mock",
     "pytest-repeat",
+    "pytest-run-parallel; python_version >= '3.14'",  # markers cannot select free-threaded builds; '3.14t' never matches python_version
     "pytest-randomly",
 ]
 # Internal organization of test dependencies.
@@ -102,6 +103,7 @@ git_describe_command = [ "git", "describe", "--dirty", "--tags", "--long", "--ma
 
 [tool.pytest.ini_options]
 addopts = "--showlocals"
+thread_unsafe_fixtures = ['mocker']
 
 [tool.mypy]
 # Try to keep the mypy configuration similar between the subprojects
diff --git a/scripts/run_tests.sh b/scripts/run_tests.sh
index 163ff70a997..44bdf08d810 100755
--- a/scripts/run_tests.sh
+++ b/scripts/run_tests.sh
@@ -7,7 +7,7 @@ set -euo pipefail
 
 # Simple, dependency-free orchestrator to run tests for all packages.
 # Usage:
-#   scripts/run_tests.sh [ -v|--verbose ] [ --install | --no-install ] [ --with-cython | --skip-cython ] [ --with-examples | --skip-examples ] [ --with-ptds ]
+#   scripts/run_tests.sh [ -v|--verbose ] [ --install | --no-install ] [ --with-cython | --skip-cython ] [ --with-examples | --skip-examples ] [ --with-ptds ] [ --parallel-threads=N ]
 #   scripts/run_tests.sh [ flags ]                   # pathfinder -> bindings -> core
 #   scripts/run_tests.sh [ flags ] core              # only core
 #   scripts/run_tests.sh [ flags ] bindings          # only bindings
@@ -38,6 +38,9 @@ Options:
       --with-examples Run examples where applicable (e.g., cuda_bindings/examples)
       --skip-examples Skip running examples (default)
       --with-ptds     Re-run cuda_bindings tests with PTDS (CUDA_PYTHON_CUDA_PER_THREAD_DEFAULT_STREAM=1)
+      --parallel-threads=N
+                     Run pytest with --parallel-threads=N. Defaults to 4 when
+                     pytest-run-parallel is installed, otherwise 0 (disabled).
   -h, --help          Show this help and exit
 
 Examples:
@@ -54,6 +57,11 @@ RUN_CYTHON=0
 RUN_EXAMPLES=1
 RUN_PTDS=1
 INSTALL_MODE=auto  # auto|force|skip
+DEFAULT_PARALLEL_THREADS=0
+if python -mpip show pytest-run-parallel >/dev/null 2>&1; then
+  DEFAULT_PARALLEL_THREADS=4
+fi
+PARALLEL_THREADS=${DEFAULT_PARALLEL_THREADS}
 while [[ $# -gt 0 ]]; do
   case "$1" in
     -h|--help)
@@ -92,6 +100,18 @@ while [[ $# -gt 0 ]]; do
       RUN_PTDS=1
       shift
       ;;
+    --parallel-threads=*)
+      PARALLEL_THREADS="${1#*=}"
+      shift
+      ;;
+    --parallel-threads)
+      if [[ $# -lt 2 ]]; then
+        echo "Missing value for --parallel-threads" >&2
+        exit 1
+      fi
+      PARALLEL_THREADS="$2"
+      shift 2
+      ;;
     *)
       break
       ;;
@@ -100,12 +120,21 @@ done
 
 target=${1:-all}
 
+if ! [[ "${PARALLEL_THREADS}" =~ ^[0-9]+$ ]]; then
+  echo "--parallel-threads must be a non-negative integer, got: ${PARALLEL_THREADS}" >&2
+  exit 1
+fi
+
 if [[ ${VERBOSE} -eq 1 ]]; then
   PYTEST_FLAGS=( -ra -s -v )
 else
   # Very quiet: show failures/errors summary only
   PYTEST_FLAGS=( -qq )
 fi
+PYTEST_PARALLEL_FLAGS=()
+if [[ "${PARALLEL_THREADS}" -gt 0 ]]; then
+  PYTEST_PARALLEL_FLAGS=( "--parallel-threads=${PARALLEL_THREADS}" )
+fi
 
 declare -A RESULTS
 ORDERED_RESULTS=()
@@ -133,7 +162,7 @@ status_from_rc() {
 run_pytest() {
   # Run pytest safely under set -e and return its exit code
   set +e
-  CUDA_PYTHON_CUDA_PER_THREAD_DEFAULT_STREAM=0 python -m pytest "${PYTEST_FLAGS[@]}" "$@"
+  CUDA_PYTHON_CUDA_PER_THREAD_DEFAULT_STREAM=0 python -m pytest "${PYTEST_FLAGS[@]}" ${PYTEST_PARALLEL_FLAGS[@]+"${PYTEST_PARALLEL_FLAGS[@]}"} "$@"  # guard: empty array under `set -u` aborts on bash < 4.4
   local rc=$?
   set -e
   return ${rc}
@@ -142,7 +171,7 @@ run_pytest() {
 run_pytest_ptds() {
   # Run pytest with PTDS env set; safely return its exit code
   set +e
-  CUDA_PYTHON_CUDA_PER_THREAD_DEFAULT_STREAM=1 python -m pytest "${PYTEST_FLAGS[@]}" "$@"
+  CUDA_PYTHON_CUDA_PER_THREAD_DEFAULT_STREAM=1 python -m pytest "${PYTEST_FLAGS[@]}" ${PYTEST_PARALLEL_FLAGS[@]+"${PYTEST_PARALLEL_FLAGS[@]}"} "$@"  # guard: empty array under `set -u` aborts on bash < 4.4
   local rc=$?
   set -e
   return ${rc}

From da8136791c8cdea17bff57e9e860b14d674203bb Mon Sep 17 00:00:00 2001
From: Sebastian Berg <sebastianb@nvidia.com>
Date: Wed, 10 Jun 2026 21:25:40 +0200
Subject: [PATCH 05/17] TST: Add mini plugins to push custom fixtures into
 tests

E.g. cuda needs to be initialized for each thread, but fixtures
run before pytest-run-parallel launches the threads.
So we create a mini-plugin to deal with this.  We could also solve
this with decorators in many cases, but that would require adding
a lot of decorators...

Signed-off-by: Sebastian Berg <sebastianb@nvidia.com>
---
 cuda_bindings/tests/conftest.py | 83 ++++++++++++++++++++++++++++-
 cuda_core/tests/conftest.py     | 94 ++++++++++++++++++++++++++-------
 2 files changed, 156 insertions(+), 21 deletions(-)

diff --git a/cuda_bindings/tests/conftest.py b/cuda_bindings/tests/conftest.py
index f30500c1342..f63beca6584 100644
--- a/cuda_bindings/tests/conftest.py
+++ b/cuda_bindings/tests/conftest.py
@@ -1,8 +1,11 @@
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
+import functools
+import inspect
 import pathlib
 import sys
+from contextlib import contextmanager
 from importlib.metadata import PackageNotFoundError, distribution
 
 import pytest
@@ -25,6 +28,84 @@
         sys.path.insert(0, test_helpers_root)
 
 
+def _parallel_threads_enabled(config):
+    parallel_threads = getattr(config.option, "parallel_threads", 0)
+    if parallel_threads == "auto":
+        return True
+    return parallel_threads is not None and int(parallel_threads) > 0
+
+
+def pytest_configure(config):
+    if _parallel_threads_enabled(config):
+        config.pluginmanager.register(_CudaBindingsParallelPlugin(), name="_cuda_bindings_parallel_plugin")
+
+
+@contextmanager
+def _thread_context():
+    # Defensive: if this worker thread already has an active context (e.g. from
+    # double-wrapping), reuse it rather than pushing another one.
+    # Note: fixtures never run on the test thread; this is purely a safety net.
+    err, existing = cuda.cuCtxGetCurrent()
+    if err == cuda.CUresult.CUDA_SUCCESS and existing and int(existing) != 0:
+        yield None, existing
+        return
+
+    # cuInit(0) is idempotent; safe to call even if cuda_driver fixture already ran.
+    (err,) = cuda.cuInit(0)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+    err, device = cuda.cuDeviceGet(0)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+    err, ctx = cuda.cuCtxCreate(None, 0, device)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+    try:
+        yield device, ctx
+    finally:
+        (err,) = cuda.cuCtxDestroy(ctx)
+        assert err == cuda.CUresult.CUDA_SUCCESS
+
+
+def _wrap_worker_cuda_test(func):
+    if getattr(func, "_cuda_bindings_worker_cuda_wrapped", False):
+        return func
+
+    sig = inspect.signature(func)
+    wants_device = "device" in sig.parameters
+    wants_ctx = "ctx" in sig.parameters
+
+    @functools.wraps(func)
+    def wrapper(*args, **kwargs):
+        with _thread_context() as (device, ctx):
+            # device is None when reusing an existing context (defensive path);
+            # keep whatever the fixture provided in kwargs as-is.
+            if wants_device and device is not None:
+                kwargs["device"] = device
+            if wants_ctx:
+                kwargs["ctx"] = ctx
+            return func(*args, **kwargs)
+
+    wrapper._cuda_bindings_worker_cuda_wrapped = True
+    return wrapper
+
+
+def _item_needs_thread_ctx(item):
+    fixturenames = getattr(item, "fixturenames", ())
+    # 'device' is present when the module-level ctx(device) autouse chain is
+    # active (test_cuda.py, test_kernelParams.py, nvml tests, …).
+    # 'driver' is present for test_cufile.py tests that use the local driver
+    # fixture; their local ctx() shadows the parent ctx(device) so 'device'
+    # does not appear in their fixture chain, but they still need a per-thread
+    # CUDA context for cuMemAlloc and similar calls made inside the test.
+    return "device" in fixturenames or "driver" in fixturenames
+
+
+class _CudaBindingsParallelPlugin:
+    @pytest.hookimpl(tryfirst=True)
+    def pytest_collection_modifyitems(self, config, items):
+        for item in items:
+            if _item_needs_thread_ctx(item):
+                item.obj = _wrap_worker_cuda_test(item.obj)
+
+
 @pytest.fixture(scope="module")
 def cuda_driver():
     (err,) = cuda.cuInit(0)
diff --git a/cuda_core/tests/conftest.py b/cuda_core/tests/conftest.py
index d7a81d88904..bf87677fd8d 100644
--- a/cuda_core/tests/conftest.py
+++ b/cuda_core/tests/conftest.py
@@ -1,6 +1,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
+import functools
 import multiprocessing
 import os
 import pathlib
@@ -91,6 +92,68 @@ def xfail_if_mempool_oom(err_or_exc, api_name=None, device=0):
         sys.path.insert(0, test_helpers_root)
 
 
+def pytest_configure(config):
+    if _parallel_threads_enabled(config):
+        config.pluginmanager.register(_CudaCoreParallelPlugin(), name="_cuda_core_parallel_plugin")
+
+
+def _parallel_threads_enabled(config):
+    parallel_threads = getattr(config.option, "parallel_threads", 0)
+    if parallel_threads == "auto":
+        return True
+
+    return parallel_threads is not None and int(parallel_threads) > 0
+
+
+@contextmanager
+def _init_cuda_context():
+    # TODO: rename this to e.g. init_context
+    device = Device(0)
+    device.set_current()
+
+    # Set option to avoid spin-waiting on synchronization.
+    if int(os.environ.get("CUDA_CORE_TEST_BLOCKING_SYNC", 0)) != 0:
+        handle_return(
+            driver.cuDevicePrimaryCtxSetFlags(device.device_id, driver.CUctx_flags.CU_CTX_SCHED_BLOCKING_SYNC)
+        )
+
+    try:
+        yield device
+    finally:
+        _ = _device_unset_current()
+
+
+def _wrap_worker_cuda_test(func):
+    if getattr(func, "_cuda_core_worker_cuda_wrapped", False):
+        return func
+
+    @functools.wraps(func)
+    def wrapper(*args, **kwargs):
+        with _init_cuda_context() as device:
+            if "init_cuda" in kwargs:
+                kwargs["init_cuda"] = device
+            if "mempool_device_x2" in kwargs:
+                kwargs["mempool_device_x2"] = _mempool_device_impl(2)
+            if "mempool_device_x3" in kwargs:
+                kwargs["mempool_device_x3"] = _mempool_device_impl(3)
+            return func(*args, **kwargs)
+
+    wrapper._cuda_core_worker_cuda_wrapped = True
+    return wrapper
+
+
+def _item_uses_init_cuda(item):
+    return "init_cuda" in getattr(item, "fixturenames", ())
+
+
+class _CudaCoreParallelPlugin:
+    @pytest.hookimpl(tryfirst=True)
+    def pytest_collection_modifyitems(self, config, items):
+        for item in items:
+            if _item_uses_init_cuda(item):
+                item.obj = _wrap_worker_cuda_test(item.obj)
+
+
 def skip_if_pinned_memory_unsupported(device):
     try:
         if not device.properties.host_memory_pools_supported:
@@ -194,18 +257,8 @@ def session_setup():
 
 @pytest.fixture
 def init_cuda():
-    # TODO: rename this to e.g. init_context
-    device = Device(0)
-    device.set_current()
-
-    # Set option to avoid spin-waiting on synchronization.
-    if int(os.environ.get("CUDA_CORE_TEST_BLOCKING_SYNC", 0)) != 0:
-        handle_return(
-            driver.cuDevicePrimaryCtxSetFlags(device.device_id, driver.CUctx_flags.CU_CTX_SCHED_BLOCKING_SYNC)
-        )
-
-    yield device
-    _ = _device_unset_current()
+    with _init_cuda_context() as device:
+        yield device
 
 
 def _device_unset_current() -> bool:
@@ -247,7 +300,7 @@ def pop_all_contexts():
 
 
 @pytest.fixture
-def ipc_device():
+def ipc_device(init_cuda):
     """Obtains a device suitable for IPC-enabled mempool tests, or skips.
 
     The fixture also tracks every ``multiprocessing.Process`` spawned during
@@ -257,8 +310,7 @@ def ipc_device():
     """
     from helpers.child_processes import track_child_processes
 
-    device = Device(0)
-    device.set_current()
+    device = init_cuda
 
     if not device.properties.memory_pools_supported:
         pytest.skip("Device does not support mempool operations")
@@ -290,13 +342,15 @@ def ipc_memory_resource(request, ipc_device):
     assert mr.is_ipc_enabled
     yield mr
     mr.close()
+    # TODO(seberg): Make sure the `mr` and its buffers are fully torn down.
+    # May be unnecessary as `mr.close()` is not parallel with other work.
+    ipc_device.sync()
 
 
 @pytest.fixture
-def mempool_device():
+def mempool_device(init_cuda):
     """Obtains a device suitable for mempool tests, or skips."""
-    device = Device(0)
-    device.set_current()
+    device = init_cuda
 
     if not device.properties.memory_pools_supported:
         pytest.skip("Device does not support mempool operations")
@@ -323,13 +377,13 @@ def _mempool_device_impl(num):
 
 
 @pytest.fixture
-def mempool_device_x2():
+def mempool_device_x2(init_cuda):
     """Fixture that provides two devices if available, otherwise skips test."""
     return _mempool_device_impl(2)
 
 
 @pytest.fixture
-def mempool_device_x3():
+def mempool_device_x3(init_cuda):
     """Fixture that provides three devices if available, otherwise skips test."""
     return _mempool_device_impl(3)
 

From 991d5bcd3df2c58321c7f421b690d3529035e623 Mon Sep 17 00:00:00 2001
From: Sebastian Berg <sebastianb@nvidia.com>
Date: Wed, 10 Jun 2026 21:34:12 +0200
Subject: [PATCH 06/17] TST: Mark tests as thread-unsafe or limit the number of
 threads

- thread_unsafe: nvml init ref-count, graphMem attr, mock-based tests,
  OpenGL, peer-access pool state, multiprocessing warning, program-cache
  race reproduction, and functools.cache mutation tests
- parallel_threads_limit: IPC / worker-pool tests that spawn subprocesses
  or open file descriptors (limit 4), example tests (limit 8), and the
  event-registration test whose timeouts are slow

Signed-off-by: Sebastian Berg <sebastianb@nvidia.com>
---
 cuda_bindings/tests/nvml/test_init.py                   | 1 +
 cuda_bindings/tests/test_cuda.py                        | 1 +
 cuda_core/tests/example_tests/test_basic_examples.py    | 1 +
 cuda_core/tests/memory_ipc/test_errors.py               | 4 ++++
 cuda_core/tests/memory_ipc/test_event_ipc.py            | 4 ++++
 cuda_core/tests/memory_ipc/test_ipc_duplicate_import.py | 3 +++
 cuda_core/tests/memory_ipc/test_leaks.py                | 2 ++
 cuda_core/tests/memory_ipc/test_memory_ipc.py           | 3 +++
 cuda_core/tests/memory_ipc/test_peer_access.py          | 3 +++
 cuda_core/tests/memory_ipc/test_send_buffers.py         | 3 +++
 cuda_core/tests/memory_ipc/test_serialize.py            | 3 +++
 cuda_core/tests/memory_ipc/test_workerpool.py           | 3 +++
 cuda_core/tests/system/test_system_device.py            | 1 +
 cuda_core/tests/test_graphics.py                        | 3 +++
 cuda_core/tests/test_memory.py                          | 1 +
 cuda_core/tests/test_memory_peer_access.py              | 2 ++
 cuda_core/tests/test_multiprocessing_warning.py         | 5 +++++
 cuda_core/tests/test_program_cache.py                   | 1 +
 cuda_pathfinder/tests/test_find_nvidia_binaries.py      | 1 +
 19 files changed, 45 insertions(+)

diff --git a/cuda_bindings/tests/nvml/test_init.py b/cuda_bindings/tests/nvml/test_init.py
index 4c94dc26a3e..19e573c9cc6 100644
--- a/cuda_bindings/tests/nvml/test_init.py
+++ b/cuda_bindings/tests/nvml/test_init.py
@@ -42,6 +42,7 @@ def get_architecture_name(arch):
 
 
 @pytest.mark.skipif(sys.platform == "win32", reason="Test not supported on Windows")
+@pytest.mark.thread_unsafe(reason="nvml init affects other threads")
 def test_init_ref_count():
     """
     Verifies that we can call NVML shutdown and init(2) multiple times, and that ref counting works
diff --git a/cuda_bindings/tests/test_cuda.py b/cuda_bindings/tests/test_cuda.py
index 32b05f638fe..4d11d08b287 100644
--- a/cuda_bindings/tests/test_cuda.py
+++ b/cuda_bindings/tests/test_cuda.py
@@ -456,6 +456,7 @@ def test_cuda_mem_range_attr(device):
 
 
 @pytest.mark.skipif(driverVersionLessThan(11040) or not supportsMemoryPool(), reason="Mempool for graphs not supported")
+@pytest.mark.thread_unsafe(reason="used high memory can be higher if threaded.")
 def test_cuda_graphMem_attr(device):
     err, stream = cuda.cuStreamCreate(0)
     assert err == cuda.CUresult.CUDA_SUCCESS
diff --git a/cuda_core/tests/example_tests/test_basic_examples.py b/cuda_core/tests/example_tests/test_basic_examples.py
index 43fab4241db..c8d15677a54 100644
--- a/cuda_core/tests/example_tests/test_basic_examples.py
+++ b/cuda_core/tests/example_tests/test_basic_examples.py
@@ -100,6 +100,7 @@ def has_recent_memory_pool_support() -> bool:
 
 
 @pytest.mark.parametrize("example", sample_files)
+@pytest.mark.parallel_threads_limit(8)
 def test_example(example):
     example_path = os.path.join(samples_path, example)
     has_package_requirements_or_skip(example_path)
diff --git a/cuda_core/tests/memory_ipc/test_errors.py b/cuda_core/tests/memory_ipc/test_errors.py
index 42f34dd61c2..4fc119fc170 100644
--- a/cuda_core/tests/memory_ipc/test_errors.py
+++ b/cuda_core/tests/memory_ipc/test_errors.py
@@ -16,6 +16,10 @@
 POOL_SIZE = 2097152
 
 
+# these tests spawn new processes and files which fails for very many threads
+pytestmark = pytest.mark.parallel_threads_limit(4)
+
+
 def test_outer_timeout_marker_is_applied(request):
     """Verify that memory_ipc/conftest.py applies the outer pytest-timeout marker.
 
diff --git a/cuda_core/tests/memory_ipc/test_event_ipc.py b/cuda_core/tests/memory_ipc/test_event_ipc.py
index 48985e67b58..e3cefe6a211 100644
--- a/cuda_core/tests/memory_ipc/test_event_ipc.py
+++ b/cuda_core/tests/memory_ipc/test_event_ipc.py
@@ -16,6 +16,10 @@
 NBYTES = 64
 
 
+# these tests spawn new processes and files which fails for very many threads
+pytestmark = pytest.mark.parallel_threads_limit(4)
+
+
 @pytest.mark.skipif(Device().compute_capability.major < 7, reason="__nanosleep is only available starting Volta (sm70)")
 class TestEventIpc:
     """Check the basic usage of IPC-enabled events with a latch kernel."""
diff --git a/cuda_core/tests/memory_ipc/test_ipc_duplicate_import.py b/cuda_core/tests/memory_ipc/test_ipc_duplicate_import.py
index 8d450fa8e3f..eaa6ddec92f 100644
--- a/cuda_core/tests/memory_ipc/test_ipc_duplicate_import.py
+++ b/cuda_core/tests/memory_ipc/test_ipc_duplicate_import.py
@@ -24,6 +24,9 @@
 
 ENABLE_LOGGING = False  # Set True for test debugging and development
 
+# these tests spawn new processes and files which fails for very many threads
+pytestmark = pytest.mark.parallel_threads_limit(4)
+
 
 def child_main(log, queue):
     log.prefix = " child: "
diff --git a/cuda_core/tests/memory_ipc/test_leaks.py b/cuda_core/tests/memory_ipc/test_leaks.py
index 6fc4d03f142..c6e44824137 100644
--- a/cuda_core/tests/memory_ipc/test_leaks.py
+++ b/cuda_core/tests/memory_ipc/test_leaks.py
@@ -23,6 +23,8 @@
     not USING_FDS or not HAVE_PSUTIL, reason="mempool allocation handle is not using fds or psutil is unavailable"
 )
 
+pytestmark = pytest.mark.thread_unsafe(reason="Tests number of fds which is shared.")
+
 
 @pytest.mark.flaky(reruns=2)
 @skip_if_unrunnable
diff --git a/cuda_core/tests/memory_ipc/test_memory_ipc.py b/cuda_core/tests/memory_ipc/test_memory_ipc.py
index 0923fe28d8b..43d356789e7 100644
--- a/cuda_core/tests/memory_ipc/test_memory_ipc.py
+++ b/cuda_core/tests/memory_ipc/test_memory_ipc.py
@@ -14,6 +14,9 @@
 NWORKERS = 2
 NTASKS = 2
 
+# these tests spawn new processes and files which fails for very many threads
+pytestmark = pytest.mark.parallel_threads_limit(4)
+
 
 class TestIpcMempool:
     @pytest.mark.flaky(reruns=2)
diff --git a/cuda_core/tests/memory_ipc/test_peer_access.py b/cuda_core/tests/memory_ipc/test_peer_access.py
index 9e9e2879ae7..efb67b4cdb8 100644
--- a/cuda_core/tests/memory_ipc/test_peer_access.py
+++ b/cuda_core/tests/memory_ipc/test_peer_access.py
@@ -14,6 +14,9 @@
 NBYTES = 64
 POOL_SIZE = 2097152
 
+# these tests spawn new processes and files which fails for very many threads
+pytestmark = pytest.mark.parallel_threads_limit(4)
+
 
 class TestPeerAccessNotPreservedOnImport:
     """
diff --git a/cuda_core/tests/memory_ipc/test_send_buffers.py b/cuda_core/tests/memory_ipc/test_send_buffers.py
index cc7f45d67c2..01c9496e773 100644
--- a/cuda_core/tests/memory_ipc/test_send_buffers.py
+++ b/cuda_core/tests/memory_ipc/test_send_buffers.py
@@ -16,6 +16,9 @@
 NTASKS = 7
 POOL_SIZE = 2097152
 
+# these tests spawn new processes and files which fails for very many threads
+pytestmark = pytest.mark.parallel_threads_limit(4)
+
 
 class TestIpcSendBuffers:
     @pytest.mark.flaky(reruns=2)
diff --git a/cuda_core/tests/memory_ipc/test_serialize.py b/cuda_core/tests/memory_ipc/test_serialize.py
index 2f0e429b103..4289de4b5a9 100644
--- a/cuda_core/tests/memory_ipc/test_serialize.py
+++ b/cuda_core/tests/memory_ipc/test_serialize.py
@@ -15,6 +15,9 @@
 NBYTES = 64
 POOL_SIZE = 2097152
 
+# these tests spawn new processes and files which fails for very many threads
+pytestmark = pytest.mark.parallel_threads_limit(4)
+
 
 class TestObjectSerializationDirect:
     """
diff --git a/cuda_core/tests/memory_ipc/test_workerpool.py b/cuda_core/tests/memory_ipc/test_workerpool.py
index 609fadbcf3e..08d9bd79d92 100644
--- a/cuda_core/tests/memory_ipc/test_workerpool.py
+++ b/cuda_core/tests/memory_ipc/test_workerpool.py
@@ -16,6 +16,9 @@
 NTASKS = 20
 POOL_SIZE = 2097152
 
+# these tests spawn new processes and files which fails for very many threads
+pytestmark = pytest.mark.parallel_threads_limit(4)
+
 
 class TestIpcWorkerPool:
     """
diff --git a/cuda_core/tests/system/test_system_device.py b/cuda_core/tests/system/test_system_device.py
index 4aa13840b48..c202407dc55 100644
--- a/cuda_core/tests/system/test_system_device.py
+++ b/cuda_core/tests/system/test_system_device.py
@@ -268,6 +268,7 @@ def test_unpack_bitmask_single_value():
         _device._unpack_bitmask(1)
 
 
+@pytest.mark.parallel_threads_limit(4)  # timeouts are slow
 @pytest.mark.skipif(helpers.IS_WSL or helpers.IS_WINDOWS, reason="Events not supported on WSL or Windows")
 def test_register_events():
     # This is not the world's greatest test.  All of the events are pretty
diff --git a/cuda_core/tests/test_graphics.py b/cuda_core/tests/test_graphics.py
index 6f5877f76b0..e2b22a20c59 100644
--- a/cuda_core/tests/test_graphics.py
+++ b/cuda_core/tests/test_graphics.py
@@ -20,6 +20,9 @@
 )
 from cuda.core.utils import StridedMemoryView
 
+# TODO(seberg): Maybe some of these tests can be made threadable?
+pytestmark = pytest.mark.thread_unsafe(reason="OpenGL context not threadable")
+
 # ---------------------------------------------------------------------------
 # GL context + buffer helpers
 # ---------------------------------------------------------------------------
diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py
index 920cd4bb0fd..35592485c94 100644
--- a/cuda_core/tests/test_memory.py
+++ b/cuda_core/tests/test_memory.py
@@ -1323,6 +1323,7 @@ def test_managed_memory_resource_preferred_location_validation(init_cuda):
         )
 
 
+@pytest.mark.thread_unsafe(reason="Uses mock.")
 def test_managed_memory_resource_host_numa_auto_resolve_failure(init_cuda):
     """host_numa with None raises RuntimeError when NUMA ID cannot be determined."""
     from unittest.mock import MagicMock, patch
diff --git a/cuda_core/tests/test_memory_peer_access.py b/cuda_core/tests/test_memory_peer_access.py
index 71beb459143..68c32ce69c6 100644
--- a/cuda_core/tests/test_memory_peer_access.py
+++ b/cuda_core/tests/test_memory_peer_access.py
@@ -12,6 +12,8 @@
 
 NBYTES = 1024
 
+pytestmark = pytest.mark.thread_unsafe(reason="peer access tests mutate process-global CUDA memory-pool access state")
+
 
 def test_peer_access_basic(mempool_device_x2):
     """Basic tests for dmr.peer_accessible_by."""
diff --git a/cuda_core/tests/test_multiprocessing_warning.py b/cuda_core/tests/test_multiprocessing_warning.py
index 0f96e0abfbc..94a671ff2f8 100644
--- a/cuda_core/tests/test_multiprocessing_warning.py
+++ b/cuda_core/tests/test_multiprocessing_warning.py
@@ -12,12 +12,17 @@
 import warnings
 from unittest.mock import patch
 
+import pytest
+
 from cuda.core import DeviceMemoryResource, DeviceMemoryResourceOptions, EventOptions
 from cuda.core._event import _reduce_event
 from cuda.core._memory._device_memory_resource import _deep_reduce_device_memory_resource
 from cuda.core._memory._ipc import _reduce_allocation_handle
 from cuda.core._utils.cuda_utils import check_multiprocessing_start_method, reset_fork_warning
 
+# We could move these patches into a (session) fixture
+pytestmark = pytest.mark.thread_unsafe(reason="all tests use unittest.mock.patch")
+
 
 def test_warn_on_fork_method_device_memory_resource(ipc_device):
     """Test that warning is emitted when DeviceMemoryResource is pickled with fork method."""
diff --git a/cuda_core/tests/test_program_cache.py b/cuda_core/tests/test_program_cache.py
index 01a39e0032c..963ec1cc04b 100644
--- a/cuda_core/tests/test_program_cache.py
+++ b/cuda_core/tests/test_program_cache.py
@@ -1927,6 +1927,7 @@ def test_filestream_cache_tracker_reconciles_after_external_drift(tmp_path):
         assert cache._tracked_size_bytes <= 1100  # actual on-disk is 'b' + 'c' or just 'c'
 
 
+@pytest.mark.thread_unsafe(reason="already threaded and patches _file_stream")
 def test_filestream_cache_tracker_clamps_at_zero_under_delete_race(tmp_path):
     """Two-thread reproduction of the ``__delitem__`` vs
     ``_enforce_size_cap`` race. Thread A is mid-delete: it has stat'd the
diff --git a/cuda_pathfinder/tests/test_find_nvidia_binaries.py b/cuda_pathfinder/tests/test_find_nvidia_binaries.py
index ec9740cd853..0f9e5ed31c1 100644
--- a/cuda_pathfinder/tests/test_find_nvidia_binaries.py
+++ b/cuda_pathfinder/tests/test_find_nvidia_binaries.py
@@ -173,6 +173,7 @@ def test_find_binary_cache_negative_result(monkeypatch, mocker):
 
 
 @pytest.mark.usefixtures("clear_find_binary_cache")
+@pytest.mark.thread_unsafe(reason="functools.cache may replace entry.")
 def test_caching_per_utility():
     """Verify that different utilities have independent cache entries."""
     nvdisasm1 = find_nvidia_binary_utility("nvdisasm")

From 84a63e2e59d1e79a8bd11e44165b7fdc49f1da6f Mon Sep 17 00:00:00 2001
From: Sebastian Berg <sebastianb@nvidia.com>
Date: Wed, 10 Jun 2026 21:36:55 +0200
Subject: [PATCH 07/17] TST: use tmpdir fixture in cufile (and mark some as
 unsafe)

Signed-off-by: Sebastian Berg <sebastianb@nvidia.com>
---
 cuda_bindings/tests/test_cufile.py | 110 ++++++++---------------------
 1 file changed, 31 insertions(+), 79 deletions(-)

diff --git a/cuda_bindings/tests/test_cufile.py b/cuda_bindings/tests/test_cufile.py
index 6e614ca1b05..ee0995234b3 100644
--- a/cuda_bindings/tests/test_cufile.py
+++ b/cuda_bindings/tests/test_cufile.py
@@ -1,8 +1,7 @@
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
 import ctypes
-import errno
 import logging
 import os
 import pathlib
@@ -145,7 +144,7 @@ def ctx():
     (err,) = cuda.cuCtxSetCurrent(ctx)
     assert err == cuda.CUresult.CUDA_SUCCESS
 
-    yield
+    yield ctx
 
     cuda.cuDevicePrimaryCtxRelease(device)
 
@@ -205,10 +204,10 @@ def driver(ctx):
 @pytest.mark.skipif(not isSupportedFilesystem(), reason="cuFile handle_register requires ext4 or xfs filesystem")
 @pytest.mark.usefixtures("driver")
 @xfail_handle_register
-def test_handle_register():
+def test_handle_register(tmpdir):
     """Test file handle registration with cuFile."""
     # Create test file
-    file_path = "test_handle_register.bin"
+    file_path = tmpdir / "test_handle_register.bin"
 
     # Create file with POSIX operations
     fd = os.open(file_path, os.O_CREAT | os.O_RDWR, 0o600)
@@ -242,8 +241,6 @@ def test_handle_register():
 
     finally:
         os.close(fd)
-        with suppress(OSError):
-            os.unlink(file_path)
 
 
 @pytest.mark.usefixtures("driver")
@@ -398,10 +395,10 @@ def test_buf_register_already_registered():
 @pytest.mark.skipif(not isSupportedFilesystem(), reason="cuFile handle_register requires ext4 or xfs filesystem")
 @pytest.mark.usefixtures("driver")
 @xfail_handle_register
-def test_cufile_read_write():
+def test_cufile_read_write(tmpdir):
     """Test cuFile read and write operations."""
     # Create test file
-    file_path = "test_cufile_rw.bin"
+    file_path = tmpdir / "test_cufile_rw.bin"
 
     # Allocate CUDA memory for write and read
     write_size = 65536  # 64KB, aligned to 4096 bytes (65536 % 4096 == 0)
@@ -478,21 +475,15 @@ def test_cufile_read_write():
         # Free CUDA memory
         cuda.cuMemFree(write_buf)
         cuda.cuMemFree(read_buf)
-        # Clean up test file
-        try:
-            os.unlink(file_path)
-        except OSError as e:
-            if e.errno != errno.ENOENT:
-                raise
 
 
 @pytest.mark.skipif(not isSupportedFilesystem(), reason="cuFile handle_register requires ext4 or xfs filesystem")
 @pytest.mark.usefixtures("driver")
 @xfail_handle_register
-def test_cufile_read_write_host_memory():
+def test_cufile_read_write_host_memory(tmpdir):
     """Test cuFile read and write operations using host memory."""
     # Create test file
-    file_path = "test_cufile_rw_host.bin"
+    file_path = tmpdir / "test_cufile_rw_host.bin"
 
     # Allocate host memory for write and read
     write_size = 65536  # 64KB, aligned to 4096 bytes (65536 % 4096 == 0)
@@ -565,21 +556,15 @@ def test_cufile_read_write_host_memory():
         # Free host memory
         cuda.cuMemFreeHost(write_buf)
         cuda.cuMemFreeHost(read_buf)
-        # Clean up test file
-        try:
-            os.unlink(file_path)
-        except OSError as e:
-            if e.errno != errno.ENOENT:
-                raise
 
 
 @pytest.mark.skipif(not isSupportedFilesystem(), reason="cuFile handle_register requires ext4 or xfs filesystem")
 @pytest.mark.usefixtures("driver")
 @xfail_handle_register
-def test_cufile_read_write_large():
+def test_cufile_read_write_large(tmpdir):
     """Test cuFile read and write operations with large data."""
     # Create test file
-    file_path = "test_cufile_rw_large.bin"
+    file_path = tmpdir / "test_cufile_rw_large.bin"
 
     # Allocate large CUDA memory (1MB, aligned to 4096 bytes)
     write_size = 1024 * 1024  # 1MB, aligned to 4096 bytes (1048576 % 4096 == 0)
@@ -659,21 +644,15 @@ def test_cufile_read_write_large():
         # Free CUDA memory
         cuda.cuMemFree(write_buf)
         cuda.cuMemFree(read_buf)
-        # Clean up test file
-        try:
-            os.unlink(file_path)
-        except OSError as e:
-            if e.errno != errno.ENOENT:
-                raise
 
 
 @pytest.mark.skipif(not isSupportedFilesystem(), reason="cuFile handle_register requires ext4 or xfs filesystem")
 @pytest.mark.usefixtures("ctx", "cufile_env_json", "driver")
 @xfail_handle_register
-def test_cufile_write_async():
+def test_cufile_write_async(tmpdir):
     """Test cuFile asynchronous write operations."""
     # Create test file
-    file_path = "test_cufile_write_async.bin"
+    file_path = tmpdir / "test_cufile_write_async.bin"
     fd = os.open(file_path, os.O_CREAT | os.O_RDWR | os.O_DIRECT, 0o600)
 
     try:
@@ -741,17 +720,15 @@ def test_cufile_write_async():
 
     finally:
         os.close(fd)
-        with suppress(OSError):
-            os.unlink(file_path)
 
 
 @pytest.mark.skipif(not isSupportedFilesystem(), reason="cuFile handle_register requires ext4 or xfs filesystem")
 @pytest.mark.usefixtures("ctx", "cufile_env_json", "driver")
 @xfail_handle_register
-def test_cufile_read_async():
+def test_cufile_read_async(tmpdir):
     """Test cuFile asynchronous read operations."""
     # Create test file
-    file_path = "test_cufile_read_async.bin"
+    file_path = tmpdir / "test_cufile_read_async.bin"
 
     # First create and write test data without O_DIRECT
     fd_temp = os.open(file_path, os.O_CREAT | os.O_RDWR, 0o600)
@@ -832,17 +809,15 @@ def test_cufile_read_async():
 
     finally:
         os.close(fd)
-        with suppress(OSError):
-            os.unlink(file_path)
 
 
 @pytest.mark.skipif(not isSupportedFilesystem(), reason="cuFile handle_register requires ext4 or xfs filesystem")
 @xfail_handle_register
 @pytest.mark.usefixtures("ctx", "cufile_env_json", "driver")
-def test_cufile_async_read_write():
+def test_cufile_async_read_write(tmpdir):
     """Test cuFile asynchronous read and write operations in sequence."""
     # Create test file
-    file_path = "test_cufile_async_rw.bin"
+    file_path = tmpdir / "test_cufile_async_rw.bin"
     fd = os.open(file_path, os.O_CREAT | os.O_RDWR | os.O_DIRECT, 0o600)
 
     try:
@@ -946,17 +921,15 @@ def test_cufile_async_read_write():
 
     finally:
         os.close(fd)
-        with suppress(OSError):
-            os.unlink(file_path)
 
 
 @pytest.mark.skipif(not isSupportedFilesystem(), reason="cuFile handle_register requires ext4 or xfs filesystem")
 @pytest.mark.usefixtures("driver")
 @xfail_handle_register
-def test_batch_io_basic():
+def test_batch_io_basic(tmpdir):
     """Test basic batch IO operations with multiple read/write operations."""
     # Create test file
-    file_path = "test_batch_io.bin"
+    file_path = tmpdir / "test_batch_io.bin"
 
     # Allocate CUDA memory for multiple operations
     buf_size = 65536  # 64KB
@@ -1145,21 +1118,15 @@ def test_batch_io_basic():
         # Free CUDA memory
         for buf in buffers + read_buffers:
             cuda.cuMemFree(buf)
-        # Clean up test file
-        try:
-            os.unlink(file_path)
-        except OSError as e:
-            if e.errno != errno.ENOENT:
-                raise
 
 
 @pytest.mark.skipif(not isSupportedFilesystem(), reason="cuFile handle_register requires ext4 or xfs filesystem")
 @pytest.mark.usefixtures("driver")
 @xfail_handle_register
-def test_batch_io_cancel():
+def test_batch_io_cancel(tmpdir):
     """Test batch IO cancellation."""
     # Create test file
-    file_path = "test_batch_cancel.bin"
+    file_path = tmpdir / "test_batch_cancel.bin"
 
     # Allocate CUDA memory
     buf_size = 4096  # 4KB, aligned to 4096 bytes
@@ -1229,21 +1196,15 @@ def test_batch_io_cancel():
         # Free CUDA memory
         for buf in buffers:
             cuda.cuMemFree(buf)
-        # Clean up test file
-        try:
-            os.unlink(file_path)
-        except OSError as e:
-            if e.errno != errno.ENOENT:
-                raise
 
 
 @pytest.mark.skipif(not isSupportedFilesystem(), reason="cuFile handle_register requires ext4 or xfs filesystem")
 @pytest.mark.usefixtures("driver")
 @xfail_handle_register
-def test_batch_io_large_operations():
+def test_batch_io_large_operations(tmpdir):
     """Test batch IO with large buffer operations."""
     # Create test file
-    file_path = "test_batch_large.bin"
+    file_path = tmpdir / "test_batch_large.bin"
 
     # Allocate large CUDA memory (1MB, aligned to 4096 bytes)
     buf_size = 1024 * 1024  # 1MB, aligned to 4096 bytes
@@ -1421,12 +1382,6 @@ def test_batch_io_large_operations():
         # Free CUDA memory
         for buf in all_buffers:
             cuda.cuMemFree(buf)
-        # Clean up test file
-        try:
-            os.unlink(file_path)
-        except OSError as e:
-            if e.errno != errno.ENOENT:
-                raise
 
 
 @pytest.mark.skipif(
@@ -1648,10 +1603,11 @@ def test_stats_start_stop():
 @pytest.mark.skipif(not isSupportedFilesystem(), reason="cuFile handle_register requires ext4 or xfs filesystem")
 @pytest.mark.usefixtures("stats")
 @xfail_handle_register
-def test_get_stats_l1():
+@pytest.mark.thread_unsafe(reason="cuFile stats counters and collection state are process-global")
+def test_get_stats_l1(tmpdir):
     """Test cuFile L1 statistics retrieval with file operations."""
     # Create test file directly with O_DIRECT
-    file_path = "test_stats_l1.bin"
+    file_path = tmpdir / "test_stats_l1.bin"
     fd = os.open(file_path, os.O_CREAT | os.O_RDWR | os.O_DIRECT, 0o600)
 
     try:
@@ -1718,8 +1674,6 @@ def test_get_stats_l1():
 
     finally:
         os.close(fd)
-        with suppress(OSError):
-            os.unlink(file_path)
 
 
 @pytest.mark.skipif(
@@ -1728,10 +1682,11 @@ def test_get_stats_l1():
 @pytest.mark.skipif(not isSupportedFilesystem(), reason="cuFile handle_register requires ext4 or xfs filesystem")
 @pytest.mark.usefixtures("stats")
 @xfail_handle_register
-def test_get_stats_l2():
+@pytest.mark.thread_unsafe(reason="cuFile stats counters and collection state are process-global")
+def test_get_stats_l2(tmpdir):
     """Test cuFile L2 statistics retrieval with file operations."""
     # Create test file directly with O_DIRECT
-    file_path = "test_stats_l2.bin"
+    file_path = tmpdir / "test_stats_l2.bin"
     fd = os.open(file_path, os.O_CREAT | os.O_RDWR | os.O_DIRECT, 0o600)
 
     try:
@@ -1802,8 +1757,6 @@ def test_get_stats_l2():
 
     finally:
         os.close(fd)
-        with suppress(OSError):
-            os.unlink(file_path)
 
 
 @pytest.mark.skipif(
@@ -1812,10 +1765,11 @@ def test_get_stats_l2():
 @pytest.mark.skipif(not isSupportedFilesystem(), reason="cuFile handle_register requires ext4 or xfs filesystem")
 @pytest.mark.usefixtures("stats")
 @xfail_handle_register
-def test_get_stats_l3():
+@pytest.mark.thread_unsafe(reason="cuFile stats counters and collection state are process-global")
+def test_get_stats_l3(tmpdir):
     """Test cuFile L3 statistics retrieval with file operations."""
     # Create test file directly with O_DIRECT
-    file_path = "test_stats_l3.bin"
+    file_path = tmpdir / "test_stats_l3.bin"
     fd = os.open(file_path, os.O_CREAT | os.O_RDWR | os.O_DIRECT, 0o600)
 
     try:
@@ -1896,8 +1850,6 @@ def test_get_stats_l3():
 
     finally:
         os.close(fd)
-        with suppress(OSError):
-            os.unlink(file_path)
 
 
 @pytest.mark.skipif(

From 62790307ddbf118002f36b217bc4cb6c488bd50f Mon Sep 17 00:00:00 2001
From: Sebastian Berg <sebastianb@nvidia.com>
Date: Wed, 10 Jun 2026 21:38:28 +0200
Subject: [PATCH 08/17] TST: Move graph definitions inline and mark "global"
 ones as thread-unsafe always

Signed-off-by: Sebastian Berg <sebastianb@nvidia.com>
---
 .../tests/graph/test_graph_definition.py      | 107 +++++++++++-------
 .../tests/graph/test_graph_memory_resource.py |  29 +++--
 2 files changed, 82 insertions(+), 54 deletions(-)

diff --git a/cuda_core/tests/graph/test_graph_definition.py b/cuda_core/tests/graph/test_graph_definition.py
index da78bea577f..b18b2ac9bda 100644
--- a/cuda_core/tests/graph/test_graph_definition.py
+++ b/cuda_core/tests/graph/test_graph_definition.py
@@ -575,20 +575,6 @@ def node_spec(request, init_cuda):
 # =============================================================================
 
 
-@pytest.fixture
-def sample_graphdef(init_cuda):
-    """A sample GraphDefinition for standalone tests."""
-    return GraphDefinition()
-
-
-@pytest.fixture
-def dot_file(tmp_path):
-    """Temporary DOT file path, cleaned up after test."""
-    path = tmp_path / "graph.dot"
-    yield path
-    path.unlink(missing_ok=True)
-
-
 # =============================================================================
 # Topology tests (parameterized over graph specs)
 # =============================================================================
@@ -775,14 +761,16 @@ def registered(node):
 # =============================================================================
 
 
-def test_graphdef_handle_valid(sample_graphdef):
+def test_graphdef_handle_valid(init_cuda):
     """GraphDefinition has a valid non-null handle."""
+    sample_graphdef = GraphDefinition()
     assert sample_graphdef.handle is not None
     assert int(sample_graphdef.handle) != 0
 
 
-def test_graphdef_entry_is_virtual(sample_graphdef):
+def test_graphdef_entry_is_virtual(init_cuda):
     """Internal entry node is virtual (no pred/succ, type is None)."""
+    sample_graphdef = GraphDefinition()
     entry = sample_graphdef._entry
     assert isinstance(entry, GraphNode)
     assert entry.pred == set()
@@ -795,8 +783,9 @@ def test_graphdef_entry_is_virtual(sample_graphdef):
 # =============================================================================
 
 
-def test_alloc_zero_size_fails(sample_graphdef):
+def test_alloc_zero_size_fails(init_cuda):
     """Alloc with zero size raises error (CUDA limitation)."""
+    sample_graphdef = GraphDefinition()
     _skip_if_no_mempool()
     from cuda.core._utils.cuda_utils import CUDAError
 
@@ -804,8 +793,9 @@ def test_alloc_zero_size_fails(sample_graphdef):
         sample_graphdef.allocate(0)
 
 
-def test_free_creates_dependency(sample_graphdef):
+def test_free_creates_dependency(init_cuda):
     """Free node depends on its predecessor."""
+    sample_graphdef = GraphDefinition()
     _skip_if_no_mempool()
     with xfail_on_graph_mempool_oom():
         alloc = sample_graphdef.allocate(ALLOC_SIZE)
@@ -813,8 +803,9 @@ def test_free_creates_dependency(sample_graphdef):
     assert alloc in free.pred
 
 
-def test_alloc_free_chain(sample_graphdef):
+def test_alloc_free_chain(init_cuda):
     """Alloc and free can be chained."""
+    sample_graphdef = GraphDefinition()
     _skip_if_no_mempool()
     with xfail_on_graph_mempool_oom():
         a1 = sample_graphdef.allocate(ALLOC_SIZE)
@@ -831,8 +822,9 @@ def test_alloc_free_chain(sample_graphdef):
 # =============================================================================
 
 
-def test_alloc_memory_type_invalid(sample_graphdef):
+def test_alloc_memory_type_invalid(init_cuda):
     """Invalid memory type raises ValueError."""
+    sample_graphdef = GraphDefinition()
     with pytest.raises(ValueError, match="Invalid memory_type"):
         sample_graphdef.allocate(ALLOC_SIZE, memory_type="invalid")
 
@@ -844,8 +836,9 @@ def test_alloc_memory_type_invalid(sample_graphdef):
         pytest.param(lambda d: d, id="Device_object"),
     ],
 )
-def test_alloc_device_option(sample_graphdef, device_spec):
+def test_alloc_device_option(init_cuda, device_spec):
     """Device can be specified as int or Device object."""
+    sample_graphdef = GraphDefinition()
     _skip_if_no_mempool()
     device = Device()
     with xfail_on_graph_mempool_oom(device):
@@ -868,8 +861,9 @@ def test_alloc_peer_access(mempool_device_x2):
 
 
 @pytest.mark.parametrize("num_branches", [2, 3, 5])
-def test_join_merges_branches(sample_graphdef, num_branches):
+def test_join_merges_branches(init_cuda, num_branches):
     """join() with multiple branches creates correct dependencies."""
+    sample_graphdef = GraphDefinition()
     _skip_if_no_mempool()
     with xfail_on_graph_mempool_oom():
         branches = [sample_graphdef.allocate(ALLOC_SIZE) for _ in range(num_branches)]
@@ -883,8 +877,9 @@ def test_join_merges_branches(sample_graphdef, num_branches):
 # =============================================================================
 
 
-def test_launch_creates_node(sample_graphdef):
+def test_launch_creates_node(init_cuda):
     """launch() creates a KernelNode."""
+    sample_graphdef = GraphDefinition()
     mod = compile_common_kernels()
     kernel = mod.get_kernel("empty_kernel")
     config = LaunchConfig(grid=1, block=1)
@@ -892,8 +887,9 @@ def test_launch_creates_node(sample_graphdef):
     assert isinstance(node, KernelNode)
 
 
-def test_launch_chain_dependencies(sample_graphdef):
+def test_launch_chain_dependencies(init_cuda):
     """Chained launches create correct dependencies."""
+    sample_graphdef = GraphDefinition()
     mod = compile_common_kernels()
     kernel = mod.get_kernel("empty_kernel")
     config = LaunchConfig(grid=1, block=1)
@@ -955,15 +951,17 @@ def _instantiate_and_upload(graph_definition, kwargs, stream):
 
 
 @pytest.mark.parametrize("inst_kwargs", _INSTANTIATE_ONLY_OPTIONS)
-def test_instantiate_empty_graph(sample_graphdef, inst_kwargs):
+def test_instantiate_empty_graph(init_cuda, inst_kwargs):
     """Empty graph can be instantiated."""
+    sample_graphdef = GraphDefinition()
     graph = _instantiate(sample_graphdef, inst_kwargs)
     assert graph is not None
 
 
 @pytest.mark.parametrize("inst_kwargs", _INSTANTIATE_ONLY_OPTIONS)
-def test_instantiate_with_nodes(sample_graphdef, inst_kwargs):
+def test_instantiate_with_nodes(init_cuda, inst_kwargs):
     """Graph with nodes can be instantiated."""
+    sample_graphdef = GraphDefinition()
     _skip_if_no_mempool()
     with xfail_on_graph_mempool_oom():
         sample_graphdef.allocate(ALLOC_SIZE)
@@ -973,8 +971,9 @@ def test_instantiate_with_nodes(sample_graphdef, inst_kwargs):
 
 
 @pytest.mark.skipif(not Device(0).properties.unified_addressing, reason="requires unified addressing")
-def test_instantiate_and_execute_kernel_device_launch(sample_graphdef):
+def test_instantiate_and_execute_kernel_device_launch(init_cuda):
     """Kernel-only graph can be instantiated with device_launch flag."""
+    sample_graphdef = GraphDefinition()
     mod = compile_common_kernels()
     kernel = mod.get_kernel("empty_kernel")
     config = LaunchConfig(grid=1, block=1)
@@ -990,8 +989,9 @@ def test_instantiate_and_execute_kernel_device_launch(sample_graphdef):
 
 
 @pytest.mark.parametrize("inst_kwargs", _EXECUTE_OPTIONS)
-def test_instantiate_and_execute_kernel(sample_graphdef, inst_kwargs):
+def test_instantiate_and_execute_kernel(init_cuda, inst_kwargs):
     """Graph with kernel can be instantiated and executed."""
+    sample_graphdef = GraphDefinition()
     mod = compile_common_kernels()
     kernel = mod.get_kernel("empty_kernel")
     config = LaunchConfig(grid=1, block=1)
@@ -1004,8 +1004,9 @@ def test_instantiate_and_execute_kernel(sample_graphdef, inst_kwargs):
 
 
 @pytest.mark.parametrize("inst_kwargs", _EXECUTE_OPTIONS)
-def test_instantiate_and_execute_alloc_free(sample_graphdef, inst_kwargs):
+def test_instantiate_and_execute_alloc_free(init_cuda, inst_kwargs):
     """Graph with alloc/free can be executed."""
+    sample_graphdef = GraphDefinition()
     _skip_if_no_mempool()
     with xfail_on_graph_mempool_oom():
         alloc = sample_graphdef.allocate(ALLOC_SIZE)
@@ -1018,8 +1019,9 @@ def test_instantiate_and_execute_alloc_free(sample_graphdef, inst_kwargs):
 
 
 @pytest.mark.parametrize("inst_kwargs", _EXECUTE_OPTIONS)
-def test_instantiate_and_execute_memset(sample_graphdef, inst_kwargs):
+def test_instantiate_and_execute_memset(init_cuda, inst_kwargs):
     """Graph with alloc/memset/free can be executed."""
+    sample_graphdef = GraphDefinition()
     _skip_if_no_mempool()
     with xfail_on_graph_mempool_oom():
         alloc = sample_graphdef.allocate(ALLOC_SIZE)
@@ -1033,8 +1035,9 @@ def test_instantiate_and_execute_memset(sample_graphdef, inst_kwargs):
 
 
 @pytest.mark.parametrize("inst_kwargs", _EXECUTE_OPTIONS)
-def test_instantiate_and_execute_memcpy(sample_graphdef, inst_kwargs):
+def test_instantiate_and_execute_memcpy(init_cuda, inst_kwargs):
     """Graph with alloc/memset/memcpy/free can be executed and data is copied."""
+    sample_graphdef = GraphDefinition()
     _skip_if_no_mempool()
     import ctypes
 
@@ -1058,8 +1061,9 @@ def test_instantiate_and_execute_memcpy(sample_graphdef, inst_kwargs):
     assert all(b == 0xAB for b in host_buf)
 
 
-def test_instantiate_and_execute_child_graph(sample_graphdef):
+def test_instantiate_and_execute_child_graph(init_cuda):
     """Graph with embedded child graph can be executed."""
+    sample_graphdef = GraphDefinition()
     child = GraphDefinition()
     mod = compile_common_kernels()
     kernel = mod.get_kernel("empty_kernel")
@@ -1075,8 +1079,9 @@ def test_instantiate_and_execute_child_graph(sample_graphdef):
     stream.sync()
 
 
-def test_instantiate_and_execute_host_callback(sample_graphdef):
+def test_instantiate_and_execute_host_callback(init_cuda):
     """Graph with host callback can be executed and callback is invoked."""
+    sample_graphdef = GraphDefinition()
     results = []
 
     def my_callback():
@@ -1093,8 +1098,9 @@ def my_callback():
     assert results == [42]
 
 
-def test_instantiate_and_execute_host_callback_cfunc(sample_graphdef):
+def test_instantiate_and_execute_host_callback_cfunc(init_cuda):
     """Graph with ctypes function pointer callback can be executed."""
+    sample_graphdef = GraphDefinition()
     import ctypes
 
     CALLBACK = ctypes.CFUNCTYPE(None, ctypes.c_void_p)
@@ -1115,8 +1121,9 @@ def raw_fn(data):
     assert called[0]
 
 
-def test_host_callback_cfunc_with_user_data(sample_graphdef):
+def test_host_callback_cfunc_with_user_data(init_cuda):
     """Host callback with bytes user_data passes data to C function."""
+    sample_graphdef = GraphDefinition()
     import ctypes
 
     CALLBACK = ctypes.CFUNCTYPE(None, ctypes.c_void_p)
@@ -1137,14 +1144,16 @@ def read_byte(data):
     assert result[0] == 0xAB
 
 
-def test_host_callback_user_data_rejected_for_python_callable(sample_graphdef):
+def test_host_callback_user_data_rejected_for_python_callable(init_cuda):
     """user_data is rejected for Python callables."""
+    sample_graphdef = GraphDefinition()
     with pytest.raises(ValueError, match="user_data is only supported"):
         sample_graphdef.callback(lambda: None, user_data=b"hello")
 
 
-def test_instantiate_and_execute_event_record_wait(sample_graphdef):
+def test_instantiate_and_execute_event_record_wait(init_cuda):
     """Graph with event record and wait nodes can be executed."""
+    sample_graphdef = GraphDefinition()
     event = Device().create_event()
     rec = sample_graphdef.record(event)
     rec.wait(event)
@@ -1166,8 +1175,9 @@ def _skip_unless_cc_90():
         pytest.skip("Conditional node execution requires CC >= 9.0 (Hopper)")
 
 
-def test_instantiate_and_execute_if_then(sample_graphdef):
+def test_instantiate_and_execute_if_then(init_cuda):
     """If-conditional node: body executes only when condition is non-zero."""
+    sample_graphdef = GraphDefinition()
     _skip_unless_cc_90()
     _skip_if_no_mempool()
     import ctypes
@@ -1199,8 +1209,9 @@ def test_instantiate_and_execute_if_then(sample_graphdef):
     assert result[0] == 1
 
 
-def test_instantiate_and_execute_if_else(sample_graphdef):
+def test_instantiate_and_execute_if_else(init_cuda):
     """If-else node: then or else branch executes based on condition."""
+    sample_graphdef = GraphDefinition()
     _skip_unless_cc_90()
     _skip_if_no_mempool()
     import ctypes
@@ -1234,8 +1245,9 @@ def test_instantiate_and_execute_if_else(sample_graphdef):
     assert result[0] == 2
 
 
-def test_instantiate_and_execute_switch(sample_graphdef):
+def test_instantiate_and_execute_switch(init_cuda):
     """Switch node: selected branch executes based on condition value."""
+    sample_graphdef = GraphDefinition()
     _skip_unless_cc_90()
     _skip_if_no_mempool()
     import ctypes
@@ -1268,8 +1280,9 @@ def test_instantiate_and_execute_switch(sample_graphdef):
     assert result[0] == 1
 
 
-def test_conditional_node_type_preserved_by_nodes(sample_graphdef):
+def test_conditional_node_type_preserved_by_nodes(init_cuda):
     """Conditional nodes appear as ConditionalNode base when read back from graph."""
+    sample_graphdef = GraphDefinition()
     condition = try_create_condition(sample_graphdef)
     if_node = sample_graphdef.if_then(condition)
     assert isinstance(if_node, IfNode)
@@ -1285,8 +1298,10 @@ def test_conditional_node_type_preserved_by_nodes(sample_graphdef):
 # =============================================================================
 
 
-def test_debug_dot_print_creates_file(sample_graphdef, dot_file):
+def test_debug_dot_print_creates_file(init_cuda, tmp_path):
     """debug_dot_print writes a DOT file."""
+    sample_graphdef = GraphDefinition()
+    dot_file = tmp_path / "graph.dot"
     _skip_if_no_mempool()
     with xfail_on_graph_mempool_oom():
         sample_graphdef.allocate(ALLOC_SIZE)
@@ -1296,8 +1311,10 @@ def test_debug_dot_print_creates_file(sample_graphdef, dot_file):
     assert "digraph" in content
 
 
-def test_debug_dot_print_with_options(sample_graphdef, dot_file):
+def test_debug_dot_print_with_options(init_cuda, tmp_path):
     """debug_dot_print accepts GraphDebugPrintOptions."""
+    sample_graphdef = GraphDefinition()
+    dot_file = tmp_path / "graph.dot"
     _skip_if_no_mempool()
     with xfail_on_graph_mempool_oom():
         sample_graphdef.allocate(ALLOC_SIZE)
@@ -1306,8 +1323,10 @@ def test_debug_dot_print_with_options(sample_graphdef, dot_file):
     assert dot_file.exists()
 
 
-def test_debug_dot_print_invalid_options(sample_graphdef, dot_file):
+def test_debug_dot_print_invalid_options(init_cuda, tmp_path):
     """debug_dot_print rejects invalid options type."""
+    sample_graphdef = GraphDefinition()
+    dot_file = tmp_path / "graph.dot"
     _skip_if_no_mempool()
     with xfail_on_graph_mempool_oom():
         sample_graphdef.allocate(ALLOC_SIZE)
diff --git a/cuda_core/tests/graph/test_graph_memory_resource.py b/cuda_core/tests/graph/test_graph_memory_resource.py
index 9fc794f4cca..482e8fe1c57 100644
--- a/cuda_core/tests/graph/test_graph_memory_resource.py
+++ b/cuda_core/tests/graph/test_graph_memory_resource.py
@@ -21,6 +21,13 @@
 from cuda.core._utils.cuda_utils import CUDAError
 from cuda.core.graph import GraphCompleteOptions
 
+# NOTE(seberg): "global" mode seems thread-unsafe even when the graph builder is created from a stream
+_GRAPH_MODES = [
+    pytest.param("global", marks=pytest.mark.thread_unsafe(reason="gb instances share stream unsafely")),
+    "thread_local",
+    "relaxed",
+]
+
 
 def _common_kernels_alloc():
     code = """
@@ -80,7 +87,7 @@ def free(self, buffers):
         self.stream.sync()
 
 
-@pytest.mark.parametrize("mode", ["no_graph", "global", "thread_local", "relaxed"])
+@pytest.mark.parametrize("mode", ["no_graph"] + _GRAPH_MODES)
 @pytest.mark.parametrize("action", ["incr", "fill"])
 def test_graph_alloc(mempool_device, mode, action):
     """Test basic graph capture with memory allocated and deallocated by
@@ -130,7 +137,7 @@ def apply_kernels(mr, stream, out):
         assert compare_buffer_to_constant(out, 3)
     else:
         # Capture work, then upload and launch.
-        gb = device.create_graph_builder().begin_building(mode)
+        gb = stream.create_graph_builder().begin_building(mode)
         with xfail_on_graph_mempool_oom(device):
             apply_kernels(mr=gmr, stream=gb, out=out)
             graph = gb.end_building().complete()
@@ -150,7 +157,7 @@ def apply_kernels(mr, stream, out):
 
 
 @pytest.mark.skipif(IS_WINDOWS or IS_WSL, reason="auto_free_on_launch not supported on Windows")
-@pytest.mark.parametrize("mode", ["global", "thread_local", "relaxed"])
+@pytest.mark.parametrize("mode", _GRAPH_MODES)
 def test_graph_alloc_with_output(mempool_device, mode):
     """Test for memory allocated in a graph being used outside the graph."""
     NBYTES = 64
@@ -168,7 +175,7 @@ def test_graph_alloc_with_output(mempool_device, mode):
     # Construct a graph to copy and increment the input. It returns a new
     # buffer allocated within the graph.  The auto_free_on_launch option
     # is required to properly use the output buffer.
-    gb = device.create_graph_builder().begin_building(mode)
+    gb = stream.create_graph_builder().begin_building(mode)
     with xfail_on_graph_mempool_oom(device):
         out = gmr.allocate(NBYTES, stream=gb)
         out.copy_from(in_, stream=gb)
@@ -195,7 +202,8 @@ def test_graph_alloc_with_output(mempool_device, mode):
     assert compare_buffer_to_constant(out, 6)
 
 
-@pytest.mark.parametrize("mode", ["global", "thread_local", "relaxed"])
+@pytest.mark.parametrize("mode", _GRAPH_MODES)
+@pytest.mark.thread_unsafe(reason="gb instances share default stream")
 def test_graph_mem_alloc_zero(mempool_device, mode):
     device = mempool_device
     gb = device.create_graph_builder().begin_building(mode)
@@ -213,7 +221,8 @@ def test_graph_mem_alloc_zero(mempool_device, mode):
     assert buffer.device_id == int(device)
 
 
-@pytest.mark.parametrize("mode", ["global", "thread_local", "relaxed"])
+@pytest.mark.parametrize("mode", _GRAPH_MODES)
+@pytest.mark.thread_unsafe(reason="GMR is shared, so high mark is global")
 def test_graph_mem_set_attributes(mempool_device, mode):
     device = mempool_device
     stream = device.create_stream()
@@ -265,7 +274,7 @@ def test_graph_mem_set_attributes(mempool_device, mode):
     mman.reset()
 
 
-@pytest.mark.parametrize("mode", ["global", "thread_local", "relaxed"])
+@pytest.mark.parametrize("mode", _GRAPH_MODES)
 def test_gmr_check_capture_state(mempool_device, mode):
     """
     Test expected errors (and non-errors) using GraphMemoryResource with graph
@@ -284,7 +293,7 @@ def test_gmr_check_capture_state(mempool_device, mode):
         gmr.allocate(1, stream=stream)
 
     # Capturing
-    gb = device.create_graph_builder().begin_building(mode=mode)
+    gb = stream.create_graph_builder().begin_building(mode=mode)
     with xfail_on_graph_mempool_oom(device):
         gmr.allocate(1, stream=gb)  # no error
         gb.end_building().complete()
@@ -320,7 +329,7 @@ def test_graph_memory_resource_attributes_repr(mempool_device):
     assert "used_mem_high=" in r
 
 
-@pytest.mark.parametrize("mode", ["global", "thread_local", "relaxed"])
+@pytest.mark.parametrize("mode", _GRAPH_MODES)
 def test_dmr_check_capture_state(mempool_device, mode):
     """
     Test expected errors (and non-errors) using DeviceMemoryResource with graph
@@ -334,7 +343,7 @@ def test_dmr_check_capture_state(mempool_device, mode):
     dmr.allocate(1, stream=stream).close()  # no error
 
     # Capturing
-    gb = device.create_graph_builder().begin_building(mode=mode)
+    gb = stream.create_graph_builder().begin_building(mode=mode)
     with pytest.raises(
         RuntimeError,
         match=r"cannot perform memory operations on a capturing "

From 58e7f2f742f666d7b80eec846249f86d4956675e Mon Sep 17 00:00:00 2001
From: Sebastian Berg <sebastianb@nvidia.com>
Date: Wed, 10 Jun 2026 21:41:12 +0200
Subject: [PATCH 09/17] TST: Fixup memory tests, mostly work around issue when
 tearing down mempool

Signed-off-by: Sebastian Berg <sebastianb@nvidia.com>
---
 cuda_core/tests/memory_ipc/test_peer_access.py |  4 ++++
 .../tests/memory_ipc/test_send_buffers.py      |  5 +++++
 cuda_core/tests/memory_ipc/test_workerpool.py  | 18 +++++++++++++++---
 cuda_core/tests/test_memory.py                 | 15 ++++++++++++---
 4 files changed, 36 insertions(+), 6 deletions(-)

diff --git a/cuda_core/tests/memory_ipc/test_peer_access.py b/cuda_core/tests/memory_ipc/test_peer_access.py
index efb67b4cdb8..ac7f71a88e9 100644
--- a/cuda_core/tests/memory_ipc/test_peer_access.py
+++ b/cuda_core/tests/memory_ipc/test_peer_access.py
@@ -92,6 +92,8 @@ def test_main(self, ipc_mempool_device_x2, grant_access_in_parent):
         assert process.exitcode == 0
 
         buffer.close()
+        # TODO(seberg): 2026-06: mr close may be unsafe with incomplete `buf.close()`
+        dev1.sync()
         mr.close()
 
     def child_main(self, mr, buffer):
@@ -129,4 +131,6 @@ def child_main(self, mr, buffer):
             PatternGen(dev0, NBYTES).verify_buffer(buffer, seed=False)
 
         buffer.close()
+        # TODO(seberg): 2026-06: mr close may be unsafe with incomplete `buf.close()`
+        dev1.sync()
         mr.close()
diff --git a/cuda_core/tests/memory_ipc/test_send_buffers.py b/cuda_core/tests/memory_ipc/test_send_buffers.py
index 01c9496e773..59216cd9cce 100644
--- a/cuda_core/tests/memory_ipc/test_send_buffers.py
+++ b/cuda_core/tests/memory_ipc/test_send_buffers.py
@@ -29,6 +29,7 @@ def test_main(self, ipc_device, nmrs):
         device = ipc_device
         options = DeviceMemoryResourceOptions(max_size=POOL_SIZE, ipc_enabled=True)
         mrs = [DeviceMemoryResource(device, options=options) for _ in range(nmrs)]
+        buffers = []
 
         try:
             # Allocate and fill memory.
@@ -54,6 +55,10 @@ def test_main(self, ipc_device, nmrs):
                 pgen.verify_buffer(buffer, seed=True)
                 buffer.close()
         finally:
+            for buffer in buffers:
+                buffer.close()
+            # TODO(seberg): 2026-06: mr close may be unsafe with incomplete `buf.close()`
+            device.sync()
             for mr in mrs:
                 mr.close()
 
diff --git a/cuda_core/tests/memory_ipc/test_workerpool.py b/cuda_core/tests/memory_ipc/test_workerpool.py
index 08d9bd79d92..358c16fd7bf 100644
--- a/cuda_core/tests/memory_ipc/test_workerpool.py
+++ b/cuda_core/tests/memory_ipc/test_workerpool.py
@@ -35,6 +35,7 @@ def test_main(self, ipc_device, nmrs):
         device = ipc_device
         options = DeviceMemoryResourceOptions(max_size=POOL_SIZE, ipc_enabled=True)
         mrs = [DeviceMemoryResource(device, options=options) for _ in range(nmrs)]
+        buffers = []
 
         try:
             buffers = [mr.allocate(NBYTES, stream=device.default_stream) for mr, _ in zip(cycle(mrs), range(NTASKS))]
@@ -45,8 +46,11 @@ def test_main(self, ipc_device, nmrs):
             pgen = PatternGen(device, NBYTES)
             for buffer in buffers:
                 pgen.verify_buffer(buffer, seed=True)
-                buffer.close()
         finally:
+            for buffer in buffers:
+                buffer.close()
+            # TODO(seberg): 2026-06: mr close may be unsafe with incomplete `buf.close()`
+            device.sync()
             for mr in mrs:
                 mr.close()
 
@@ -77,6 +81,7 @@ def test_main(self, ipc_device, nmrs):
         device = ipc_device
         options = DeviceMemoryResourceOptions(max_size=POOL_SIZE, ipc_enabled=True)
         mrs = [DeviceMemoryResource(device, options=options) for _ in range(nmrs)]
+        buffers = []
 
         try:
             buffers = [mr.allocate(NBYTES, stream=device.default_stream) for mr, _ in zip(cycle(mrs), range(NTASKS))]
@@ -90,8 +95,11 @@ def test_main(self, ipc_device, nmrs):
             pgen = PatternGen(device, NBYTES)
             for buffer in buffers:
                 pgen.verify_buffer(buffer, seed=True)
-                buffer.close()
         finally:
+            for buffer in buffers:
+                buffer.close()
+            # TODO(seberg): 2026-06: mr close may be unsafe with incomplete `buf.close()`
+            device.sync()
             for mr in mrs:
                 mr.close()
 
@@ -127,6 +135,7 @@ def test_main(self, ipc_device, nmrs):
         device = ipc_device
         options = DeviceMemoryResourceOptions(max_size=POOL_SIZE, ipc_enabled=True)
         mrs = [DeviceMemoryResource(device, options=options) for _ in range(nmrs)]
+        buffers = []
 
         try:
             buffers = [mr.allocate(NBYTES, stream=device.default_stream) for mr, _ in zip(cycle(mrs), range(NTASKS))]
@@ -137,8 +146,11 @@ def test_main(self, ipc_device, nmrs):
             pgen = PatternGen(device, NBYTES)
             for buffer in buffers:
                 pgen.verify_buffer(buffer, seed=True)
-                buffer.close()
         finally:
+            for buffer in buffers:
+                buffer.close()
+            # TODO(seberg): 2026-06: mr close may be unsafe with incomplete `buf.close()`
+            device.sync()
             for mr in mrs:
                 mr.close()
 
diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py
index 35592485c94..1c90d75fd75 100644
--- a/cuda_core/tests/test_memory.py
+++ b/cuda_core/tests/test_memory.py
@@ -251,9 +251,8 @@ def _pattern_bytes(value) -> bytes:
 
 
 @pytest.fixture(params=["device", "unified", "pinned"])
-def fill_env(request):
-    device = Device()
-    device.set_current()
+def fill_env(request, init_cuda):
+    device = init_cuda
     if request.param == "device":
         mr = DummyDeviceMemoryResource(device)
     elif request.param == "unified":
@@ -1103,6 +1102,8 @@ def test_device_memory_resource_with_options(init_cuda):
     device.sync()
     dst_buffer.close()
     src_buffer.close()
+    # TODO(seberg): 2026-06: mr close may be unsafe with incomplete `buf.close()`
+    device.sync()
 
 
 def test_pinned_memory_resource_with_options(init_cuda):
@@ -1149,6 +1150,8 @@ def test_pinned_memory_resource_with_options(init_cuda):
     device.sync()
     dst_buffer.close()
     src_buffer.close()
+    # TODO(seberg): 2026-06: mr close may be unsafe with incomplete `buf.close()`
+    device.sync()
 
 
 def test_managed_memory_resource_with_options(init_cuda):
@@ -1365,6 +1368,8 @@ def test_mempool_ipc_errors(mempool_device):
         Buffer.from_ipc_descriptor(mr, handle, stream=device.default_stream)
 
     buffer.close()
+    # TODO(seberg): 2026-06: mr close may be unsafe with incomplete `buf.close()`
+    device.sync()
 
 
 def test_pinned_mempool_ipc_basic():
@@ -1405,6 +1410,8 @@ def test_pinned_mempool_ipc_basic():
     assert ipc_desc.size == 1024
 
     buffer.close()
+    # TODO(seberg): 2026-06: mr close may be unsafe with incomplete `buf.close()`
+    device.sync()
     mr.close()
 
 
@@ -1436,6 +1443,8 @@ def test_pinned_mempool_ipc_errors():
         Buffer.from_ipc_descriptor(mr, handle, stream=device.default_stream)
 
     buffer.close()
+    # TODO(seberg): 2026-06: mr close may be unsafe with incomplete `buf.close()`
+    device.sync()
     mr.close()
 
 

From a5988c4233757880551f3f34290a38a6494b2479 Mon Sep 17 00:00:00 2001
From: Sebastian Berg <sebastianb@nvidia.com>
Date: Wed, 10 Jun 2026 21:42:38 +0200
Subject: [PATCH 10/17] TST: Thread unsafe markers for test_managed_ops

Signed-off-by: Sebastian Berg <sebastianb@nvidia.com>
---
 cuda_core/tests/memory/test_managed_ops.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/cuda_core/tests/memory/test_managed_ops.py b/cuda_core/tests/memory/test_managed_ops.py
index b1f0c74664a..89917041c93 100644
--- a/cuda_core/tests/memory/test_managed_ops.py
+++ b/cuda_core/tests/memory/test_managed_ops.py
@@ -345,6 +345,7 @@ def test_from_handle(self, init_cuda):
         finally:
             plain.close()
 
+    @pytest.mark.thread_unsafe(reason="external_managed_buffer is shared between threads")
     def test_read_mostly_roundtrip(self, external_managed_buffer):
         buf = external_managed_buffer
         assert buf.read_mostly is False
@@ -353,6 +354,7 @@ def test_read_mostly_roundtrip(self, external_managed_buffer):
         buf.read_mostly = False
         assert buf.read_mostly is False
 
+    @pytest.mark.thread_unsafe(reason="external_managed_buffer is shared between threads")
     def test_preferred_location_roundtrip(self, location_ops_device, external_managed_buffer):
         device = location_ops_device
         buf = external_managed_buffer
@@ -367,6 +369,7 @@ def test_preferred_location_roundtrip(self, location_ops_device, external_manage
         buf.preferred_location = None
         assert buf.preferred_location is None
 
+    @pytest.mark.thread_unsafe(reason="external_managed_buffer is shared between threads")
     def test_preferred_location_roundtrip_host_numa(self, location_ops_device):
         """Host(numa_id=N) round-trips correctly on CUDA 13 builds."""
         from cuda.core._utils.version import binding_version
@@ -387,6 +390,7 @@ def test_preferred_location_roundtrip_host_numa(self, location_ops_device):
         finally:
             plain.close()
 
+    @pytest.mark.thread_unsafe(reason="external_managed_buffer is shared between threads")
     def test_accessed_by_add_discard(self, location_ops_device, external_managed_buffer):
         device = location_ops_device
         buf = external_managed_buffer
@@ -398,6 +402,7 @@ def test_accessed_by_add_discard(self, location_ops_device, external_managed_buf
         buf.accessed_by.discard(device)
         assert device not in buf.accessed_by
 
+    @pytest.mark.thread_unsafe(reason="external_managed_buffer is shared between threads")
     def test_accessed_by_mutable_set_interface(self, location_ops_device, external_managed_buffer):
         """Full MutableSet conformance pass on AccessedBySetProxy.
 
@@ -417,6 +422,7 @@ def test_accessed_by_mutable_set_interface(self, location_ops_device, external_m
             non_member=Host(numa_id=0),
         )
 
+    @pytest.mark.thread_unsafe(reason="external_managed_buffer is shared between threads")
     def test_accessed_by_set_assignment(self, location_ops_device, external_managed_buffer):
         device = location_ops_device
         buf = external_managed_buffer

From 24751281283ccd7c9b7384c453861de9b3611c31 Mon Sep 17 00:00:00 2001
From: Sebastian Berg <sebastianb@nvidia.com>
Date: Wed, 10 Jun 2026 21:43:24 +0200
Subject: [PATCH 11/17] Avoid interactive backend when using run_tests.sh
 locally

Signed-off-by: Sebastian Berg <sebastianb@nvidia.com>
---
 cuda_bindings/tests/test_examples.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/cuda_bindings/tests/test_examples.py b/cuda_bindings/tests/test_examples.py
index 0c3efe72811..bd7b51434fd 100644
--- a/cuda_bindings/tests/test_examples.py
+++ b/cuda_bindings/tests/test_examples.py
@@ -20,6 +20,7 @@ def test_example(example):
 
     env = os.environ.copy()
     env["CUDA_BINDINGS_SKIP_EXAMPLE"] = "100"
+    env["MPLBACKEND"] = "Agg"  # avoid plt.show() from blocking
 
     process = subprocess.run([sys.executable, example], capture_output=True, env=env)  # noqa: S603
     # returncode is a special value used in the examples to indicate that system requirements are not met.

From 4fe255ece89e6cee8ccba1ed350985741bca99ea Mon Sep 17 00:00:00 2001
From: Sebastian Berg <sebastianb@nvidia.com>
Date: Wed, 10 Jun 2026 22:13:22 +0200
Subject: [PATCH 12/17] Use indirect fixtures for a nicer pattern and avoid
 thread issues

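After a first AI-assisted attempt turned into a crazy mess, a second run
actually found a neat solution: resolve the sample objects through
indirect fixtures.
These objects can be created in the main thread, but we can't create
them on the fly in many threads, as the tests did before (by calling
`request.getfixturevalue()` inside the test body).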

Signed-off-by: Sebastian Berg <sebastianb@nvidia.com>
---
 cuda_core/tests/test_object_protocols.py | 166 ++++++++++++-----------
 1 file changed, 85 insertions(+), 81 deletions(-)

diff --git a/cuda_core/tests/test_object_protocols.py b/cuda_core/tests/test_object_protocols.py
index d1085a952bb..e4e8ee21c9b 100644
--- a/cuda_core/tests/test_object_protocols.py
+++ b/cuda_core/tests/test_object_protocols.py
@@ -233,7 +233,11 @@ def sample_ipc_buffer_descriptor(ipc_device):
     options = DeviceMemoryResourceOptions(max_size=POOL_SIZE, ipc_enabled=True)
     mr = DeviceMemoryResource(ipc_device, options=options)
     buf = mr.allocate(64, stream=ipc_device.default_stream)
-    return buf.ipc_descriptor
+    descriptor = buf.ipc_descriptor
+    buf.close()
+    # TODO(seberg): 2026-06: mr close may be unsafe with incomplete `buf.close()`
+    ipc_device.sync()
+    return descriptor
 
 
 @pytest.fixture
@@ -523,6 +527,26 @@ def sample_switch_node_alt(sample_graphdef):
     return sample_graphdef.switch(condition, 3)
 
 
+# Indirect-parametrize helpers: request.getfixturevalue() runs here, in the
+# fixture (main thread), so the resolved object is already available when the
+# test function runs in a worker thread.
+
+
+@pytest.fixture
+def sample_object(request):
+    return request.getfixturevalue(request.param)
+
+
+@pytest.fixture
+def sample_object_a(request):
+    return request.getfixturevalue(request.param)
+
+
+@pytest.fixture
+def sample_object_b(request):
+    return request.getfixturevalue(request.param)
+
+
 # =============================================================================
 # Type groupings
 # =============================================================================
@@ -718,12 +742,11 @@ def sample_switch_node_alt(sample_graphdef):
 # =============================================================================
 
 
-@pytest.mark.parametrize("fixture_name", WEAKREF_TYPES)
-def test_weakref_supported(fixture_name, request):
+@pytest.mark.parametrize("sample_object", WEAKREF_TYPES, indirect=True)
+def test_weakref_supported(sample_object):
     """Object supports weak references."""
-    obj = request.getfixturevalue(fixture_name)
-    ref = weakref.ref(obj)
-    assert ref() is obj
+    ref = weakref.ref(sample_object)
+    assert ref() is sample_object
 
 
 # =============================================================================
@@ -731,27 +754,22 @@ def test_weakref_supported(fixture_name, request):
 # =============================================================================
 
 
-@pytest.mark.parametrize("fixture_name", HASH_TYPES)
-def test_hash_consistency(fixture_name, request):
+@pytest.mark.parametrize("sample_object", HASH_TYPES, indirect=True)
+def test_hash_consistency(sample_object):
     """Hash is consistent across multiple calls."""
-    obj = request.getfixturevalue(fixture_name)
-    assert hash(obj) == hash(obj)
+    assert hash(sample_object) == hash(sample_object)
 
 
-@pytest.mark.parametrize("a_name,b_name", SAME_TYPE_PAIRS)
-def test_hash_distinct_same_type(a_name, b_name, request):
+@pytest.mark.parametrize("sample_object_a,sample_object_b", SAME_TYPE_PAIRS, indirect=True)
+def test_hash_distinct_same_type(sample_object_a, sample_object_b):
     """Distinct objects of the same type have different hashes."""
-    obj_a = request.getfixturevalue(a_name)
-    obj_b = request.getfixturevalue(b_name)
-    assert hash(obj_a) != hash(obj_b)  # extremely unlikely
+    assert hash(sample_object_a) != hash(sample_object_b)  # extremely unlikely
 
 
-@pytest.mark.parametrize("a_name,b_name", itertools.combinations(HASH_TYPES, 2))
-def test_hash_distinct_cross_type(a_name, b_name, request):
+@pytest.mark.parametrize("sample_object_a,sample_object_b", itertools.combinations(HASH_TYPES, 2), indirect=True)
+def test_hash_distinct_cross_type(sample_object_a, sample_object_b):
     """Distinct objects of different types have different hashes."""
-    obj_a = request.getfixturevalue(a_name)
-    obj_b = request.getfixturevalue(b_name)
-    assert hash(obj_a) != hash(obj_b)  # extremely unlikely
+    assert hash(sample_object_a) != hash(sample_object_b)  # extremely unlikely
 
 
 # =============================================================================
@@ -759,41 +777,35 @@ def test_hash_distinct_cross_type(a_name, b_name, request):
 # =============================================================================
 
 
-@pytest.mark.parametrize("fixture_name", EQ_TYPES)
-def test_equality_basic(fixture_name, request):
+@pytest.mark.parametrize("sample_object", EQ_TYPES, indirect=True)
+def test_equality_basic(sample_object):
     """Object equality: reflexive, not equal to None or other types."""
-    obj = request.getfixturevalue(fixture_name)
-    assert obj == obj
-    assert obj is not None
-    assert obj != "string"
-    if hasattr(obj, "handle"):
-        assert obj != obj.handle
+    assert sample_object == sample_object
+    assert sample_object is not None
+    assert sample_object != "string"
+    if hasattr(sample_object, "handle"):
+        assert sample_object != sample_object.handle
 
 
-@pytest.mark.parametrize("a_name,b_name", itertools.combinations(EQ_TYPES, 2))
-def test_no_cross_type_equality(a_name, b_name, request):
+@pytest.mark.parametrize("sample_object_a,sample_object_b", itertools.combinations(EQ_TYPES, 2), indirect=True)
+def test_no_cross_type_equality(sample_object_a, sample_object_b):
     """No two distinct objects of different types should compare equal."""
-    obj_a = request.getfixturevalue(a_name)
-    obj_b = request.getfixturevalue(b_name)
-    assert obj_a != obj_b
+    assert sample_object_a != sample_object_b
 
 
-@pytest.mark.parametrize("a_name,b_name", SAME_TYPE_PAIRS)
-def test_same_type_inequality(a_name, b_name, request):
+@pytest.mark.parametrize("sample_object_a,sample_object_b", SAME_TYPE_PAIRS, indirect=True)
+def test_same_type_inequality(sample_object_a, sample_object_b):
     """Two distinct objects of the same type should not compare equal."""
-    obj_a = request.getfixturevalue(a_name)
-    obj_b = request.getfixturevalue(b_name)
-    assert obj_a is not obj_b
-    assert obj_a != obj_b
+    assert sample_object_a is not sample_object_b
+    assert sample_object_a != sample_object_b
 
 
-@pytest.mark.parametrize("fixture_name,copy_fn", FROM_HANDLE_COPIES)
-def test_equality_same_handle(fixture_name, copy_fn, request):
+@pytest.mark.parametrize("sample_object,copy_fn", FROM_HANDLE_COPIES, indirect=["sample_object"])
+def test_equality_same_handle(sample_object, copy_fn):
     """Two wrappers around the same handle should compare equal."""
-    obj = request.getfixturevalue(fixture_name)
-    obj2 = copy_fn(obj)
-    assert obj == obj2
-    assert hash(obj) == hash(obj2)
+    obj2 = copy_fn(sample_object)
+    assert sample_object == obj2
+    assert hash(sample_object) == hash(obj2)
 
 
 # =============================================================================
@@ -801,48 +813,43 @@ def test_equality_same_handle(fixture_name, copy_fn, request):
 # =============================================================================
 
 
-@pytest.mark.parametrize("fixture_name", DICT_KEY_TYPES)
-def test_usable_as_dict_key(fixture_name, request):
+@pytest.mark.parametrize("sample_object", DICT_KEY_TYPES, indirect=True)
+def test_usable_as_dict_key(sample_object):
     """Object can be used as a dictionary key."""
-    obj = request.getfixturevalue(fixture_name)
-    d = {obj: "value"}
-    assert d[obj] == "value"
-    assert obj in d
+    d = {sample_object: "value"}
+    assert d[sample_object] == "value"
+    assert sample_object in d
 
 
-@pytest.mark.parametrize("fixture_name", DICT_KEY_TYPES)
-def test_usable_in_set(fixture_name, request):
+@pytest.mark.parametrize("sample_object", DICT_KEY_TYPES, indirect=True)
+def test_usable_in_set(sample_object):
     """Object can be added to a set."""
-    obj = request.getfixturevalue(fixture_name)
-    s = {obj}
-    assert obj in s
+    s = {sample_object}
+    assert sample_object in s
 
 
-@pytest.mark.parametrize("fixture_name", WEAKREF_TYPES)
-def test_usable_in_weak_value_dict(fixture_name, request):
+@pytest.mark.parametrize("sample_object", WEAKREF_TYPES, indirect=True)
+def test_usable_in_weak_value_dict(sample_object):
     """Object can be used as a WeakValueDictionary value."""
-    obj = request.getfixturevalue(fixture_name)
     wvd = weakref.WeakValueDictionary()
-    wvd["key"] = obj
-    assert wvd["key"] is obj
+    wvd["key"] = sample_object
+    assert wvd["key"] is sample_object
 
 
-@pytest.mark.parametrize("fixture_name", WEAK_KEY_TYPES)
-def test_usable_in_weak_key_dict(fixture_name, request):
+@pytest.mark.parametrize("sample_object", WEAK_KEY_TYPES, indirect=True)
+def test_usable_in_weak_key_dict(sample_object):
     """Object can be used as a WeakKeyDictionary key."""
-    obj = request.getfixturevalue(fixture_name)
     wkd = weakref.WeakKeyDictionary()
-    wkd[obj] = "value"
-    assert wkd[obj] == "value"
+    wkd[sample_object] = "value"
+    assert wkd[sample_object] == "value"
 
 
-@pytest.mark.parametrize("fixture_name", WEAK_KEY_TYPES)
-def test_usable_in_weak_set(fixture_name, request):
+@pytest.mark.parametrize("sample_object", WEAK_KEY_TYPES, indirect=True)
+def test_usable_in_weak_set(sample_object):
     """Object can be added to a WeakSet."""
-    obj = request.getfixturevalue(fixture_name)
     ws = weakref.WeakSet()
-    ws.add(obj)
-    assert obj in ws
+    ws.add(sample_object)
+    assert sample_object in ws
 
 
 # =============================================================================
@@ -850,12 +857,10 @@ def test_usable_in_weak_set(fixture_name, request):
 # =============================================================================
 
 
-@pytest.mark.parametrize("fixture_name,pattern", REPR_PATTERNS)
-def test_repr_format(fixture_name, pattern, request):
+@pytest.mark.parametrize("sample_object,pattern", REPR_PATTERNS, indirect=["sample_object"])
+def test_repr_format(sample_object, pattern):
     """repr() returns a properly formatted string."""
-    obj = request.getfixturevalue(fixture_name)
-    result = repr(obj)
-    assert re.fullmatch(pattern, result)
+    assert re.fullmatch(pattern, repr(sample_object))
 
 
 # =============================================================================
@@ -864,10 +869,9 @@ def test_repr_format(fixture_name, pattern, request):
 
 
 @pytest.mark.parametrize("pickle_module", PICKLE_MODULES)
-@pytest.mark.parametrize("fixture_name", PICKLE_TYPES)
-def test_pickle_roundtrip(fixture_name, pickle_module, request):
+@pytest.mark.parametrize("sample_object", PICKLE_TYPES, indirect=True)
+def test_pickle_roundtrip(sample_object, pickle_module):
     """Object survives a pickle/cloudpickle roundtrip."""
     mod = pytest.importorskip(pickle_module)
-    obj = request.getfixturevalue(fixture_name)
-    result = mod.loads(mod.dumps(obj))
-    assert type(result) is type(obj)
+    result = mod.loads(mod.dumps(sample_object))
+    assert type(result) is type(sample_object)

From 831eefb48165de610f9ca14ff9e0bddcb3467b3f Mon Sep 17 00:00:00 2001
From: Sebastian Berg <sebastianb@nvidia.com>
Date: Thu, 11 Jun 2026 14:46:39 +0200
Subject: [PATCH 13/17] Make latch-kernel helper compile only once

For some reason the latch kernel helper test started failing now
(it did not fail before my update from CUDA 13.2 to 13.3?).

The reason isn't that the helper is not thread-safe, but that something
(presumably module loading/unloading) causes synchronizations, which
in turn force threads to wait for their LatchKernel to finish.

And of course the test itself really needs that not to happen.
Making sure the latch kernel is compiled and loaded exactly once per
device (cached by device UUID) seems to avoid this problem.

Signed-off-by: Sebastian Berg <sebastianb@nvidia.com>
---
 cuda_core/tests/helpers/latch.py | 31 +++++++++++++++++++++++++------
 1 file changed, 25 insertions(+), 6 deletions(-)

diff --git a/cuda_core/tests/helpers/latch.py b/cuda_core/tests/helpers/latch.py
index c28fb222641..978e2dbdf18 100644
--- a/cuda_core/tests/helpers/latch.py
+++ b/cuda_core/tests/helpers/latch.py
@@ -1,7 +1,8 @@
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
 import ctypes
+import threading
 
 import pytest
 
@@ -20,9 +21,15 @@ class LatchKernel:
     Manages a kernel that blocks stream progress until released.
     """
 
-    def __init__(self, device, timeout_sec=60):
-        if helpers.CUDA_INCLUDE_PATH is None:
-            pytest.skip("need CUDA header")
+    _latch_kernel_lock = threading.Lock()
+    _latch_kernels = {}
+
+    @classmethod
+    def _get_kernel(cls, device):
+        kernel = cls._latch_kernels.get(device.uuid)
+        if kernel is not None:
+            return kernel
+
         code = """
                #include <cuda/atomic>
 
@@ -41,6 +48,7 @@ def __init__(self, device, timeout_sec=60):
 
                        // Check for timeout
                        if (clock64() - start >= timeout_cycles) {
+                           signal.store(-1, cuda::memory_order_relaxed);
                            break;  // Timeout reached
                        }
 
@@ -56,14 +64,25 @@ def __init__(self, device, timeout_sec=60):
         )
         prog = Program(code, code_type="c++", options=program_options)
         mod = prog.compile(target_type="cubin")
-        self.kernel = mod.get_kernel("latch")
+        kernel = mod.get_kernel("latch")
+
+        return cls._latch_kernels.setdefault(device.uuid, kernel)
+
+    def __init__(self, device, timeout_sec=60):
+        if helpers.CUDA_INCLUDE_PATH is None:
+            pytest.skip("need CUDA header")
+
+        with self._latch_kernel_lock:
+            self.kernel = self._get_kernel(device)
 
         mr = LegacyPinnedMemoryResource()
         self.buffer = mr.allocate(4)
-        self.busy_wait_flag[0] = 0
+        self.busy_wait_flag[0] = 1
         clock_rate_hz = device.properties.clock_rate * 1000
         self.timeout_cycles = int(timeout_sec * clock_rate_hz)
 
+        self.busy_wait_flag[0] = 0
+
     def launch(self, stream):
         """Launch the latch kernel, blocking stream progress via busy waiting."""
         config = LaunchConfig(grid=1, block=1)

From f6d90736aedc6d5eb49804b157dee0c1c5257cd2 Mon Sep 17 00:00:00 2001
From: Sebastian Berg <sebastianb@nvidia.com>
Date: Thu, 11 Jun 2026 15:04:37 +0200
Subject: [PATCH 14/17] Update pyi files (although I find it strange to include
 @cython.critical_section)

Signed-off-by: Sebastian Berg <sebastianb@nvidia.com>
---
 cuda_core/cuda/core/_memory/_buffer.pyi                | 2 ++
 cuda_core/cuda/core/_memory/_graph_memory_resource.pyi | 3 ---
 cuda_core/cuda/core/_memory/_memory_pool.pyi           | 2 ++
 cuda_core/cuda/core/_module.pyi                        | 3 +++
 4 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/cuda_core/cuda/core/_memory/_buffer.pyi b/cuda_core/cuda/core/_memory/_buffer.pyi
index 728853c4bc7..7118a3a1e07 100644
--- a/cuda_core/cuda/core/_memory/_buffer.pyi
+++ b/cuda_core/cuda/core/_memory/_buffer.pyi
@@ -2,6 +2,7 @@
 
 from __future__ import annotations
 
+import cython
 from cuda.core._memory._device_memory_resource import DeviceMemoryResource
 from cuda.core._memory._ipc import IPCBufferDescriptor
 from cuda.core._memory._pinned_memory_resource import PinnedMemoryResource
@@ -88,6 +89,7 @@ class Buffer:
         """
 
     @property
+    @cython.critical_section
     def ipc_descriptor(self) -> IPCBufferDescriptor:
         """Descriptor for sharing this buffer with other processes."""
 
diff --git a/cuda_core/cuda/core/_memory/_graph_memory_resource.pyi b/cuda_core/cuda/core/_memory/_graph_memory_resource.pyi
index 4ff85eb5972..b34f968fdc9 100644
--- a/cuda_core/cuda/core/_memory/_graph_memory_resource.pyi
+++ b/cuda_core/cuda/core/_memory/_graph_memory_resource.pyi
@@ -2,8 +2,6 @@
 
 from __future__ import annotations
 
-from functools import cache
-
 from cuda.core._device import Device
 from cuda.core._memory._buffer import Buffer, MemoryResource
 from cuda.core._stream import Stream
@@ -113,7 +111,6 @@ class GraphMemoryResource(cyGraphMemoryResource):
         ...
 
     @classmethod
-    @cache
     def _create(cls, device_id: int) -> GraphMemoryResource:
         ...
 __all__ = ['GraphMemoryResource']
\ No newline at end of file
diff --git a/cuda_core/cuda/core/_memory/_memory_pool.pyi b/cuda_core/cuda/core/_memory/_memory_pool.pyi
index 3d15d9f679f..7f8c64aedda 100644
--- a/cuda_core/cuda/core/_memory/_memory_pool.pyi
+++ b/cuda_core/cuda/core/_memory/_memory_pool.pyi
@@ -4,6 +4,7 @@ from __future__ import annotations
 
 import uuid
 
+import cython
 from cuda.core._memory._buffer import Buffer, MemoryResource
 from cuda.core._stream import Stream
 from cuda.core.graph import GraphBuilder
@@ -97,6 +98,7 @@ class _MemPool(MemoryResource):
         """
 
     @property
+    @cython.critical_section
     def attributes(self) -> _MemPoolAttributes:
         """Memory pool attributes."""
 
diff --git a/cuda_core/cuda/core/_module.pyi b/cuda_core/cuda/core/_module.pyi
index caf6b09b717..5125b99131a 100644
--- a/cuda_core/cuda/core/_module.pyi
+++ b/cuda_core/cuda/core/_module.pyi
@@ -5,6 +5,7 @@ from __future__ import annotations
 from collections import namedtuple
 from os import PathLike
 
+import cython
 from cuda.core._device import Device
 from cuda.core._launch_config import LaunchConfig
 from cuda.core._stream import Stream
@@ -253,6 +254,7 @@ class Kernel:
         ...
 
     @property
+    @cython.critical_section
     def attributes(self) -> KernelAttributes:
         """Get the read-only attributes of this kernel."""
 
@@ -265,6 +267,7 @@ class Kernel:
         """list[ParamInfo]: (offset, size) for each argument of this function"""
 
     @property
+    @cython.critical_section
     def occupancy(self) -> KernelOccupancy:
         """Get the occupancy information for launching this kernel."""
 

From 4e636a05795b150ec8aad39f27578cc14c47220b Mon Sep 17 00:00:00 2001
From: Sebastian Berg <sebastianb@nvidia.com>
Date: Thu, 11 Jun 2026 16:00:52 +0200
Subject: [PATCH 15/17] TST: Remove XFAIL(strict) in cufile on CI (it seems to
 pass now...)

Signed-off-by: Sebastian Berg <sebastianb@nvidia.com>
---
 cuda_bindings/tests/test_cufile.py | 19 -------------------
 1 file changed, 19 deletions(-)

diff --git a/cuda_bindings/tests/test_cufile.py b/cuda_bindings/tests/test_cufile.py
index ee0995234b3..1c92acd3c32 100644
--- a/cuda_bindings/tests/test_cufile.py
+++ b/cuda_bindings/tests/test_cufile.py
@@ -117,12 +117,6 @@ def get_tegra_kind():
     ),
 ]
 
-xfail_handle_register = pytest.mark.xfail(
-    condition=isSupportedFilesystem() and os.environ.get("CI") is not None,
-    raises=cufile.cuFileError,
-    reason="handle_register call fails in CI for unknown reasons",
-)
-
 
 def test_cufile_success_defined():
     """Check if CUFILE_SUCCESS is defined in OpError enum."""
@@ -203,7 +197,6 @@ def driver(ctx):
 
 @pytest.mark.skipif(not isSupportedFilesystem(), reason="cuFile handle_register requires ext4 or xfs filesystem")
 @pytest.mark.usefixtures("driver")
-@xfail_handle_register
 def test_handle_register(tmpdir):
     """Test file handle registration with cuFile."""
     # Create test file
@@ -394,7 +387,6 @@ def test_buf_register_already_registered():
 
 @pytest.mark.skipif(not isSupportedFilesystem(), reason="cuFile handle_register requires ext4 or xfs filesystem")
 @pytest.mark.usefixtures("driver")
-@xfail_handle_register
 def test_cufile_read_write(tmpdir):
     """Test cuFile read and write operations."""
     # Create test file
@@ -479,7 +471,6 @@ def test_cufile_read_write(tmpdir):
 
 @pytest.mark.skipif(not isSupportedFilesystem(), reason="cuFile handle_register requires ext4 or xfs filesystem")
 @pytest.mark.usefixtures("driver")
-@xfail_handle_register
 def test_cufile_read_write_host_memory(tmpdir):
     """Test cuFile read and write operations using host memory."""
     # Create test file
@@ -560,7 +551,6 @@ def test_cufile_read_write_host_memory(tmpdir):
 
 @pytest.mark.skipif(not isSupportedFilesystem(), reason="cuFile handle_register requires ext4 or xfs filesystem")
 @pytest.mark.usefixtures("driver")
-@xfail_handle_register
 def test_cufile_read_write_large(tmpdir):
     """Test cuFile read and write operations with large data."""
     # Create test file
@@ -648,7 +638,6 @@ def test_cufile_read_write_large(tmpdir):
 
 @pytest.mark.skipif(not isSupportedFilesystem(), reason="cuFile handle_register requires ext4 or xfs filesystem")
 @pytest.mark.usefixtures("ctx", "cufile_env_json", "driver")
-@xfail_handle_register
 def test_cufile_write_async(tmpdir):
     """Test cuFile asynchronous write operations."""
     # Create test file
@@ -724,7 +713,6 @@ def test_cufile_write_async(tmpdir):
 
 @pytest.mark.skipif(not isSupportedFilesystem(), reason="cuFile handle_register requires ext4 or xfs filesystem")
 @pytest.mark.usefixtures("ctx", "cufile_env_json", "driver")
-@xfail_handle_register
 def test_cufile_read_async(tmpdir):
     """Test cuFile asynchronous read operations."""
     # Create test file
@@ -812,7 +800,6 @@ def test_cufile_read_async(tmpdir):
 
 
 @pytest.mark.skipif(not isSupportedFilesystem(), reason="cuFile handle_register requires ext4 or xfs filesystem")
-@xfail_handle_register
 @pytest.mark.usefixtures("ctx", "cufile_env_json", "driver")
 def test_cufile_async_read_write(tmpdir):
     """Test cuFile asynchronous read and write operations in sequence."""
@@ -925,7 +912,6 @@ def test_cufile_async_read_write(tmpdir):
 
 @pytest.mark.skipif(not isSupportedFilesystem(), reason="cuFile handle_register requires ext4 or xfs filesystem")
 @pytest.mark.usefixtures("driver")
-@xfail_handle_register
 def test_batch_io_basic(tmpdir):
     """Test basic batch IO operations with multiple read/write operations."""
     # Create test file
@@ -1122,7 +1108,6 @@ def test_batch_io_basic(tmpdir):
 
 @pytest.mark.skipif(not isSupportedFilesystem(), reason="cuFile handle_register requires ext4 or xfs filesystem")
 @pytest.mark.usefixtures("driver")
-@xfail_handle_register
 def test_batch_io_cancel(tmpdir):
     """Test batch IO cancellation."""
     # Create test file
@@ -1200,7 +1185,6 @@ def test_batch_io_cancel(tmpdir):
 
 @pytest.mark.skipif(not isSupportedFilesystem(), reason="cuFile handle_register requires ext4 or xfs filesystem")
 @pytest.mark.usefixtures("driver")
-@xfail_handle_register
 def test_batch_io_large_operations(tmpdir):
     """Test batch IO with large buffer operations."""
     # Create test file
@@ -1602,7 +1586,6 @@ def test_stats_start_stop():
 )
 @pytest.mark.skipif(not isSupportedFilesystem(), reason="cuFile handle_register requires ext4 or xfs filesystem")
 @pytest.mark.usefixtures("stats")
-@xfail_handle_register
 @pytest.mark.thread_unsafe(reason="cuFile stats counters and collection state are process-global")
 def test_get_stats_l1(tmpdir):
     """Test cuFile L1 statistics retrieval with file operations."""
@@ -1681,7 +1664,6 @@ def test_get_stats_l1(tmpdir):
 )
 @pytest.mark.skipif(not isSupportedFilesystem(), reason="cuFile handle_register requires ext4 or xfs filesystem")
 @pytest.mark.usefixtures("stats")
-@xfail_handle_register
 @pytest.mark.thread_unsafe(reason="cuFile stats counters and collection state are process-global")
 def test_get_stats_l2(tmpdir):
     """Test cuFile L2 statistics retrieval with file operations."""
@@ -1764,7 +1746,6 @@ def test_get_stats_l2(tmpdir):
 )
 @pytest.mark.skipif(not isSupportedFilesystem(), reason="cuFile handle_register requires ext4 or xfs filesystem")
 @pytest.mark.usefixtures("stats")
-@xfail_handle_register
 @pytest.mark.thread_unsafe(reason="cuFile stats counters and collection state are process-global")
 def test_get_stats_l3(tmpdir):
     """Test cuFile L3 statistics retrieval with file operations."""

From eb6a2ff4313b3483f16c40366098708ebb1ac125 Mon Sep 17 00:00:00 2001
From: Sebastian Berg <sebastianb@nvidia.com>
Date: Thu, 11 Jun 2026 16:01:33 +0200
Subject: [PATCH 16/17] Install pytest-run-parallel explicitly in CI

Signed-off-by: Sebastian Berg <sebastianb@nvidia.com>
---
 .github/workflows/test-wheel-linux.yml   | 4 ++++
 .github/workflows/test-wheel-windows.yml | 4 ++++
 cuda_bindings/pyproject.toml             | 1 -
 cuda_core/pyproject.toml                 | 1 -
 cuda_pathfinder/pyproject.toml           | 1 -
 5 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/test-wheel-linux.yml b/.github/workflows/test-wheel-linux.yml
index fc22514fe47..0f0c265a779 100644
--- a/.github/workflows/test-wheel-linux.yml
+++ b/.github/workflows/test-wheel-linux.yml
@@ -283,6 +283,10 @@ jobs:
           # we use self-hosted runners on which setup-python behaves weirdly (Python include can't be found)...
           AGENT_TOOLSDIRECTORY: "/opt/hostedtoolcache"
 
+      - name: Install pytest-run-parallel for free-threaded Python
+        if: endsWith(matrix.PY_VER, 't')
+        run: pip install pytest-run-parallel
+
       - name: Set up mini CTK
         if: ${{ matrix.LOCAL_CTK == '1' }}
         uses: ./.github/actions/fetch_ctk
diff --git a/.github/workflows/test-wheel-windows.yml b/.github/workflows/test-wheel-windows.yml
index 356fb9dd8a8..a67f5a3a729 100644
--- a/.github/workflows/test-wheel-windows.yml
+++ b/.github/workflows/test-wheel-windows.yml
@@ -257,6 +257,10 @@ jobs:
         with:
           python-version: ${{ matrix.PY_VER }}
 
+      - name: Install pytest-run-parallel for free-threaded Python
+        if: endsWith(matrix.PY_VER, 't')
+        run: pip install pytest-run-parallel
+
       - name: Verify LongPathsEnabled
         run: |
           $val = (Get-ItemProperty -Path 'HKLM:\SYSTEM\CurrentControlSet\Control\FileSystem' -Name 'LongPathsEnabled').LongPathsEnabled
diff --git a/cuda_bindings/pyproject.toml b/cuda_bindings/pyproject.toml
index 8e4d481fa9a..d40984d6be4 100644
--- a/cuda_bindings/pyproject.toml
+++ b/cuda_bindings/pyproject.toml
@@ -52,7 +52,6 @@ test = [
     "pytest>=6.2.4",
     "pytest-benchmark>=3.4.1",
     "pytest-repeat",
-    "pytest-run-parallel; python_version == '3.14t' or python_version == '3.15t'",
     "pytest-randomly",
     "pyglet>=2.1.9",
 ]
diff --git a/cuda_core/pyproject.toml b/cuda_core/pyproject.toml
index cedb725ffb1..94424d8488e 100644
--- a/cuda_core/pyproject.toml
+++ b/cuda_core/pyproject.toml
@@ -61,7 +61,6 @@ cu13 = ["cuda-bindings[all]==13.*", "cuda-toolkit==13.*"]
 test = [
     "cython>=3.2,<3.3", "setuptools", "pytest>=6.2.4", "pytest-benchmark",
     "pytest-randomly", "pytest-repeat", "pytest-rerunfailures", "pytest-timeout",
-    "pytest-run-parallel; python_version == '3.14t' or python_version == '3.15t'",
     "cloudpickle", "psutil",
     # TODO: remove the Python 3.15 guard once 3.15 is officially supported
     "cffi; python_version < '3.15'",
diff --git a/cuda_pathfinder/pyproject.toml b/cuda_pathfinder/pyproject.toml
index c6bb9d98320..32fec0f13d1 100644
--- a/cuda_pathfinder/pyproject.toml
+++ b/cuda_pathfinder/pyproject.toml
@@ -15,7 +15,6 @@ test = [
     "pytest>=6.2.4",
     "pytest-mock",
     "pytest-repeat",
-    "pytest-run-parallel; python_version == '3.14t' or python_version == '3.15t'",
     "pytest-randomly",
 ]
 # Internal organization of test dependencies.

From 7b59bffb66889a9d0aae7806a8eb2ae174d3a644 Mon Sep 17 00:00:00 2001
From: Sebastian Berg <sebastianb@nvidia.com>
Date: Thu, 11 Jun 2026 16:56:23 +0200
Subject: [PATCH 17/17] Move pytest-run-parallel setup (and hopefully actually
 make it work)

Signed-off-by: Sebastian Berg <sebastianb@nvidia.com>
---
 .github/workflows/test-wheel-linux.yml   |  4 ----
 .github/workflows/test-wheel-windows.yml |  4 ----
 ci/tools/run-tests                       | 25 +++++++++++++-----------
 scripts/run_tests.sh                     |  6 +-----
 4 files changed, 15 insertions(+), 24 deletions(-)

diff --git a/.github/workflows/test-wheel-linux.yml b/.github/workflows/test-wheel-linux.yml
index 0f0c265a779..fc22514fe47 100644
--- a/.github/workflows/test-wheel-linux.yml
+++ b/.github/workflows/test-wheel-linux.yml
@@ -283,10 +283,6 @@ jobs:
           # we use self-hosted runners on which setup-python behaves weirdly (Python include can't be found)...
           AGENT_TOOLSDIRECTORY: "/opt/hostedtoolcache"
 
-      - name: Install pytest-run-parallel for free-threaded Python
-        if: endsWith(matrix.PY_VER, 't')
-        run: pip install pytest-run-parallel
-
       - name: Set up mini CTK
         if: ${{ matrix.LOCAL_CTK == '1' }}
         uses: ./.github/actions/fetch_ctk
diff --git a/.github/workflows/test-wheel-windows.yml b/.github/workflows/test-wheel-windows.yml
index a67f5a3a729..356fb9dd8a8 100644
--- a/.github/workflows/test-wheel-windows.yml
+++ b/.github/workflows/test-wheel-windows.yml
@@ -257,10 +257,6 @@ jobs:
         with:
           python-version: ${{ matrix.PY_VER }}
 
-      - name: Install pytest-run-parallel for free-threaded Python
-        if: endsWith(matrix.PY_VER, 't')
-        run: pip install pytest-run-parallel
-
       - name: Verify LongPathsEnabled
         run: |
           $val = (Get-ItemProperty -Path 'HKLM:\SYSTEM\CurrentControlSet\Control\FileSystem' -Name 'LongPathsEnabled').LongPathsEnabled
diff --git a/ci/tools/run-tests b/ci/tools/run-tests
index 1ca54ba8207..9e00ace8647 100755
--- a/ci/tools/run-tests
+++ b/ci/tools/run-tests
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # SPDX-License-Identifier: Apache-2.0
 
@@ -20,6 +20,14 @@ fi
 
 test_module=${1}
 
+FREE_THREADING=""
+PYTEST_PARALLEL_ARGS=()
+if python -c 'import sys; assert not sys._is_gil_enabled()' 2> /dev/null; then
+  FREE_THREADING="-ft"
+  PYTEST_PARALLEL_ARGS=(--parallel-threads=4)
+  pip install pytest-run-parallel
+fi
+
 # For standard modes, install pathfinder up front (it is a direct dependency
 # of bindings, and a transitive dependency of core).  Nightly modes install
 # all wheels together in a single pip call further below.
@@ -36,7 +44,7 @@ if [[ "${test_module}" == "pathfinder" ]]; then
     "LD:${CUDA_PATHFINDER_TEST_LOAD_NVIDIA_DYNAMIC_LIB_STRICTNESS} " \
     "FH:${CUDA_PATHFINDER_TEST_FIND_NVIDIA_HEADERS_STRICTNESS} " \
     "BC:${CUDA_PATHFINDER_TEST_FIND_NVIDIA_BITCODE_LIB_STRICTNESS}"
-  pytest -ra -s -v --durations=0 tests/ |& tee /tmp/pathfinder_test_log.txt
+  pytest -ra -s -v --durations=0 "${PYTEST_PARALLEL_ARGS[@]}" tests/ |& tee /tmp/pathfinder_test_log.txt
   # Report the number of "INFO test_" lines (including zero)
   # to support quick validations based on GHA log archives.
   line_count=$(awk '/^INFO test_/ {count++} END {print count+0}' /tmp/pathfinder_test_log.txt)
@@ -51,9 +59,9 @@ elif [[ "${test_module}" == "bindings" ]]; then
     pip install $(ls "${CUDA_BINDINGS_ARTIFACTS_DIR}"/*.whl)[all] --group test
   fi
   echo "Running bindings tests"
-  ${SANITIZER_CMD} pytest -rxXs -v --durations=0 --randomly-dont-reorganize tests/
+  ${SANITIZER_CMD} pytest -rxXs -v --durations=0 --randomly-dont-reorganize "${PYTEST_PARALLEL_ARGS[@]}" tests/
   if [[ "${SKIP_CYTHON_TEST}" == 0 ]]; then
-    ${SANITIZER_CMD} pytest -rxXs -v --durations=0 --randomly-dont-reorganize tests/cython
+    ${SANITIZER_CMD} pytest -rxXs -v --durations=0 --randomly-dont-reorganize "${PYTEST_PARALLEL_ARGS[@]}" tests/cython
   fi
   popd
 elif [[ "${test_module}" == "core" || "${test_module}" == nightly-* ]]; then
@@ -61,11 +69,6 @@ elif [[ "${test_module}" == "core" || "${test_module}" == nightly-* ]]; then
   TEST_CUDA_MAJOR="$(cut -d '.' -f 1 <<< ${CUDA_VER})"
   CUDA_VER_MINOR="$(cut -d '.' -f 1-2 <<< "${CUDA_VER}")"
 
-  FREE_THREADING=""
-  if python -c 'import sys; assert not sys._is_gil_enabled()' 2> /dev/null; then
-    FREE_THREADING+="-ft"
-  fi
-
   # Resolve bindings based on BINDINGS_SOURCE (set by env-vars):
   #   main/backport → local wheel from artifacts dir
   #   published     → install from PyPI by version
@@ -106,11 +109,11 @@ elif [[ "${test_module}" == "core" || "${test_module}" == nightly-* ]]; then
     echo "Installed packages before core tests:"
     pip list
     echo "Running core tests"
-    ${SANITIZER_CMD} pytest -rxXs -v --durations=0 --randomly-dont-reorganize tests/
+    ${SANITIZER_CMD} pytest -rxXs -v --durations=0 --randomly-dont-reorganize "${PYTEST_PARALLEL_ARGS[@]}" tests/
     # Currently our CI always installs the latest bindings (from either major version).
     # This is not compatible with the test requirements.
     if [[ "${SKIP_CYTHON_TEST}" == 0 ]]; then
-      ${SANITIZER_CMD} pytest -rxXs -v --durations=0 --randomly-dont-reorganize tests/cython
+      ${SANITIZER_CMD} pytest -rxXs -v --durations=0 --randomly-dont-reorganize "${PYTEST_PARALLEL_ARGS[@]}" tests/cython
     fi
   else
     # Nightly optional-dependency testing.
diff --git a/scripts/run_tests.sh b/scripts/run_tests.sh
index 44bdf08d810..99d19753693 100755
--- a/scripts/run_tests.sh
+++ b/scripts/run_tests.sh
@@ -57,11 +57,7 @@ RUN_CYTHON=0
 RUN_EXAMPLES=1
 RUN_PTDS=1
 INSTALL_MODE=auto  # auto|force|skip
-DEFAULT_PARALLEL_THREADS=0
-if python -mpip show pytest-run-parallel >/dev/null 2>&1; then
-  DEFAULT_PARALLEL_THREADS=4
-fi
-PARALLEL_THREADS=${DEFAULT_PARALLEL_THREADS}
+PARALLEL_THREADS=0
 while [[ $# -gt 0 ]]; do
   case "$1" in
     -h|--help)