From dca032d7d61eb34652647888ab6cc6358df4a529 Mon Sep 17 00:00:00 2001 From: Brendan Collins Date: Tue, 16 Jun 2026 16:45:27 -0700 Subject: [PATCH 1/2] Fix cost_distance heap overflow on non-uniform friction (#3369) The numba Dijkstra kernels sized their binary min-heap at height*width. A lazy-deletion min-heap pushes a pixel every time its tentative cost improves, so the push count exceeds height*width on grids with varying friction, and _heap_push wrote past the end of the heap arrays. That is an out-of-bounds write into numba-managed memory, which corrupts the heap and aborts the process (SIGABRT) on the iterative dask path and is undefined behavior on the numpy path. Size the heap to the real bound: directed edges plus one seed per pixel, height*width*(n_neighbors+1). The tile kernel adds boundary-seed headroom on top. Add regression tests that compare both the numpy and iterative dask paths against a reference heapq Dijkstra over many adversarial grids. --- xrspatial/cost_distance.py | 16 ++-- xrspatial/tests/test_cost_distance.py | 124 ++++++++++++++++++++++++++ 2 files changed, 135 insertions(+), 5 deletions(-) diff --git a/xrspatial/cost_distance.py b/xrspatial/cost_distance.py index d84a242c2..67ab7b6f7 100644 --- a/xrspatial/cost_distance.py +++ b/xrspatial/cost_distance.py @@ -152,10 +152,13 @@ def _cost_distance_kernel( # output: initialise to NaN (unreachable) dist = np.full((height, width), np.inf, dtype=np.float64) - # Heap arrays — worst-case each pixel is pushed once per neighbour - # but practically much less. We allocate height*width which is - # sufficient for an exact Dijkstra (each pixel settled at most once). - max_heap = height * width + # Heap arrays. A lazy-deletion min-heap enqueues a pixel every time its + # tentative distance improves, so a pixel can be pushed more than once + # before it is settled. The number of improving relaxations is bounded + # by the directed edge count (n_neighbors per pixel) plus one initial + # seed per pixel. The old height*width sizing underflows that bound and + # let _heap_push write past the end of the arrays, corrupting memory. + max_heap = height * width * (n_neighbors + 1) h_keys = np.empty(max_heap, dtype=np.float64) h_rows = np.empty(max_heap, dtype=np.int64) h_cols = np.empty(max_heap, dtype=np.int64) @@ -552,7 +555,10 @@ def _cost_distance_tile_kernel( dist = np.full((height, width), np.inf, dtype=np.float64) - max_heap = height * width + # See _cost_distance_kernel for the heap-sizing rationale. This kernel + # additionally seeds boundary pixels in phase 2 (up to 2*width + 2*height + # + 4 extra pushes), so add that headroom on top of the relaxation bound. + max_heap = height * width * (n_neighbors + 1) + 2 * (width + height) + 4 h_keys = np.empty(max_heap, dtype=np.float64) h_rows = np.empty(max_heap, dtype=np.int64) h_cols = np.empty(max_heap, dtype=np.int64) diff --git a/xrspatial/tests/test_cost_distance.py b/xrspatial/tests/test_cost_distance.py index 00dead1ec..3a187da1e 100644 --- a/xrspatial/tests/test_cost_distance.py +++ b/xrspatial/tests/test_cost_distance.py @@ -1120,3 +1120,127 @@ def test_custom_dim_names_preserved(): assert result.dims == ('lat', 'lon') assert 'lat' in result.coords and 'lon' in result.coords + + +# ----------------------------------------------------------------------- +# Heap-overflow regression: highly variable friction re-enqueues pixels +# more than once, so the heap must be sized above height*width. +# ----------------------------------------------------------------------- + +def _reference_dijkstra(source, friction, connectivity, max_cost=np.inf): + """Plain-Python multi-source Dijkstra with an unbounded heap. + + Mirrors _cost_distance_kernel exactly but uses heapq, so its heap can + never overflow. Used as ground truth for the bounded-heap kernels. + """ + import heapq + + h, w = source.shape + if connectivity == 8: + dy = [-1, -1, -1, 0, 0, 1, 1, 1] + dx = [-1, 0, 1, -1, 1, -1, 0, 1] + s2 = np.sqrt(2.0) + dd = [s2, 1.0, s2, 1.0, 1.0, s2, 1.0, s2] + else: + dy = [0, -1, 1, 0] + dx = [-1, 0, 0, 1] + dd = [1.0, 1.0, 1.0, 1.0] + + dist = np.full((h, w), np.inf) + visited = np.zeros((h, w), dtype=bool) + heap = [] + for r in range(h): + for c in range(w): + v = source[r, c] + if v != 0 and np.isfinite(v): + f = friction[r, c] + if np.isfinite(f) and f > 0: + dist[r, c] = 0.0 + heapq.heappush(heap, (0.0, r, c)) + while heap: + cu, ur, uc = heapq.heappop(heap) + if visited[ur, uc]: + continue + visited[ur, uc] = True + if cu > max_cost: + break + fu = friction[ur, uc] + for i in range(len(dy)): + vr, vc = ur + dy[i], uc + dx[i] + if vr < 0 or vr >= h or vc < 0 or vc >= w: + continue + if visited[vr, vc]: + continue + fv = friction[vr, vc] + if not (np.isfinite(fv) and fv > 0): + continue + nc = cu + dd[i] * (fu + fv) * 0.5 + if nc < dist[vr, vc]: + dist[vr, vc] = nc + heapq.heappush(heap, (nc, vr, vc)) + return np.where(np.isinf(dist) | (dist > max_cost), np.nan, dist).astype( + np.float32) + + +@pytest.mark.parametrize("connectivity", [4, 8]) +def test_numpy_heap_no_overflow_variable_friction(connectivity): + """Highly variable friction must not overflow the bounded heap. + + Regression for the height*width heap allocation: a lazy-deletion + Dijkstra re-enqueues pixels whenever their tentative cost improves, so + on a grid with strongly varying friction the number of pushes exceeds + height*width and _heap_push wrote past the array end (memory corruption + or a wrong result). Compare against a reference heapq Dijkstra over + many adversarial grids. + """ + rng = np.random.default_rng(20240616 + connectivity) + for _ in range(50): + h = int(rng.integers(4, 9)) + w = int(rng.integers(4, 9)) + friction_data = rng.uniform(0.05, 12.0, (h, w)) + source = np.zeros((h, w)) + for _ in range(int(rng.integers(1, 4))): + source[rng.integers(0, h), rng.integers(0, w)] = 1.0 + + out = _compute(cost_distance( + _make_raster(source, backend='numpy'), + _make_raster(friction_data, backend='numpy'), + connectivity=connectivity, + )) + expected = _reference_dijkstra(source, friction_data, connectivity) + np.testing.assert_allclose(out, expected, equal_nan=True, atol=1e-4) + + +@pytest.mark.skipif(da is None, reason="dask not installed") +@pytest.mark.parametrize("connectivity", [4, 8]) +def test_iterative_heap_no_overflow_variable_friction(connectivity): + """Iterative tile Dijkstra must not overflow its bounded heap. + + The tile kernel seeds boundary pixels on top of the relaxation pushes, + so its heap needs even more headroom. Run the unbounded (max_cost=inf) + iterative path on variable friction with uneven chunks and compare to a + reference heapq Dijkstra. + """ + import warnings + + rng = np.random.default_rng(31415 + connectivity) + for _ in range(20): + h = int(rng.integers(12, 22)) + w = int(rng.integers(12, 22)) + friction_data = rng.uniform(0.05, 12.0, (h, w)) + source = np.zeros((h, w)) + for _ in range(int(rng.integers(1, 4))): + source[rng.integers(0, h), rng.integers(0, w)] = 1.0 + + cy = int(rng.integers(4, h // 2 + 1)) + cx = int(rng.integers(4, w // 2 + 1)) + raster = _make_raster(source, backend='dask+numpy', chunks=(cy, cx)) + friction = _make_raster( + friction_data, backend='dask+numpy', chunks=(cy, cx)) + + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + out = _compute(cost_distance( + raster, friction, connectivity=connectivity)) + expected = _reference_dijkstra(source, friction_data, connectivity) + np.testing.assert_allclose(out, expected, equal_nan=True, atol=1e-3) From 2e71b8aaebc0d683906e2bd9633643dd54316489 Mon Sep 17 00:00:00 2001 From: Brendan Collins Date: Tue, 16 Jun 2026 16:46:08 -0700 Subject: [PATCH 2/2] Update accuracy sweep state for cost_distance (#3369) --- .claude/sweep-accuracy-state.csv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.claude/sweep-accuracy-state.csv b/.claude/sweep-accuracy-state.csv index f2e9b7f9e..289d6f2c2 100644 --- a/.claude/sweep-accuracy-state.csv +++ b/.claude/sweep-accuracy-state.csv @@ -4,7 +4,7 @@ balanced_allocation,2026-04-14T12:00:00Z,1203,,,float32 allocation array caused bilateral,2026-05-01,,,,"No CRIT/HIGH/MEDIUM. Sigma underflow validated via sqrt(tiny) bound; oversize sigma clamped. float64 throughout numpy/cupy. NaN center returns NaN; NaN neighbors skipped (denom not incremented). w_sum>0 guard avoids div-by-zero. map_overlap depth==kernel radius. CUDA bounds correct. Inf input could yield 0*inf=NaN in v_sum but unvalidated input is general xrspatial pattern, not bilateral-specific." contour,2026-05-01,,,,"Marching squares correct: NaN check uses self-inequality, loop bounds (ny-1,nx-1) cover all quads, dask overlap depth=1 matches 2x2 stencil, float64 cast consistent across backends, saddle disambiguation via center value. No CRIT/HIGH issues; minor LOW (Inf inputs not specifically rejected) not flagged." corridor,2026-05-01,,LOW,1,"LOW: corridor inherits float32 from cost_distance; for very large accumulated costs, normalized = corridor - corridor_min loses precision near min (intrinsic to upstream dtype, not corridor itself). NaN handling correct (skipna min, np.isfinite check before normalize). All 4 backends route through pure xarray arithmetic; threshold uses dask/cupy/numpy where with try/except dispatch. No CRIT/HIGH issues." -cost_distance,2026-04-13T12:00:00Z,1191,,,CuPy Bellman-Ford max_iterations = h+w instead of h*w. Fix in PR #1192. +cost_distance,2026-06-16,3369,CRITICAL,5,"CRITICAL heap overflow (#3369/this PR): numba Dijkstra kernels _cost_distance_kernel + _cost_distance_tile_kernel sized the binary min-heap at height*width, but a lazy-deletion heap pushes a pixel on every improving relaxation, so push count exceeds h*w on non-uniform friction. _heap_push then writes OOB -> heap corruption, SIGABRT (exit 134, 'corrupted size vs prev_size') on iterative dask path; UB on numpy path. Reference heapq Dijkstra hits 44 pushes on a 6x6=36 grid. Fix: max_heap = h*w*(n_neighbors+1), tile kernel adds +2*(w+h)+4 for phase-2 boundary seeds. Verified: cupy relax kernel (parallel Bellman-Ford) does NOT use this heap, GPU path unaffected. CUDA available; numpy/cupy/dask+numpy/dask+cupy all agree post-fix over 30+40 random adversarial grids; 88 module tests pass (4 new regression tests). Cats 1-4 clean: dist float64 / out float32 fine; inf/nan/zero friction all impassable (tested); bounds guards use >=h/>=w; planar algorithm, no curvature expected. Supersedes prior #1191 (cupy max_iterations h+w->h*w, fixed PR #1192)." curvature,2026-03-30T15:00:00Z,,,,Formula matches ArcGIS reference. Backends consistent. No issues found. dasymetric,2026-04-14T12:00:00Z,,,,Mass conservation correct. Weighted/binary/limiting_variable all verified. Pycnophylactic Tobler algorithm correct. diffusion,2026-05-01,,LOW,1;2;5,"LOW: no Kahan summation across long iterations (drift over 100k steps, standard for explicit Euler); lap=n+s+w+e-4*val has catastrophic cancellation for nearly-uniform large values; res=0 in attrs causes div-by-zero (no guard); dask+cupy boundary='nan' relies on dask accepting cp.nan as fill. CPU/GPU NaN handling consistent (np.isnan vs val!=val). depth=1 matches stencil radius. Memory guards, CFL check, step cap all in place. No CRIT/HIGH."