diff --git a/.claude/sweep-accuracy-state.csv b/.claude/sweep-accuracy-state.csv index f2e9b7f9e..289d6f2c2 100644 --- a/.claude/sweep-accuracy-state.csv +++ b/.claude/sweep-accuracy-state.csv @@ -4,7 +4,7 @@ balanced_allocation,2026-04-14T12:00:00Z,1203,,,float32 allocation array caused bilateral,2026-05-01,,,,"No CRIT/HIGH/MEDIUM. Sigma underflow validated via sqrt(tiny) bound; oversize sigma clamped. float64 throughout numpy/cupy. NaN center returns NaN; NaN neighbors skipped (denom not incremented). w_sum>0 guard avoids div-by-zero. map_overlap depth==kernel radius. CUDA bounds correct. Inf input could yield 0*inf=NaN in v_sum but unvalidated input is general xrspatial pattern, not bilateral-specific." contour,2026-05-01,,,,"Marching squares correct: NaN check uses self-inequality, loop bounds (ny-1,nx-1) cover all quads, dask overlap depth=1 matches 2x2 stencil, float64 cast consistent across backends, saddle disambiguation via center value. No CRIT/HIGH issues; minor LOW (Inf inputs not specifically rejected) not flagged." corridor,2026-05-01,,LOW,1,"LOW: corridor inherits float32 from cost_distance; for very large accumulated costs, normalized = corridor - corridor_min loses precision near min (intrinsic to upstream dtype, not corridor itself). NaN handling correct (skipna min, np.isfinite check before normalize). All 4 backends route through pure xarray arithmetic; threshold uses dask/cupy/numpy where with try/except dispatch. No CRIT/HIGH issues." -cost_distance,2026-04-13T12:00:00Z,1191,,,CuPy Bellman-Ford max_iterations = h+w instead of h*w. Fix in PR #1192. +cost_distance,2026-06-16,3369,CRITICAL,5,"CRITICAL heap overflow (#3369/this PR): numba Dijkstra kernels _cost_distance_kernel + _cost_distance_tile_kernel sized the binary min-heap at height*width, but a lazy-deletion heap pushes a pixel on every improving relaxation, so push count exceeds h*w on non-uniform friction. _heap_push then writes OOB -> heap corruption, SIGABRT (exit 134, 'corrupted size vs prev_size') on iterative dask path; UB on numpy path. Reference heapq Dijkstra hits 44 pushes on a 6x6=36 grid. Fix: max_heap = h*w*(n_neighbors+1), tile kernel adds +2*(w+h)+4 for phase-2 boundary seeds. Verified: cupy relax kernel (parallel Bellman-Ford) does NOT use this heap, GPU path unaffected. CUDA available; numpy/cupy/dask+numpy/dask+cupy all agree post-fix over 30+40 random adversarial grids; 88 module tests pass (4 new regression tests). Cats 1-4 clean: dist float64 / out float32 fine; inf/nan/zero friction all impassable (tested); bounds guards use >=h/>=w; planar algorithm, no curvature expected. Supersedes prior #1191 (cupy max_iterations h+w->h*w, fixed PR #1192)." curvature,2026-03-30T15:00:00Z,,,,Formula matches ArcGIS reference. Backends consistent. No issues found. dasymetric,2026-04-14T12:00:00Z,,,,Mass conservation correct. Weighted/binary/limiting_variable all verified. Pycnophylactic Tobler algorithm correct. diffusion,2026-05-01,,LOW,1;2;5,"LOW: no Kahan summation across long iterations (drift over 100k steps, standard for explicit Euler); lap=n+s+w+e-4*val has catastrophic cancellation for nearly-uniform large values; res=0 in attrs causes div-by-zero (no guard); dask+cupy boundary='nan' relies on dask accepting cp.nan as fill. CPU/GPU NaN handling consistent (np.isnan vs val!=val). depth=1 matches stencil radius. Memory guards, CFL check, step cap all in place. No CRIT/HIGH." diff --git a/xrspatial/cost_distance.py b/xrspatial/cost_distance.py index d84a242c2..67ab7b6f7 100644 --- a/xrspatial/cost_distance.py +++ b/xrspatial/cost_distance.py @@ -152,10 +152,13 @@ def _cost_distance_kernel( # output: initialise to NaN (unreachable) dist = np.full((height, width), np.inf, dtype=np.float64) - # Heap arrays — worst-case each pixel is pushed once per neighbour - # but practically much less. We allocate height*width which is - # sufficient for an exact Dijkstra (each pixel settled at most once). - max_heap = height * width + # Heap arrays. A lazy-deletion min-heap enqueues a pixel every time its + # tentative distance improves, so a pixel can be pushed more than once + # before it is settled. The number of improving relaxations is bounded + # by the directed edge count (n_neighbors per pixel) plus one initial + # seed per pixel. The old height*width sizing underflows that bound and + # let _heap_push write past the end of the arrays, corrupting memory. + max_heap = height * width * (n_neighbors + 1) h_keys = np.empty(max_heap, dtype=np.float64) h_rows = np.empty(max_heap, dtype=np.int64) h_cols = np.empty(max_heap, dtype=np.int64) @@ -552,7 +555,10 @@ def _cost_distance_tile_kernel( dist = np.full((height, width), np.inf, dtype=np.float64) - max_heap = height * width + # See _cost_distance_kernel for the heap-sizing rationale. This kernel + # additionally seeds boundary pixels in phase 2 (up to 2*width + 2*height + # + 4 extra pushes), so add that headroom on top of the relaxation bound. + max_heap = height * width * (n_neighbors + 1) + 2 * (width + height) + 4 h_keys = np.empty(max_heap, dtype=np.float64) h_rows = np.empty(max_heap, dtype=np.int64) h_cols = np.empty(max_heap, dtype=np.int64) diff --git a/xrspatial/tests/test_cost_distance.py b/xrspatial/tests/test_cost_distance.py index 00dead1ec..3a187da1e 100644 --- a/xrspatial/tests/test_cost_distance.py +++ b/xrspatial/tests/test_cost_distance.py @@ -1120,3 +1120,127 @@ def test_custom_dim_names_preserved(): assert result.dims == ('lat', 'lon') assert 'lat' in result.coords and 'lon' in result.coords + + +# ----------------------------------------------------------------------- +# Heap-overflow regression: highly variable friction re-enqueues pixels +# more than once, so the heap must be sized above height*width. +# ----------------------------------------------------------------------- + +def _reference_dijkstra(source, friction, connectivity, max_cost=np.inf): + """Plain-Python multi-source Dijkstra with an unbounded heap. + + Mirrors _cost_distance_kernel exactly but uses heapq, so its heap can + never overflow. Used as ground truth for the bounded-heap kernels. + """ + import heapq + + h, w = source.shape + if connectivity == 8: + dy = [-1, -1, -1, 0, 0, 1, 1, 1] + dx = [-1, 0, 1, -1, 1, -1, 0, 1] + s2 = np.sqrt(2.0) + dd = [s2, 1.0, s2, 1.0, 1.0, s2, 1.0, s2] + else: + dy = [0, -1, 1, 0] + dx = [-1, 0, 0, 1] + dd = [1.0, 1.0, 1.0, 1.0] + + dist = np.full((h, w), np.inf) + visited = np.zeros((h, w), dtype=bool) + heap = [] + for r in range(h): + for c in range(w): + v = source[r, c] + if v != 0 and np.isfinite(v): + f = friction[r, c] + if np.isfinite(f) and f > 0: + dist[r, c] = 0.0 + heapq.heappush(heap, (0.0, r, c)) + while heap: + cu, ur, uc = heapq.heappop(heap) + if visited[ur, uc]: + continue + visited[ur, uc] = True + if cu > max_cost: + break + fu = friction[ur, uc] + for i in range(len(dy)): + vr, vc = ur + dy[i], uc + dx[i] + if vr < 0 or vr >= h or vc < 0 or vc >= w: + continue + if visited[vr, vc]: + continue + fv = friction[vr, vc] + if not (np.isfinite(fv) and fv > 0): + continue + nc = cu + dd[i] * (fu + fv) * 0.5 + if nc < dist[vr, vc]: + dist[vr, vc] = nc + heapq.heappush(heap, (nc, vr, vc)) + return np.where(np.isinf(dist) | (dist > max_cost), np.nan, dist).astype( + np.float32) + + +@pytest.mark.parametrize("connectivity", [4, 8]) +def test_numpy_heap_no_overflow_variable_friction(connectivity): + """Highly variable friction must not overflow the bounded heap. + + Regression for the height*width heap allocation: a lazy-deletion + Dijkstra re-enqueues pixels whenever their tentative cost improves, so + on a grid with strongly varying friction the number of pushes exceeds + height*width and _heap_push wrote past the array end (memory corruption + or a wrong result). Compare against a reference heapq Dijkstra over + many adversarial grids. + """ + rng = np.random.default_rng(20240616 + connectivity) + for _ in range(50): + h = int(rng.integers(4, 9)) + w = int(rng.integers(4, 9)) + friction_data = rng.uniform(0.05, 12.0, (h, w)) + source = np.zeros((h, w)) + for _ in range(int(rng.integers(1, 4))): + source[rng.integers(0, h), rng.integers(0, w)] = 1.0 + + out = _compute(cost_distance( + _make_raster(source, backend='numpy'), + _make_raster(friction_data, backend='numpy'), + connectivity=connectivity, + )) + expected = _reference_dijkstra(source, friction_data, connectivity) + np.testing.assert_allclose(out, expected, equal_nan=True, atol=1e-4) + + +@pytest.mark.skipif(da is None, reason="dask not installed") +@pytest.mark.parametrize("connectivity", [4, 8]) +def test_iterative_heap_no_overflow_variable_friction(connectivity): + """Iterative tile Dijkstra must not overflow its bounded heap. + + The tile kernel seeds boundary pixels on top of the relaxation pushes, + so its heap needs even more headroom. Run the unbounded (max_cost=inf) + iterative path on variable friction with uneven chunks and compare to a + reference heapq Dijkstra. + """ + import warnings + + rng = np.random.default_rng(31415 + connectivity) + for _ in range(20): + h = int(rng.integers(12, 22)) + w = int(rng.integers(12, 22)) + friction_data = rng.uniform(0.05, 12.0, (h, w)) + source = np.zeros((h, w)) + for _ in range(int(rng.integers(1, 4))): + source[rng.integers(0, h), rng.integers(0, w)] = 1.0 + + cy = int(rng.integers(4, h // 2 + 1)) + cx = int(rng.integers(4, w // 2 + 1)) + raster = _make_raster(source, backend='dask+numpy', chunks=(cy, cx)) + friction = _make_raster( + friction_data, backend='dask+numpy', chunks=(cy, cx)) + + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + out = _compute(cost_distance( + raster, friction, connectivity=connectivity)) + expected = _reference_dijkstra(source, friction_data, connectivity) + np.testing.assert_allclose(out, expected, equal_nan=True, atol=1e-3)