From ffd1f915efe5f90323daa7bfb2a6073a3f702110 Mon Sep 17 00:00:00 2001
From: Tanmay Rauth <t_rauth@apple.com>
Date: Sun, 14 Jun 2026 15:51:15 +0530
Subject: [PATCH] Fix strict metrics evaluator over-pruning files for
 NotEqualTo/NotIn with partial nulls

_StrictMetricsEvaluator.visit_not_equal and visit_not_in short-circuited on
_can_contain_nulls / _can_contain_nans (null/NaN count > 0) and returned
ROWS_MUST_MATCH without checking the value bounds. A file holding any null or
NaN was therefore reported as fully matching the predicate, even when a
non-null value inside the bounds did not match.

This drives _DeleteFiles (table/update/snapshot.py): ROWS_MUST_MATCH drops the
whole data file without rewriting it. So delete(NotEqualTo("x", 5)) against a
file with stats [null, 5] and bounds lower=upper=5 would delete the entire
file, silently losing the row with value 5 that should have survived.

Every other strict ROWS_MUST_MATCH path already guards on the "only" variants
(_contains_nulls_only / _contains_nans_only), matching the reference
StrictMetricsEvaluator. Switch both methods to the same guard so that an
all-null/all-NaN column still short-circuits to ROWS_MUST_MATCH (those rows
satisfy not-equal/not-in), while a partially-null column falls through to the
bounds check.

Update the existing NotIn-on-some-nulls test that encoded the buggy result and
add a regression test covering the [null, value] / bounds-include-literal case
for both NotEqualTo and NotIn.

Fixes #3498 (partially)
---
 pyiceberg/expressions/visitors.py   | 10 ++++++--
 tests/expressions/test_evaluator.py | 38 ++++++++++++++++++++++++++++-
 2 files changed, 45 insertions(+), 3 deletions(-)

diff --git a/pyiceberg/expressions/visitors.py b/pyiceberg/expressions/visitors.py
index f3cfabf1a5..11d0942d46 100644
--- a/pyiceberg/expressions/visitors.py
+++ b/pyiceberg/expressions/visitors.py
@@ -1668,7 +1668,10 @@ def visit_not_equal(self, term: BoundTerm, literal: LiteralValue) -> bool:
         # Rows must match when X < Min or Max < X because it is not in the range
         field_id = term.ref().field.field_id
 
-        if self._can_contain_nulls(field_id) or self._can_contain_nans(field_id):
+        # When the column is entirely null or entirely NaN there is no value equal to the
+        # literal, so every row satisfies the predicate. A merely partial presence of nulls
+        # or NaNs does not guarantee this, so we must fall through to the bounds check below.
+        if self._contains_nulls_only(field_id) or self._contains_nans_only(field_id):
             return ROWS_MUST_MATCH
 
         field = self._get_field(field_id)
@@ -1728,7 +1731,10 @@ def visit_in(self, term: BoundTerm, literals: set[L]) -> bool:
     def visit_not_in(self, term: BoundTerm, literals: set[L]) -> bool:
         field_id = term.ref().field.field_id
 
-        if self._can_contain_nulls(field_id) or self._can_contain_nans(field_id):
+        # When the column is entirely null or entirely NaN none of the values are in the set,
+        # so every row satisfies the predicate. A merely partial presence of nulls or NaNs does
+        # not guarantee this, so we must fall through to the bounds check below.
+        if self._contains_nulls_only(field_id) or self._contains_nans_only(field_id):
             return ROWS_MUST_MATCH
 
         field = self._get_field(field_id)
diff --git a/tests/expressions/test_evaluator.py b/tests/expressions/test_evaluator.py
index ea1bef0a7d..9123cd44b2 100644
--- a/tests/expressions/test_evaluator.py
+++ b/tests/expressions/test_evaluator.py
@@ -1523,12 +1523,48 @@ def test_strict_integer_not_in(strict_data_file_schema: Schema, strict_data_file
     assert should_read, "Should match: notIn on all nulls column"
 
     should_read = _StrictMetricsEvaluator(strict_data_file_schema, NotIn("some_nulls", {"abc", "def"})).eval(strict_data_file_1)
-    assert should_read, "Should match: notIn on some nulls column, 'bbb' > 'abc' and 'bbb' < 'def'"
+    assert not should_read, "Should not match: some non-null value may be == 'def' which is within bounds ['bbb', 'eee']"
 
     should_read = _StrictMetricsEvaluator(strict_data_file_schema, NotIn("no_nulls", {"abc", "def"})).eval(strict_data_file_1)
     assert not should_read, "Should not match: no_nulls field does not have bounds"
 
 
+def test_strict_not_eq_partial_nulls_within_bounds() -> None:
+    # Regression test for https://github.com/apache/iceberg-python/issues/3498
+    # A column that contains *some* nulls (but not only nulls) whose bounds still cover the
+    # literal must not be reported as ROWS_MUST_MATCH: the non-null value equal to the literal
+    # does not satisfy the predicate. Reporting a match here lets _DeleteFiles drop the whole
+    # data file and silently lose the row that should have survived the delete.
+    schema = Schema(NestedField(1, "x", IntegerType(), required=False))
+    data_file = DataFile.from_args(
+        file_path="file.parquet",
+        file_format=FileFormat.PARQUET,
+        partition=Record(),
+        record_count=2,
+        value_counts={1: 2},
+        null_value_counts={1: 1},  # one null, one non-null -> not "nulls only"
+        nan_value_counts={},
+        lower_bounds={1: to_bytes(IntegerType(), 5)},
+        upper_bounds={1: to_bytes(IntegerType(), 5)},  # the only non-null value is 5
+    )
+
+    assert not _StrictMetricsEvaluator(schema, NotEqualTo("x", 5)).eval(data_file), (
+        "Should not match: the non-null value 5 does not satisfy x != 5"
+    )
+    assert not _StrictMetricsEvaluator(schema, NotIn("x", {5})).eval(data_file), (
+        "Should not match: the non-null value 5 is in {5}"
+    )
+
+    # The literal sits outside the bounds, so every non-null value satisfies the predicate and
+    # the remaining nulls/NaNs also satisfy it -> the whole file matches.
+    assert _StrictMetricsEvaluator(schema, NotEqualTo("x", 6)).eval(data_file), (
+        "Should match: no value equals 6 and nulls satisfy x != 6"
+    )
+    assert _StrictMetricsEvaluator(schema, NotIn("x", {6})).eval(data_file), (
+        "Should match: no value is in {6} and nulls satisfy not-in"
+    )
+
+
 @pytest.mark.parametrize(
     "file_type, evolved_type, lower_bound, upper_bound, op, lit, expected",
     [