diff --git a/pyiceberg/expressions/visitors.py b/pyiceberg/expressions/visitors.py index f3cfabf1a5..11d0942d46 100644 --- a/pyiceberg/expressions/visitors.py +++ b/pyiceberg/expressions/visitors.py @@ -1668,7 +1668,10 @@ def visit_not_equal(self, term: BoundTerm, literal: LiteralValue) -> bool: # Rows must match when X < Min or Max < X because it is not in the range field_id = term.ref().field.field_id - if self._can_contain_nulls(field_id) or self._can_contain_nans(field_id): + # When the column is entirely null or entirely NaN there is no value equal to the + # literal, so every row satisfies the predicate. A merely partial presence of nulls + # or NaNs does not guarantee this, so we must fall through to the bounds check below. + if self._contains_nulls_only(field_id) or self._contains_nans_only(field_id): return ROWS_MUST_MATCH field = self._get_field(field_id) @@ -1728,7 +1731,10 @@ def visit_in(self, term: BoundTerm, literals: set[L]) -> bool: def visit_not_in(self, term: BoundTerm, literals: set[L]) -> bool: field_id = term.ref().field.field_id - if self._can_contain_nulls(field_id) or self._can_contain_nans(field_id): + # When the column is entirely null or entirely NaN none of the values are in the set, + # so every row satisfies the predicate. A merely partial presence of nulls or NaNs does + # not guarantee this, so we must fall through to the bounds check below. + if self._contains_nulls_only(field_id) or self._contains_nans_only(field_id): return ROWS_MUST_MATCH field = self._get_field(field_id) diff --git a/tests/expressions/test_evaluator.py b/tests/expressions/test_evaluator.py index ea1bef0a7d..9123cd44b2 100644 --- a/tests/expressions/test_evaluator.py +++ b/tests/expressions/test_evaluator.py @@ -1523,12 +1523,48 @@ def test_strict_integer_not_in(strict_data_file_schema: Schema, strict_data_file assert should_read, "Should match: notIn on all nulls column" should_read = _StrictMetricsEvaluator(strict_data_file_schema, NotIn("some_nulls", {"abc", "def"})).eval(strict_data_file_1) - assert should_read, "Should match: notIn on some nulls column, 'bbb' > 'abc' and 'bbb' < 'def'" + assert not should_read, "Should not match: some non-null value may be == 'def' which is within bounds ['bbb', 'eee']" should_read = _StrictMetricsEvaluator(strict_data_file_schema, NotIn("no_nulls", {"abc", "def"})).eval(strict_data_file_1) assert not should_read, "Should not match: no_nulls field does not have bounds" +def test_strict_not_eq_partial_nulls_within_bounds() -> None: + # Regression test for https://github.com/apache/iceberg-python/issues/3498 + # A column that contains *some* nulls (but not only nulls) whose bounds still cover the + # literal must not be reported as ROWS_MUST_MATCH: the non-null value equal to the literal + # does not satisfy the predicate. Reporting a match here lets _DeleteFiles drop the whole + # data file and silently lose the row that should have survived the delete. + schema = Schema(NestedField(1, "x", IntegerType(), required=False)) + data_file = DataFile.from_args( + file_path="file.parquet", + file_format=FileFormat.PARQUET, + partition=Record(), + record_count=2, + value_counts={1: 2}, + null_value_counts={1: 1}, # one null, one non-null -> not "nulls only" + nan_value_counts={}, + lower_bounds={1: to_bytes(IntegerType(), 5)}, + upper_bounds={1: to_bytes(IntegerType(), 5)}, # the only non-null value is 5 + ) + + assert not _StrictMetricsEvaluator(schema, NotEqualTo("x", 5)).eval(data_file), ( + "Should not match: the non-null value 5 does not satisfy x != 5" + ) + assert not _StrictMetricsEvaluator(schema, NotIn("x", {5})).eval(data_file), ( + "Should not match: the non-null value 5 is in {5}" + ) + + # The literal sits outside the bounds, so every non-null value satisfies the predicate and + # the remaining nulls/NaNs also satisfy it -> the whole file matches. + assert _StrictMetricsEvaluator(schema, NotEqualTo("x", 6)).eval(data_file), ( + "Should match: no value equals 6 and nulls satisfy x != 6" + ) + assert _StrictMetricsEvaluator(schema, NotIn("x", {6})).eval(data_file), ( + "Should match: no value is in {6} and nulls satisfy not-in" + ) + + @pytest.mark.parametrize( "file_type, evolved_type, lower_bound, upper_bound, op, lit, expected", [