From 5879d69c6f0ab9c234f0b2783a3ff606b9f3533a Mon Sep 17 00:00:00 2001 From: Qiwei Huang Date: Sat, 13 Jun 2026 11:22:37 +0800 Subject: [PATCH 1/2] Prune IN predicates with file stats --- crates/paimon/src/predicate_stats.rs | 8 ++- crates/paimon/src/table/table_scan.rs | 74 +++++++++++++++++++++++++++ 2 files changed, 80 insertions(+), 2 deletions(-) diff --git a/crates/paimon/src/predicate_stats.rs b/crates/paimon/src/predicate_stats.rs index fc1a79a8..548f524d 100644 --- a/crates/paimon/src/predicate_stats.rs +++ b/crates/paimon/src/predicate_stats.rs @@ -59,7 +59,8 @@ pub(crate) fn data_leaf_may_match( PredicateOperator::IsNotNull => { return all_null != Some(true); } - PredicateOperator::In | PredicateOperator::NotIn => { + PredicateOperator::In => {} + PredicateOperator::NotIn => { return true; } PredicateOperator::EndsWith | PredicateOperator::Contains => { @@ -113,6 +114,10 @@ pub(crate) fn data_leaf_may_match( }; match op { + PredicateOperator::In => literals.iter().any(|literal| { + !matches!(literal.partial_cmp(&min_value), Some(Ordering::Less)) + && !matches!(literal.partial_cmp(&max_value), Some(Ordering::Greater)) + }), PredicateOperator::Eq => { !matches!(literal.partial_cmp(&min_value), Some(Ordering::Less)) && !matches!(literal.partial_cmp(&max_value), Some(Ordering::Greater)) @@ -180,7 +185,6 @@ pub(crate) fn data_leaf_may_match( } PredicateOperator::IsNull | PredicateOperator::IsNotNull - | PredicateOperator::In | PredicateOperator::NotIn | PredicateOperator::EndsWith | PredicateOperator::Contains diff --git a/crates/paimon/src/table/table_scan.rs b/crates/paimon/src/table/table_scan.rs index 57f21d8d..f66a6be6 100644 --- a/crates/paimon/src/table/table_scan.rs +++ b/crates/paimon/src/table/table_scan.rs @@ -2005,6 +2005,80 @@ mod tests { ); } + #[test] + fn test_data_file_matches_in_prunes_when_all_literals_out_of_range() { + let fields = int_field(); + let file = test_data_file_meta( + int_stats_row(Some(10)), + int_stats_row(Some(20)), + vec![Some(0)], + 5, + ); + let predicate = PredicateBuilder::new(&fields) + .is_in("id", vec![Datum::Int(1), Datum::Int(30)]) + .unwrap(); + + assert!(!data_file_matches_predicates( + &file, + &[predicate], + TEST_SCHEMA_ID, + &test_schema_fields(), + )); + } + + #[test] + fn test_data_file_matches_in_keeps_when_any_literal_in_range() { + let fields = int_field(); + let file = test_data_file_meta( + int_stats_row(Some(10)), + int_stats_row(Some(20)), + vec![Some(0)], + 5, + ); + let predicate = PredicateBuilder::new(&fields) + .is_in("id", vec![Datum::Int(1), Datum::Int(15), Datum::Int(30)]) + .unwrap(); + + assert!(data_file_matches_predicates( + &file, + &[predicate], + TEST_SCHEMA_ID, + &test_schema_fields(), + )); + } + + #[test] + fn test_data_file_matches_in_prunes_all_null_file() { + let fields = int_field(); + let file = test_data_file_meta(int_stats_row(None), int_stats_row(None), vec![Some(5)], 5); + let predicate = PredicateBuilder::new(&fields) + .is_in("id", vec![Datum::Int(10)]) + .unwrap(); + + assert!(!data_file_matches_predicates( + &file, + &[predicate], + TEST_SCHEMA_ID, + &test_schema_fields(), + )); + } + + #[test] + fn test_data_file_matches_in_with_corrupt_stats_fails_open() { + let fields = int_field(); + let file = test_data_file_meta(Vec::new(), Vec::new(), vec![Some(0)], 5); + let predicate = PredicateBuilder::new(&fields) + .is_in("id", vec![Datum::Int(30)]) + .unwrap(); + + assert!(data_file_matches_predicates( + &file, + &[predicate], + TEST_SCHEMA_ID, + &test_schema_fields(), + )); + } + #[test] fn test_data_file_matches_is_null_prunes_when_null_count_is_zero() { let fields = int_field(); From 573c4f0b31fd56005a4632eec7ae834eae40c277 Mon Sep 17 00:00:00 2001 From: Qiwei Huang Date: Fri, 3 Jul 2026 21:59:16 +0800 Subject: [PATCH 2/2] Address IN stats pruning review feedback --- crates/paimon/src/predicate_stats.rs | 30 +++++++++++++++--- crates/paimon/src/table/stats_filter.rs | 7 +++++ crates/paimon/src/table/table_scan.rs | 42 +++++++++++++++++++++++++ 3 files changed, 74 insertions(+), 5 deletions(-) diff --git a/crates/paimon/src/predicate_stats.rs b/crates/paimon/src/predicate_stats.rs index 548f524d..d293d5c1 100644 --- a/crates/paimon/src/predicate_stats.rs +++ b/crates/paimon/src/predicate_stats.rs @@ -23,6 +23,9 @@ pub(crate) trait StatsAccessor { fn null_count(&self, index: usize) -> Option; fn min_value(&self, index: usize, data_type: &DataType) -> Option; fn max_value(&self, index: usize, data_type: &DataType) -> Option; + fn supports_in_min_max_pruning(&self) -> bool { + false + } } pub(crate) fn predicates_may_match_with_schema( @@ -59,7 +62,10 @@ pub(crate) fn data_leaf_may_match( PredicateOperator::IsNotNull => { return all_null != Some(true); } - PredicateOperator::In => {} + PredicateOperator::In if stats.supports_in_min_max_pruning() => {} + PredicateOperator::In => { + return true; + } PredicateOperator::NotIn => { return true; } @@ -114,10 +120,18 @@ pub(crate) fn data_leaf_may_match( }; match op { - PredicateOperator::In => literals.iter().any(|literal| { - !matches!(literal.partial_cmp(&min_value), Some(Ordering::Less)) - && !matches!(literal.partial_cmp(&max_value), Some(Ordering::Greater)) - }), + PredicateOperator::In => { + if !matches!( + min_value.partial_cmp(&max_value), + Some(Ordering::Less | Ordering::Equal) + ) { + return true; + } + literals.iter().any(|literal| { + !matches!(literal.partial_cmp(&min_value), Some(Ordering::Less)) + && !matches!(literal.partial_cmp(&max_value), Some(Ordering::Greater)) + }) + } PredicateOperator::Eq => { !matches!(literal.partial_cmp(&min_value), Some(Ordering::Less)) && !matches!(literal.partial_cmp(&max_value), Some(Ordering::Greater)) @@ -530,6 +544,12 @@ mod tests { data_leaf_may_match(0, &dt, &dt, op, lits, stats) } + #[test] + fn in_falls_open_when_accessor_does_not_opt_in() { + let stats = int_stats(10, 20); + assert!(run_int(PredicateOperator::In, &[Datum::Int(30)], &stats)); + } + /// Stage 3 invariant: a `Between` leaf and the equivalent `GtEq+LtEq` /// conjunction must produce identical stats-prune verdicts. If they /// diverge, the DataFusion translator switch (And-of-comparisons → diff --git a/crates/paimon/src/table/stats_filter.rs b/crates/paimon/src/table/stats_filter.rs index 999213d5..63cf5209 100644 --- a/crates/paimon/src/table/stats_filter.rs +++ b/crates/paimon/src/table/stats_filter.rs @@ -32,6 +32,7 @@ pub(super) struct FileStatsRows { min_values: Option, max_values: Option, null_counts: Vec>, + supports_in_min_max_pruning: bool, /// Maps schema field index → stats index. `None` means identity mapping /// (stats cover all schema fields in order). `Some` is used when /// `value_stats_cols` or `write_cols` is present (dense mode). @@ -51,6 +52,7 @@ impl FileStatsRows { min_values, max_values, null_counts, + supports_in_min_max_pruning: false, stats_col_mapping: None, } } @@ -92,6 +94,7 @@ impl FileStatsRows { min_values: BinaryRow::from_serialized_bytes(file.value_stats.min_values()).ok(), max_values: BinaryRow::from_serialized_bytes(file.value_stats.max_values()).ok(), null_counts: file.value_stats.null_counts().clone(), + supports_in_min_max_pruning: true, stats_col_mapping, } } @@ -132,6 +135,10 @@ impl StatsAccessor for FileStatsRows { .as_ref() .and_then(|row| extract_stats_datum(row, stats_index, data_type)) } + + fn supports_in_min_max_pruning(&self) -> bool { + self.supports_in_min_max_pruning + } } #[derive(Debug)] diff --git a/crates/paimon/src/table/table_scan.rs b/crates/paimon/src/table/table_scan.rs index f66a6be6..8c5bb790 100644 --- a/crates/paimon/src/table/table_scan.rs +++ b/crates/paimon/src/table/table_scan.rs @@ -2079,6 +2079,48 @@ mod tests { )); } + #[test] + fn test_data_file_matches_in_with_inverted_stats_fails_open() { + let fields = int_field(); + let file = test_data_file_meta( + int_stats_row(Some(20)), + int_stats_row(Some(10)), + vec![Some(0)], + 5, + ); + let predicate = PredicateBuilder::new(&fields) + .is_in("id", vec![Datum::Int(15)]) + .unwrap(); + + assert!(data_file_matches_predicates( + &file, + &[predicate], + TEST_SCHEMA_ID, + &test_schema_fields(), + )); + } + + #[test] + fn test_data_file_matches_not_in_fails_open() { + let fields = int_field(); + let file = test_data_file_meta( + int_stats_row(Some(10)), + int_stats_row(Some(20)), + vec![Some(0)], + 5, + ); + let predicate = PredicateBuilder::new(&fields) + .is_not_in("id", vec![Datum::Int(10), Datum::Int(20)]) + .unwrap(); + + assert!(data_file_matches_predicates( + &file, + &[predicate], + TEST_SCHEMA_ID, + &test_schema_fields(), + )); + } + #[test] fn test_data_file_matches_is_null_prunes_when_null_count_is_zero() { let fields = int_field();