datafusion_physical_expr/utils/guarantee.rs
1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! [`LiteralGuarantee`] predicate analysis to determine if a column is a
19//! constant.
20
21use crate::utils::split_disjunction;
22use crate::{PhysicalExpr, split_conjunction};
23use datafusion_common::{Column, HashMap, ScalarValue};
24use datafusion_expr::Operator;
25use std::collections::HashSet;
26use std::fmt::{self, Display, Formatter};
27use std::sync::Arc;
28
29/// Represents a guarantee that must be true for a boolean expression to
30/// evaluate to `true`.
31///
32/// The guarantee takes the form of a column and a set of literal (constant)
33/// [`ScalarValue`]s. For the expression to evaluate to `true`, the column *must
34/// satisfy* the guarantee(s).
35///
36/// To satisfy the guarantee, depending on [`Guarantee`], the values in the
37/// column must either:
38///
39/// 1. be ONLY one of that set
40/// 2. NOT be ANY of that set
41///
42/// # Uses `LiteralGuarantee`s
43///
44/// `LiteralGuarantee`s can be used to simplify filter expressions and skip data
45/// files (e.g. row groups in parquet files) by proving expressions can not
46/// possibly evaluate to `true`. For example, if we have a guarantee that `a`
47/// must be in (`1`) for a filter to evaluate to `true`, then we can skip any
48/// partition where we know that `a` never has the value of `1`.
49///
50/// **Important**: If a `LiteralGuarantee` is not satisfied, the relevant
51/// expression is *guaranteed* to evaluate to `false` or `null`. **However**,
52/// the opposite does not hold. Even if all `LiteralGuarantee`s are satisfied,
53/// that does **not** guarantee that the predicate will actually evaluate to
54/// `true`: it may still evaluate to `true`, `false` or `null`.
55///
56/// # Creating `LiteralGuarantee`s
57///
58/// Use [`LiteralGuarantee::analyze`] to extract literal guarantees from a
59/// filter predicate.
60///
61/// # Details
62/// A guarantee can be one of two forms:
63///
64/// 1. The column must be one the values for the predicate to be `true`. If the
65/// column takes on any other value, the predicate can not evaluate to `true`.
66/// For example,
67/// `(a = 1)`, `(a = 1 OR a = 2)` or `a IN (1, 2, 3)`
68///
69/// 2. The column must NOT be one of the values for the predicate to be `true`.
70/// If the column can ONLY take one of these values, the predicate can not
71/// evaluate to `true`. For example,
72/// `(a != 1)`, `(a != 1 AND a != 2)` or `a NOT IN (1, 2, 3)`
73#[derive(Debug, Clone, PartialEq)]
74pub struct LiteralGuarantee {
75 pub column: Column,
76 pub guarantee: Guarantee,
77 pub literals: HashSet<ScalarValue>,
78}
79
80/// What is guaranteed about the values for a [`LiteralGuarantee`]?
81#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
82pub enum Guarantee {
83 /// Guarantee that the expression is `true` if `column` is one of the values. If
84 /// `column` is not one of the values, the expression can not be `true`.
85 In,
86 /// Guarantee that the expression is `true` if `column` is not ANY of the
87 /// values. If `column` only takes one of these values, the expression can
88 /// not be `true`.
89 NotIn,
90}
91
92impl LiteralGuarantee {
93 /// Create a new instance of the guarantee if the provided operator is
94 /// supported. Returns None otherwise. See [`LiteralGuarantee::analyze`] to
95 /// create these structures from an predicate (boolean expression).
96 #[allow(clippy::allow_attributes, clippy::mutable_key_type)] // ScalarValue has interior mutability but is intentionally used as hash key
97 fn new<'a>(
98 column_name: impl Into<String>,
99 guarantee: Guarantee,
100 literals: impl IntoIterator<Item = &'a ScalarValue>,
101 ) -> Self {
102 let literals: HashSet<_> = literals.into_iter().cloned().collect();
103
104 Self {
105 column: Column::from_name(column_name),
106 guarantee,
107 literals,
108 }
109 }
110
111 /// Return a list of [`LiteralGuarantee`]s that must be satisfied for `expr`
112 /// to evaluate to `true`.
113 ///
114 /// If more than one `LiteralGuarantee` is returned, they must **all** hold
115 /// for the expression to possibly be `true`. If any is not satisfied, the
116 /// expression is guaranteed to be `null` or `false`.
117 ///
118 /// # Notes:
119 /// 1. `expr` must be a boolean expression or inlist expression.
120 /// 2. `expr` is not simplified prior to analysis.
121 pub fn analyze(expr: &Arc<dyn PhysicalExpr>) -> Vec<LiteralGuarantee> {
122 // split conjunction: <expr> AND <expr> AND ...
123 split_conjunction(expr)
124 .into_iter()
125 // for an `AND` conjunction to be true, all terms individually must be true
126 .fold(GuaranteeBuilder::new(), |builder, expr| {
127 if let Some(cel) = ColOpLit::try_new(expr) {
128 builder.aggregate_conjunct(&cel)
129 } else if let Some(inlist) =
130 expr.downcast_ref::<crate::expressions::InListExpr>()
131 {
132 if let Some(inlist) = ColInList::try_new(inlist) {
133 builder.aggregate_multi_conjunct(
134 inlist.col,
135 inlist.guarantee,
136 inlist.list.iter().map(|lit| lit.value()),
137 )
138 } else {
139 builder
140 }
141 } else {
142 // split disjunction: <expr> OR <expr> OR ...
143 let disjunctions = split_disjunction(expr);
144
145 // We are trying to add a guarantee that a column must be
146 // in/not in a particular set of values for the expression
147 // to evaluate to true.
148 //
149 // A disjunction is true, if at least one of the terms is be
150 // true.
151 //
152 // Thus, we can infer a guarantee if all terms are of the
153 // form `(col <op> literal) OR (col <op> literal) OR ...`.
154 //
155 // For example, we can infer that `a = 1 OR a = 2 OR a = 3`
156 // is guaranteed to be true ONLY if a is in (`1`, `2` or `3`).
157 //
158 // However, for something like `a = 1 OR a = 2 OR a < 0` we
159 // **can't** guarantee that the predicate is only true if a
160 // is in (`1`, `2`), as it could also be true if `a` were less
161 // than zero.
162 let terms = disjunctions
163 .iter()
164 .filter_map(|expr| ColOpLit::try_new(expr))
165 .collect::<Vec<_>>();
166
167 // if all terms are 'col <op> literal' with the same column
168 // and operation we can infer any guarantees
169 //
170 // For those like (a != foo AND (a != bar OR a != baz)).
171 // We can't combine the (a != bar OR a != baz) part, but
172 // it also doesn't invalidate our knowledge that a !=
173 // foo is required for the expression to be true.
174 // So we can only create a multi value guarantee for `=`
175 // (or a single value). (e.g. ignore `a != foo OR a != bar`)
176 let first_term = terms.first();
177 if !terms.is_empty()
178 && terms.len() == disjunctions.len()
179 && terms.iter().all(|term| {
180 term.col.name() == first_term.unwrap().col.name()
181 && term.guarantee == Guarantee::In
182 })
183 {
184 builder.aggregate_multi_conjunct(
185 first_term.unwrap().col,
186 Guarantee::In,
187 terms.iter().map(|term| term.lit.value()),
188 )
189 } else {
190 // Handle disjunctions with conjunctions like (a = 1 AND b = 2) OR (a = 2 AND b = 3)
191 // Extract termsets from each disjunction
192 // if in each termset, they have same column, and the guarantee is In,
193 // we can infer a guarantee for the column
194 // e.g. (a = 1 AND b = 2) OR (a = 2 AND b = 3) is `a IN (1, 2) AND b IN (2, 3)`
195 // otherwise, we can't infer a guarantee
196 let termsets: Vec<Vec<ColOpLitOrInList>> = disjunctions
197 .iter()
198 .map(|expr| {
199 split_conjunction(expr)
200 .into_iter()
201 .filter_map(ColOpLitOrInList::try_new)
202 .filter(|term| term.guarantee() == Guarantee::In)
203 .collect()
204 })
205 .collect();
206
207 // Early return if any termset is empty (can't infer guarantees)
208 if termsets.iter().any(|terms| terms.is_empty()) {
209 return builder;
210 }
211
212 // Find columns that appear in all termsets
213 let common_cols = find_common_columns(&termsets);
214 if common_cols.is_empty() {
215 return builder;
216 }
217
218 // Build guarantees for common columns
219 let mut builder = builder;
220 for col in common_cols {
221 let literals: Vec<_> = termsets
222 .iter()
223 .filter_map(|terms| {
224 terms.iter().find(|term| term.col() == col).map(
225 |term| {
226 term.lits().into_iter().map(|lit| lit.value())
227 },
228 )
229 })
230 .flatten()
231 .collect();
232
233 builder = builder.aggregate_multi_conjunct(
234 col,
235 Guarantee::In,
236 literals,
237 );
238 }
239
240 builder
241 }
242 }
243 })
244 .build()
245 }
246}
247
248impl Display for LiteralGuarantee {
249 fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
250 let mut sorted_literals: Vec<_> =
251 self.literals.iter().map(|lit| lit.to_string()).collect();
252 sorted_literals.sort();
253 match self.guarantee {
254 Guarantee::In => write!(
255 f,
256 "{} in ({})",
257 self.column.name,
258 sorted_literals.join(", ")
259 ),
260 Guarantee::NotIn => write!(
261 f,
262 "{} not in ({})",
263 self.column.name,
264 sorted_literals.join(", ")
265 ),
266 }
267 }
268}
269
270/// Combines conjuncts (aka terms `AND`ed together) into [`LiteralGuarantee`]s,
271/// preserving insert order
272#[derive(Debug, Default)]
273struct GuaranteeBuilder<'a> {
274 /// List of guarantees that have been created so far
275 /// if we have determined a subsequent conjunct invalidates a guarantee
276 /// e.g. `a = foo AND a = bar` then the relevant guarantee will be None
277 guarantees: Vec<Option<LiteralGuarantee>>,
278
279 /// Key is the (column name, guarantee type)
280 /// Value is the index into `guarantees`
281 map: HashMap<(&'a crate::expressions::Column, Guarantee), usize>,
282}
283
284impl<'a> GuaranteeBuilder<'a> {
285 fn new() -> Self {
286 Default::default()
287 }
288
289 /// Aggregate a new single `AND col <op> literal` term to this builder
290 /// combining with existing guarantees if possible.
291 ///
292 /// # Examples
293 /// * `AND (a = 1)`: `a` is guaranteed to be 1
294 /// * `AND (a != 1)`: a is guaranteed to not be 1
295 fn aggregate_conjunct(self, col_op_lit: &ColOpLit<'a>) -> Self {
296 self.aggregate_multi_conjunct(
297 col_op_lit.col,
298 col_op_lit.guarantee,
299 [col_op_lit.lit.value()],
300 )
301 }
302
303 /// Aggregates a new single column, multi literal term to this builder
304 /// combining with previously known guarantees if possible.
305 ///
306 /// # Examples
307 /// For the following examples, we can guarantee the expression is `true` if:
308 /// * `AND (a = 1 OR a = 2 OR a = 3)`: a is in (1, 2, or 3)
309 /// * `AND (a IN (1,2,3))`: a is in (1, 2, or 3)
310 /// * `AND (a != 1 OR a != 2 OR a != 3)`: a is not in (1, 2, or 3)
311 /// * `AND (a NOT IN (1,2,3))`: a is not in (1, 2, or 3)
312 #[allow(clippy::allow_attributes, clippy::mutable_key_type)] // ScalarValue has interior mutability but is intentionally used as hash key
313 fn aggregate_multi_conjunct(
314 mut self,
315 col: &'a crate::expressions::Column,
316 guarantee: Guarantee,
317 new_values: impl IntoIterator<Item = &'a ScalarValue>,
318 ) -> Self {
319 let key = (col, guarantee);
320 if let Some(index) = self.map.get(&key) {
321 // already have a guarantee for this column
322 let entry = &mut self.guarantees[*index];
323
324 let Some(existing) = entry else {
325 // determined the previous guarantee for this column has been
326 // invalidated, nothing to do
327 return self;
328 };
329
330 // Combine conjuncts if we have `a != foo AND a != bar`. `a = foo
331 // AND a = bar` doesn't make logical sense so we don't optimize this
332 // case
333 match existing.guarantee {
334 // knew that the column could not be a set of values
335 //
336 // For example, if we previously had `a != 5` and now we see
337 // another `AND a != 6` we know that a must not be either 5 or 6
338 // for the expression to be true
339 Guarantee::NotIn => {
340 let new_values: HashSet<_> = new_values.into_iter().collect();
341 existing.literals.extend(new_values.into_iter().cloned());
342 }
343 Guarantee::In => {
344 let intersection = new_values
345 .into_iter()
346 .filter(|new_value| existing.literals.contains(*new_value))
347 .collect::<Vec<_>>();
348 // for an In guarantee, if the intersection is not empty, we can extend the guarantee
349 // e.g. `a IN (1,2,3) AND a IN (2,3,4)` is `a IN (2,3)`
350 // otherwise, we invalidate the guarantee
351 // e.g. `a IN (1,2,3) AND a IN (4,5,6)` is `a IN ()`, which is invalid
352 if !intersection.is_empty() {
353 existing.literals = intersection.into_iter().cloned().collect();
354 } else {
355 // at least one was not, so invalidate the guarantee
356 *entry = None;
357 }
358 }
359 }
360 } else {
361 // This is a new guarantee
362 let new_values: HashSet<_> = new_values.into_iter().collect();
363
364 let guarantee = LiteralGuarantee::new(col.name(), guarantee, new_values);
365 // add it to the list of guarantees
366 self.guarantees.push(Some(guarantee));
367 self.map.insert(key, self.guarantees.len() - 1);
368 }
369
370 self
371 }
372
373 /// Return all guarantees that have been created so far
374 fn build(self) -> Vec<LiteralGuarantee> {
375 // filter out any guarantees that have been invalidated
376 self.guarantees.into_iter().flatten().collect()
377 }
378}
379
380/// Represents a single `col [not]in literal` expression
381struct ColOpLit<'a> {
382 col: &'a crate::expressions::Column,
383 guarantee: Guarantee,
384 lit: &'a crate::expressions::Literal,
385}
386
387impl<'a> ColOpLit<'a> {
388 /// Returns Some(ColOpLit) if the expression is either:
389 /// 1. `col <op> literal`
390 /// 2. `literal <op> col`
391 /// 3. operator is `=` or `!=`
392 ///
393 /// Returns None otherwise
394 fn try_new(expr: &'a Arc<dyn PhysicalExpr>) -> Option<Self> {
395 let binary_expr = expr.downcast_ref::<crate::expressions::BinaryExpr>()?;
396
397 let (left, op, right) =
398 (binary_expr.left(), binary_expr.op(), binary_expr.right());
399 let guarantee = match op {
400 Operator::Eq => Guarantee::In,
401 Operator::NotEq => Guarantee::NotIn,
402 _ => return None,
403 };
404 // col <op> literal
405 if let (Some(col), Some(lit)) = (
406 left.downcast_ref::<crate::expressions::Column>(),
407 right.downcast_ref::<crate::expressions::Literal>(),
408 ) {
409 Some(Self {
410 col,
411 guarantee,
412 lit,
413 })
414 }
415 // literal <op> col
416 else if let (Some(lit), Some(col)) = (
417 left.downcast_ref::<crate::expressions::Literal>(),
418 right.downcast_ref::<crate::expressions::Column>(),
419 ) {
420 Some(Self {
421 col,
422 guarantee,
423 lit,
424 })
425 } else {
426 None
427 }
428 }
429}
430
431/// Represents a single `col [not]in literal` expression
432struct ColInList<'a> {
433 col: &'a crate::expressions::Column,
434 guarantee: Guarantee,
435 list: Vec<&'a crate::expressions::Literal>,
436}
437
438impl<'a> ColInList<'a> {
439 /// Returns Some(ColInList) if the expression is either:
440 /// 1. `col <op> (literal1, literal2, ...)`
441 /// 3. operator is `in` or `not in`
442 ///
443 /// Returns None otherwise
444 fn try_new(inlist: &'a crate::expressions::InListExpr) -> Option<Self> {
445 // Only support single-column inlist currently, multi-column inlist is not supported
446 let col = inlist.expr().downcast_ref::<crate::expressions::Column>()?;
447
448 let literals = inlist
449 .list()
450 .iter()
451 .map(|e| e.downcast_ref::<crate::expressions::Literal>())
452 .collect::<Option<Vec<_>>>()?;
453
454 let guarantee = if inlist.negated() {
455 Guarantee::NotIn
456 } else {
457 Guarantee::In
458 };
459
460 Some(Self {
461 col,
462 guarantee,
463 list: literals,
464 })
465 }
466}
467
468/// Represents a single `col [not]in literal` expression or a single `col <op> literal` expression
469enum ColOpLitOrInList<'a> {
470 ColOpLit(ColOpLit<'a>),
471 ColInList(ColInList<'a>),
472}
473
474impl<'a> ColOpLitOrInList<'a> {
475 fn try_new(expr: &'a Arc<dyn PhysicalExpr>) -> Option<Self> {
476 match expr.downcast_ref::<crate::expressions::InListExpr>() {
477 Some(inlist) => Some(Self::ColInList(ColInList::try_new(inlist)?)),
478 None => ColOpLit::try_new(expr).map(Self::ColOpLit),
479 }
480 }
481
482 fn guarantee(&self) -> Guarantee {
483 match self {
484 Self::ColOpLit(col_op_lit) => col_op_lit.guarantee,
485 Self::ColInList(col_in_list) => col_in_list.guarantee,
486 }
487 }
488
489 fn col(&self) -> &'a crate::expressions::Column {
490 match self {
491 Self::ColOpLit(col_op_lit) => col_op_lit.col,
492 Self::ColInList(col_in_list) => col_in_list.col,
493 }
494 }
495
496 fn lits(&self) -> Vec<&'a crate::expressions::Literal> {
497 match self {
498 Self::ColOpLit(col_op_lit) => vec![col_op_lit.lit],
499 Self::ColInList(col_in_list) => col_in_list.list.clone(),
500 }
501 }
502}
503
504/// Find columns that appear in all termsets
505fn find_common_columns<'a>(
506 termsets: &[Vec<ColOpLitOrInList<'a>>],
507) -> Vec<&'a crate::expressions::Column> {
508 if termsets.is_empty() {
509 return Vec::new();
510 }
511
512 // Start with columns from the first termset
513 let mut common_cols: HashSet<_> = termsets[0].iter().map(|term| term.col()).collect();
514
515 // check if any common_col in one termset occur many times
516 // e.g. (a = 1 AND a = 2) OR (a = 2 AND b = 3), should not infer a guarantee
517 // TODO: for above case, we can infer a IN (2) AND b IN (3)
518 if common_cols.len() != termsets[0].len() {
519 return Vec::new();
520 }
521
522 // Intersect with columns from remaining termsets
523 for termset in termsets.iter().skip(1) {
524 let termset_cols: HashSet<_> = termset.iter().map(|term| term.col()).collect();
525 if termset_cols.len() != termset.len() {
526 return Vec::new();
527 }
528 common_cols = common_cols.intersection(&termset_cols).cloned().collect();
529 }
530
531 common_cols.into_iter().collect()
532}
533
534#[cfg(test)]
535mod test {
536 use std::sync::LazyLock;
537
538 use super::*;
539 use crate::planner::logical2physical;
540
541 use arrow::datatypes::{DataType, Field, Schema, SchemaRef};
542 use datafusion_expr::expr_fn::*;
543 use datafusion_expr::{Expr, lit};
544
545 use itertools::Itertools;
546
547 #[test]
548 fn test_literal() {
549 // a single literal offers no guarantee
550 test_analyze(lit(true), vec![])
551 }
552
553 #[test]
554 fn test_single() {
555 // a = "foo"
556 test_analyze(col("a").eq(lit("foo")), vec![in_guarantee("a", ["foo"])]);
557 // "foo" = a
558 test_analyze(lit("foo").eq(col("a")), vec![in_guarantee("a", ["foo"])]);
559 // a != "foo"
560 test_analyze(
561 col("a").not_eq(lit("foo")),
562 vec![not_in_guarantee("a", ["foo"])],
563 );
564 // "foo" != a
565 test_analyze(
566 lit("foo").not_eq(col("a")),
567 vec![not_in_guarantee("a", ["foo"])],
568 );
569 }
570
571 #[test]
572 fn test_conjunction_single_column() {
573 // b = 1 AND b = 2. This is impossible. Ideally this expression could be simplified to false
574 test_analyze(col("b").eq(lit(1)).and(col("b").eq(lit(2))), vec![]);
575 // b = 1 AND b != 2 . In theory, this could be simplified to `b = 1`.
576 test_analyze(
577 col("b").eq(lit(1)).and(col("b").not_eq(lit(2))),
578 vec![
579 // can only be true of b is 1 and b is not 2 (even though it is redundant)
580 in_guarantee("b", [1]),
581 not_in_guarantee("b", [2]),
582 ],
583 );
584 // b != 1 AND b = 2. In theory, this could be simplified to `b = 2`.
585 test_analyze(
586 col("b").not_eq(lit(1)).and(col("b").eq(lit(2))),
587 vec![
588 // can only be true of b is not 1 and b is 2 (even though it is redundant)
589 not_in_guarantee("b", [1]),
590 in_guarantee("b", [2]),
591 ],
592 );
593 // b != 1 AND b != 2
594 test_analyze(
595 col("b").not_eq(lit(1)).and(col("b").not_eq(lit(2))),
596 vec![not_in_guarantee("b", [1, 2])],
597 );
598 // b != 1 AND b != 2 and b != 3
599 test_analyze(
600 col("b")
601 .not_eq(lit(1))
602 .and(col("b").not_eq(lit(2)))
603 .and(col("b").not_eq(lit(3))),
604 vec![not_in_guarantee("b", [1, 2, 3])],
605 );
606 // b != 1 AND b = 2 and b != 3. Can only be true if b is 2 and b is not in (1, 3)
607 test_analyze(
608 col("b")
609 .not_eq(lit(1))
610 .and(col("b").eq(lit(2)))
611 .and(col("b").not_eq(lit(3))),
612 vec![not_in_guarantee("b", [1, 3]), in_guarantee("b", [2])],
613 );
614 // b != 1 AND b != 2 and b = 3 (in theory could determine b = 3)
615 test_analyze(
616 col("b")
617 .not_eq(lit(1))
618 .and(col("b").not_eq(lit(2)))
619 .and(col("b").eq(lit(3))),
620 vec![not_in_guarantee("b", [1, 2]), in_guarantee("b", [3])],
621 );
622 // b != 1 AND b != 2 and b > 3 (to be true, b can't be either 1 or 2
623 test_analyze(
624 col("b")
625 .not_eq(lit(1))
626 .and(col("b").not_eq(lit(2)))
627 .and(col("b").gt(lit(3))),
628 vec![not_in_guarantee("b", [1, 2])],
629 );
630 }
631
632 #[test]
633 fn test_conjunction_multi_column() {
634 // a = "foo" AND b = 1
635 test_analyze(
636 col("a").eq(lit("foo")).and(col("b").eq(lit(1))),
637 vec![
638 // should find both column guarantees
639 in_guarantee("a", ["foo"]),
640 in_guarantee("b", [1]),
641 ],
642 );
643 // a != "foo" AND b != 1
644 test_analyze(
645 col("a").not_eq(lit("foo")).and(col("b").not_eq(lit(1))),
646 // should find both column guarantees
647 vec![not_in_guarantee("a", ["foo"]), not_in_guarantee("b", [1])],
648 );
649 // a = "foo" AND a = "bar"
650 test_analyze(
651 col("a").eq(lit("foo")).and(col("a").eq(lit("bar"))),
652 // this predicate is impossible ( can't be both foo and bar),
653 vec![],
654 );
655 // a = "foo" AND b != "bar"
656 test_analyze(
657 col("a").eq(lit("foo")).and(col("a").not_eq(lit("bar"))),
658 vec![in_guarantee("a", ["foo"]), not_in_guarantee("a", ["bar"])],
659 );
660 // a != "foo" AND a != "bar"
661 test_analyze(
662 col("a").not_eq(lit("foo")).and(col("a").not_eq(lit("bar"))),
663 // know it isn't "foo" or "bar"
664 vec![not_in_guarantee("a", ["foo", "bar"])],
665 );
666 // a != "foo" AND a != "bar" and a != "baz"
667 test_analyze(
668 col("a")
669 .not_eq(lit("foo"))
670 .and(col("a").not_eq(lit("bar")))
671 .and(col("a").not_eq(lit("baz"))),
672 // know it isn't "foo" or "bar" or "baz"
673 vec![not_in_guarantee("a", ["foo", "bar", "baz"])],
674 );
675 // a = "foo" AND a = "foo"
676 let expr = col("a").eq(lit("foo"));
677 test_analyze(expr.clone().and(expr), vec![in_guarantee("a", ["foo"])]);
678 // b > 5 AND b = 10 (should get an b = 10 guarantee)
679 test_analyze(
680 col("b").gt(lit(5)).and(col("b").eq(lit(10))),
681 vec![in_guarantee("b", [10])],
682 );
683 // b > 10 AND b = 10 (this is impossible)
684 test_analyze(
685 col("b").gt(lit(10)).and(col("b").eq(lit(10))),
686 vec![
687 // if b isn't 10, it can not be true (though the expression actually can never be true)
688 in_guarantee("b", [10]),
689 ],
690 );
691 // a != "foo" and (a != "bar" OR a != "baz")
692 test_analyze(
693 col("a")
694 .not_eq(lit("foo"))
695 .and(col("a").not_eq(lit("bar")).or(col("a").not_eq(lit("baz")))),
696 // a is not foo (we can't represent other knowledge about a)
697 vec![not_in_guarantee("a", ["foo"])],
698 );
699 }
700
701 #[test]
702 fn test_conjunction_and_disjunction_single_column() {
703 // b != 1 AND (b > 2)
704 test_analyze(
705 col("b").not_eq(lit(1)).and(col("b").gt(lit(2))),
706 vec![
707 // for the expression to be true, b can not be one
708 not_in_guarantee("b", [1]),
709 ],
710 );
711
712 // b = 1 AND (b = 2 OR b = 3). Could be simplified to false.
713 test_analyze(
714 col("b")
715 .eq(lit(1))
716 .and(col("b").eq(lit(2)).or(col("b").eq(lit(3)))),
717 vec![
718 // in theory, b must be 1 and one of 2,3 for this expression to be true
719 // which is a logical contradiction
720 ],
721 );
722 }
723
724 #[test]
725 fn test_disjunction_single_column() {
726 // b = 1 OR b = 2
727 test_analyze(
728 col("b").eq(lit(1)).or(col("b").eq(lit(2))),
729 vec![in_guarantee("b", [1, 2])],
730 );
731 // b != 1 OR b = 2
732 test_analyze(col("b").not_eq(lit(1)).or(col("b").eq(lit(2))), vec![]);
733 // b = 1 OR b != 2
734 test_analyze(col("b").eq(lit(1)).or(col("b").not_eq(lit(2))), vec![]);
735 // b != 1 OR b != 2
736 test_analyze(col("b").not_eq(lit(1)).or(col("b").not_eq(lit(2))), vec![]);
737 // b != 1 OR b != 2 OR b = 3 -- in theory could guarantee that b = 3
738 test_analyze(
739 col("b")
740 .not_eq(lit(1))
741 .or(col("b").not_eq(lit(2)))
742 .or(lit("b").eq(lit(3))),
743 vec![],
744 );
745 // b = 1 OR b = 2 OR b = 3
746 test_analyze(
747 col("b")
748 .eq(lit(1))
749 .or(col("b").eq(lit(2)))
750 .or(col("b").eq(lit(3))),
751 vec![in_guarantee("b", [1, 2, 3])],
752 );
753 // b = 1 OR b = 2 OR b > 3 -- can't guarantee that the expression is only true if a is in (1, 2)
754 test_analyze(
755 col("b")
756 .eq(lit(1))
757 .or(col("b").eq(lit(2)))
758 .or(lit("b").eq(lit(3))),
759 vec![],
760 );
761 }
762
763 #[test]
764 fn test_disjunction_multi_column() {
765 // a = "foo" OR b = 1
766 test_analyze(
767 col("a").eq(lit("foo")).or(col("b").eq(lit(1))),
768 // no can't have a single column guarantee (if a = "foo" then b != 1) etc
769 vec![],
770 );
771 // a != "foo" OR b != 1
772 test_analyze(
773 col("a").not_eq(lit("foo")).or(col("b").not_eq(lit(1))),
774 // No single column guarantee
775 vec![],
776 );
777 // a = "foo" OR a = "bar"
778 test_analyze(
779 col("a").eq(lit("foo")).or(col("a").eq(lit("bar"))),
780 vec![in_guarantee("a", ["foo", "bar"])],
781 );
782 // a = "foo" OR a = "foo"
783 test_analyze(
784 col("a").eq(lit("foo")).or(col("a").eq(lit("foo"))),
785 vec![in_guarantee("a", ["foo"])],
786 );
787 // a != "foo" OR a != "bar"
788 test_analyze(
789 col("a").not_eq(lit("foo")).or(col("a").not_eq(lit("bar"))),
790 // can't represent knowledge about a in this case
791 vec![],
792 );
793 // a = "foo" OR a = "bar" OR a = "baz"
794 test_analyze(
795 col("a")
796 .eq(lit("foo"))
797 .or(col("a").eq(lit("bar")))
798 .or(col("a").eq(lit("baz"))),
799 vec![in_guarantee("a", ["foo", "bar", "baz"])],
800 );
801 // (a = "foo" OR a = "bar") AND (a = "baz)"
802 test_analyze(
803 (col("a").eq(lit("foo")).or(col("a").eq(lit("bar"))))
804 .and(col("a").eq(lit("baz"))),
805 // this could potentially be represented as 2 constraints with a more
806 // sophisticated analysis
807 vec![],
808 );
809 // (a = "foo" OR a = "bar") AND (b = 1)
810 test_analyze(
811 (col("a").eq(lit("foo")).or(col("a").eq(lit("bar"))))
812 .and(col("b").eq(lit(1))),
813 vec![in_guarantee("a", ["foo", "bar"]), in_guarantee("b", [1])],
814 );
815 // (a = "foo" OR a = "bar") OR (b = 1)
816 test_analyze(
817 col("a")
818 .eq(lit("foo"))
819 .or(col("a").eq(lit("bar")))
820 .or(col("b").eq(lit(1))),
821 // can't represent knowledge about a or b in this case
822 vec![],
823 );
824 }
825
826 #[test]
827 fn test_single_inlist() {
828 // b IN (1, 2, 3)
829 test_analyze(
830 col("b").in_list(vec![lit(1), lit(2), lit(3)], false),
831 vec![in_guarantee("b", [1, 2, 3])],
832 );
833 // b NOT IN (1, 2, 3)
834 test_analyze(
835 col("b").in_list(vec![lit(1), lit(2), lit(3)], true),
836 vec![not_in_guarantee("b", [1, 2, 3])],
837 );
838 // b IN (1,2,3,4...24)
839 test_analyze(
840 col("b").in_list((1..25).map(lit).collect_vec(), false),
841 vec![in_guarantee("b", 1..25)],
842 );
843 }
844
845 #[test]
846 fn test_inlist_conjunction() {
847 // b IN (1, 2, 3) AND b IN (2, 3, 4)
848 test_analyze(
849 col("b")
850 .in_list(vec![lit(1), lit(2), lit(3)], false)
851 .and(col("b").in_list(vec![lit(2), lit(3), lit(4)], false)),
852 vec![in_guarantee("b", [2, 3])],
853 );
854 // b NOT IN (1, 2, 3) AND b IN (2, 3, 4)
855 test_analyze(
856 col("b")
857 .in_list(vec![lit(1), lit(2), lit(3)], true)
858 .and(col("b").in_list(vec![lit(2), lit(3), lit(4)], false)),
859 vec![
860 not_in_guarantee("b", [1, 2, 3]),
861 in_guarantee("b", [2, 3, 4]),
862 ],
863 );
864 // b NOT IN (1, 2, 3) AND b NOT IN (2, 3, 4)
865 test_analyze(
866 col("b")
867 .in_list(vec![lit(1), lit(2), lit(3)], true)
868 .and(col("b").in_list(vec![lit(2), lit(3), lit(4)], true)),
869 vec![not_in_guarantee("b", [1, 2, 3, 4])],
870 );
871 // b IN (1, 2, 3) AND b = 4
872 test_analyze(
873 col("b")
874 .in_list(vec![lit(1), lit(2), lit(3)], false)
875 .and(col("b").eq(lit(4))),
876 vec![],
877 );
878 // b IN (1, 2, 3) AND b = 2
879 test_analyze(
880 col("b")
881 .in_list(vec![lit(1), lit(2), lit(3)], false)
882 .and(col("b").eq(lit(2))),
883 vec![in_guarantee("b", [2])],
884 );
885 // b IN (1, 2, 3) AND b != 2
886 test_analyze(
887 col("b")
888 .in_list(vec![lit(1), lit(2), lit(3)], false)
889 .and(col("b").not_eq(lit(2))),
890 vec![in_guarantee("b", [1, 2, 3]), not_in_guarantee("b", [2])],
891 );
892 // b NOT IN (1, 2, 3) AND b != 4
893 test_analyze(
894 col("b")
895 .in_list(vec![lit(1), lit(2), lit(3)], true)
896 .and(col("b").not_eq(lit(4))),
897 vec![not_in_guarantee("b", [1, 2, 3, 4])],
898 );
899 // b NOT IN (1, 2, 3) AND b != 2
900 test_analyze(
901 col("b")
902 .in_list(vec![lit(1), lit(2), lit(3)], true)
903 .and(col("b").not_eq(lit(2))),
904 vec![not_in_guarantee("b", [1, 2, 3])],
905 );
906 }
907
908 #[test]
909 fn test_inlist_with_disjunction() {
910 // b IN (1, 2, 3) AND (b = 3 OR b = 4)
911 test_analyze(
912 col("b")
913 .in_list(vec![lit(1), lit(2), lit(3)], false)
914 .and(col("b").eq(lit(3)).or(col("b").eq(lit(4)))),
915 vec![in_guarantee("b", [3])],
916 );
917 // b IN (1, 2, 3) AND (b = 4 OR b = 5)
918 test_analyze(
919 col("b")
920 .in_list(vec![lit(1), lit(2), lit(3)], false)
921 .and(col("b").eq(lit(4)).or(col("b").eq(lit(5)))),
922 vec![],
923 );
924 // b NOT IN (1, 2, 3) AND (b = 3 OR b = 4)
925 test_analyze(
926 col("b")
927 .in_list(vec![lit(1), lit(2), lit(3)], true)
928 .and(col("b").eq(lit(3)).or(col("b").eq(lit(4)))),
929 vec![not_in_guarantee("b", [1, 2, 3]), in_guarantee("b", [3, 4])],
930 );
931 // b IN (1, 2, 3) OR b = 2
932 test_analyze(
933 col("b")
934 .in_list(vec![lit(1), lit(2), lit(3)], false)
935 .or(col("b").eq(lit(2))),
936 vec![in_guarantee("b", [1, 2, 3])],
937 );
938 // b IN (1, 2, 3) OR b != 3
939 test_analyze(
940 col("b")
941 .in_list(vec![lit(1), lit(2), lit(3)], false)
942 .or(col("b").not_eq(lit(3))),
943 vec![],
944 );
945 }
946
947 #[test]
948 fn test_disjunction_and_conjunction_multi_column() {
949 // (a = "foo" AND b = 1) OR (a = "bar" AND b = 2)
950 test_analyze(
951 (col("a").eq(lit("foo")).and(col("b").eq(lit(1))))
952 .or(col("a").eq(lit("bar")).and(col("b").eq(lit(2)))),
953 vec![in_guarantee("a", ["foo", "bar"]), in_guarantee("b", [1, 2])],
954 );
955 // (a = "foo" AND b = 1) OR (a = "bar" AND b = 2) OR (b = 3)
956 test_analyze(
957 (col("a").eq(lit("foo")).and(col("b").eq(lit(1))))
958 .or(col("a").eq(lit("bar")).and(col("b").eq(lit(2))))
959 .or(col("b").eq(lit(3))),
960 vec![in_guarantee("b", [1, 2, 3])],
961 );
962 // (a = "foo" AND b = 1) OR (a = "bar" AND b = 2) OR (c = 3)
963 test_analyze(
964 (col("a").eq(lit("foo")).and(col("b").eq(lit(1))))
965 .or(col("a").eq(lit("bar")).and(col("b").eq(lit(2))))
966 .or(col("c").eq(lit(3))),
967 vec![],
968 );
969 // (a = "foo" AND b > 1) OR (a = "bar" AND b = 2)
970 test_analyze(
971 (col("a").eq(lit("foo")).and(col("b").gt(lit(1))))
972 .or(col("a").eq(lit("bar")).and(col("b").eq(lit(2)))),
973 vec![in_guarantee("a", ["foo", "bar"])],
974 );
975 // (a = "foo" AND b = 1) OR (b = 1 AND c = 2) OR (c = 3 AND a = "bar")
976 test_analyze(
977 (col("a").eq(lit("foo")).and(col("b").eq(lit(1))))
978 .or(col("b").eq(lit(1)).and(col("c").eq(lit(2))))
979 .or(col("c").eq(lit(3)).and(col("a").eq(lit("bar")))),
980 vec![],
981 );
982 // (a = "foo" AND a = "bar") OR (a = "good" AND b = 1)
983 // TODO: this should be `a IN ("good") AND b IN (1)`
984 test_analyze(
985 (col("a").eq(lit("foo")).and(col("a").eq(lit("bar"))))
986 .or(col("a").eq(lit("good")).and(col("b").eq(lit(1)))),
987 vec![],
988 );
989 // (a = "foo" AND a = "foo") OR (a = "good" AND b = 1)
990 // TODO: this should be `a IN ("foo", "good")`
991 test_analyze(
992 (col("a").eq(lit("foo")).and(col("a").eq(lit("foo"))))
993 .or(col("a").eq(lit("good")).and(col("b").eq(lit(1)))),
994 vec![],
995 );
996 // (a = "foo" AND b = 3) OR (b = 4 AND b = 1) OR (b = 2 AND a = "bar")
997 test_analyze(
998 (col("a").eq(lit("foo")).and(col("b").eq(lit(3))))
999 .or(col("b").eq(lit(4)).and(col("b").eq(lit(1))))
1000 .or(col("b").eq(lit(2)).and(col("a").eq(lit("bar")))),
1001 vec![],
1002 );
1003 // (b = 1 AND b > 3) OR (a = "foo" AND b = 4)
1004 test_analyze(
1005 (col("b").eq(lit(1)).and(col("b").gt(lit(3))))
1006 .or(col("a").eq(lit("foo")).and(col("b").eq(lit(4)))),
1007 // if b isn't 1 or 4, it can not be true (though the expression actually can never be true)
1008 vec![in_guarantee("b", [1, 4])],
1009 );
1010 // (a = "foo" AND b = 1) OR (a != "bar" AND b = 2)
1011 test_analyze(
1012 (col("a").eq(lit("foo")).and(col("b").eq(lit(1))))
1013 .or(col("a").not_eq(lit("bar")).and(col("b").eq(lit(2)))),
1014 vec![in_guarantee("b", [1, 2])],
1015 );
1016 // (a = "foo" AND b = 1) OR (a LIKE "%bar" AND b = 2)
1017 test_analyze(
1018 (col("a").eq(lit("foo")).and(col("b").eq(lit(1))))
1019 .or(col("a").like(lit("%bar")).and(col("b").eq(lit(2)))),
1020 vec![in_guarantee("b", [1, 2])],
1021 );
1022 // (a IN ("foo", "bar") AND b = 5) OR (a IN ("foo", "bar") AND b = 6)
1023 test_analyze(
1024 (col("a")
1025 .in_list(vec![lit("foo"), lit("bar")], false)
1026 .and(col("b").eq(lit(5))))
1027 .or(col("a")
1028 .in_list(vec![lit("foo"), lit("bar")], false)
1029 .and(col("b").eq(lit(6)))),
1030 vec![in_guarantee("a", ["foo", "bar"]), in_guarantee("b", [5, 6])],
1031 );
1032 // (a IN ("foo", "bar") AND b = 5) OR (a IN ("foo") AND b = 6)
1033 test_analyze(
1034 (col("a")
1035 .in_list(vec![lit("foo"), lit("bar")], false)
1036 .and(col("b").eq(lit(5))))
1037 .or(col("a")
1038 .in_list(vec![lit("foo")], false)
1039 .and(col("b").eq(lit(6)))),
1040 vec![in_guarantee("a", ["foo", "bar"]), in_guarantee("b", [5, 6])],
1041 );
1042 // (a NOT IN ("foo", "bar") AND b = 5) OR (a NOT IN ("foo") AND b = 6)
1043 test_analyze(
1044 (col("a")
1045 .in_list(vec![lit("foo"), lit("bar")], true)
1046 .and(col("b").eq(lit(5))))
1047 .or(col("a")
1048 .in_list(vec![lit("foo")], true)
1049 .and(col("b").eq(lit(6)))),
1050 vec![in_guarantee("b", [5, 6])],
1051 );
1052 }
1053
1054 /// Tests that analyzing expr results in the expected guarantees
1055 fn test_analyze(expr: Expr, expected: Vec<LiteralGuarantee>) {
1056 println!("Begin analyze of {expr}");
1057 let schema = schema();
1058 let physical_expr = logical2physical(&expr, &schema);
1059
1060 let actual = LiteralGuarantee::analyze(&physical_expr)
1061 .into_iter()
1062 .sorted_by_key(|g| g.column.name().to_string())
1063 .collect::<Vec<_>>();
1064 assert_eq!(
1065 expected, actual,
1066 "expr: {expr}\
1067 \n\nexpected: {expected:#?}\
1068 \n\nactual: {actual:#?}\
1069 \n\nexpr: {expr:#?}\
1070 \n\nphysical_expr: {physical_expr:#?}"
1071 );
1072 }
1073
1074 /// Guarantee that the expression is true if the column is one of the specified values
1075 fn in_guarantee<'a, I, S>(column: &str, literals: I) -> LiteralGuarantee
1076 where
1077 I: IntoIterator<Item = S>,
1078 S: Into<ScalarValue> + 'a,
1079 {
1080 let literals: Vec<_> = literals.into_iter().map(|s| s.into()).collect();
1081 LiteralGuarantee::new(column, Guarantee::In, literals.iter())
1082 }
1083
1084 /// Guarantee that the expression is true if the column is NOT any of the specified values
1085 fn not_in_guarantee<'a, I, S>(column: &str, literals: I) -> LiteralGuarantee
1086 where
1087 I: IntoIterator<Item = S>,
1088 S: Into<ScalarValue> + 'a,
1089 {
1090 let literals: Vec<_> = literals.into_iter().map(|s| s.into()).collect();
1091 LiteralGuarantee::new(column, Guarantee::NotIn, literals.iter())
1092 }
1093
1094 // Schema for testing
1095 fn schema() -> SchemaRef {
1096 static SCHEMA: LazyLock<SchemaRef> = LazyLock::new(|| {
1097 Arc::new(Schema::new(vec![
1098 Field::new("a", DataType::Utf8, false),
1099 Field::new("b", DataType::Int32, false),
1100 Field::new("c", DataType::Int32, false),
1101 ]))
1102 });
1103 Arc::clone(&SCHEMA)
1104 }
1105}