datafusion_physical_expr/utils/
guarantee.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! [`LiteralGuarantee`] predicate analysis to determine if a column is a
19//! constant.
20
21use crate::utils::split_disjunction;
22use crate::{split_conjunction, PhysicalExpr};
23use datafusion_common::{Column, HashMap, ScalarValue};
24use datafusion_expr::Operator;
25use std::collections::HashSet;
26use std::fmt::{self, Display, Formatter};
27use std::sync::Arc;
28
29/// Represents a guarantee that must be true for a boolean expression to
30/// evaluate to `true`.
31///
32/// The guarantee takes the form of a column and a set of literal (constant)
33/// [`ScalarValue`]s. For the expression to evaluate to `true`, the column *must
34/// satisfy* the guarantee(s).
35///
36/// To satisfy the guarantee, depending on [`Guarantee`], the values in the
37/// column must either:
38///
39/// 1. be ONLY one of that set
40/// 2. NOT be ANY of that set
41///
42/// # Uses `LiteralGuarantee`s
43///
44/// `LiteralGuarantee`s can be used to simplify filter expressions and skip data
45/// files (e.g. row groups in parquet files) by proving expressions can not
46/// possibly evaluate to `true`. For example, if we have a guarantee that `a`
47/// must be in (`1`) for a filter to evaluate to `true`, then we can skip any
48/// partition where we know that `a` never has the value of `1`.
49///
50/// **Important**: If a `LiteralGuarantee` is not satisfied, the relevant
51/// expression is *guaranteed* to evaluate to `false` or `null`. **However**,
52/// the opposite does not hold. Even if all `LiteralGuarantee`s are satisfied,
53/// that does **not** guarantee that the predicate will actually evaluate to
54/// `true`: it may still evaluate to `true`, `false` or `null`.
55///
56/// # Creating `LiteralGuarantee`s
57///
58/// Use [`LiteralGuarantee::analyze`] to extract literal guarantees from a
59/// filter predicate.
60///
61/// # Details
62/// A guarantee can be one of two forms:
63///
64/// 1. The column must be one the values for the predicate to be `true`. If the
65///    column takes on any other value, the predicate can not evaluate to `true`.
66///    For example,
67///    `(a = 1)`, `(a = 1 OR a = 2)` or `a IN (1, 2, 3)`
68///
69/// 2. The column must NOT be one of the values for the predicate to be `true`.
70///    If the column can ONLY take one of these values, the predicate can not
71///    evaluate to `true`. For example,
72///    `(a != 1)`, `(a != 1 AND a != 2)` or `a NOT IN (1, 2, 3)`
73#[derive(Debug, Clone, PartialEq)]
74pub struct LiteralGuarantee {
75    pub column: Column,
76    pub guarantee: Guarantee,
77    pub literals: HashSet<ScalarValue>,
78}
79
80/// What is guaranteed about the values for a [`LiteralGuarantee`]?
81#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
82pub enum Guarantee {
83    /// Guarantee that the expression is `true` if `column` is one of the values. If
84    /// `column` is not one of the values, the expression can not be `true`.
85    In,
86    /// Guarantee that the expression is `true` if `column` is not ANY of the
87    /// values. If `column` only takes one of these values, the expression can
88    /// not be `true`.
89    NotIn,
90}
91
92impl LiteralGuarantee {
93    /// Create a new instance of the guarantee if the provided operator is
94    /// supported. Returns None otherwise. See [`LiteralGuarantee::analyze`] to
95    /// create these structures from an predicate (boolean expression).
96    fn new<'a>(
97        column_name: impl Into<String>,
98        guarantee: Guarantee,
99        literals: impl IntoIterator<Item = &'a ScalarValue>,
100    ) -> Self {
101        let literals: HashSet<_> = literals.into_iter().cloned().collect();
102
103        Self {
104            column: Column::from_name(column_name),
105            guarantee,
106            literals,
107        }
108    }
109
110    /// Return a list of [`LiteralGuarantee`]s that must be satisfied for `expr`
111    /// to evaluate to `true`.
112    ///
113    /// If more than one `LiteralGuarantee` is returned, they must **all** hold
114    /// for the expression to possibly be `true`. If any is not satisfied, the
115    /// expression is guaranteed to be `null` or `false`.
116    ///
117    /// # Notes:
118    /// 1. `expr` must be a boolean expression or inlist expression.
119    /// 2. `expr` is not simplified prior to analysis.
120    pub fn analyze(expr: &Arc<dyn PhysicalExpr>) -> Vec<LiteralGuarantee> {
121        // split conjunction: <expr> AND <expr> AND ...
122        split_conjunction(expr)
123            .into_iter()
124            // for an `AND` conjunction to be true, all terms individually must be true
125            .fold(GuaranteeBuilder::new(), |builder, expr| {
126                if let Some(cel) = ColOpLit::try_new(expr) {
127                    builder.aggregate_conjunct(cel)
128                } else if let Some(inlist) = expr
129                    .as_any()
130                    .downcast_ref::<crate::expressions::InListExpr>()
131                {
132                    if let Some(inlist) = ColInList::try_new(inlist) {
133                        builder.aggregate_multi_conjunct(
134                            inlist.col,
135                            inlist.guarantee,
136                            inlist.list.iter().map(|lit| lit.value()),
137                        )
138                    } else {
139                        builder
140                    }
141                } else {
142                    // split disjunction: <expr> OR <expr> OR ...
143                    let disjunctions = split_disjunction(expr);
144
145                    // We are trying to add a guarantee that a column must be
146                    // in/not in a particular set of values for the expression
147                    // to evaluate to true.
148                    //
149                    // A disjunction is true, if at least one of the terms is be
150                    // true.
151                    //
152                    // Thus, we can infer a guarantee if all terms are of the
153                    // form `(col <op> literal) OR (col <op> literal) OR ...`.
154                    //
155                    // For example, we can infer that `a = 1 OR a = 2 OR a = 3`
156                    // is guaranteed to be true ONLY if a is in (`1`, `2` or `3`).
157                    //
158                    // However, for something like  `a = 1 OR a = 2 OR a < 0` we
159                    // **can't** guarantee that the predicate is only true if a
160                    // is in (`1`, `2`), as it could also be true if `a` were less
161                    // than zero.
162                    let terms = disjunctions
163                        .iter()
164                        .filter_map(|expr| ColOpLit::try_new(expr))
165                        .collect::<Vec<_>>();
166
167                    // if all terms are 'col <op> literal' with the same column
168                    // and operation we can infer any guarantees
169                    //
170                    // For those like (a != foo AND (a != bar OR a != baz)).
171                    // We can't combine the (a != bar OR a != baz) part, but
172                    // it also doesn't invalidate our knowledge that a !=
173                    // foo is required for the expression to be true.
174                    // So we can only create a multi value guarantee for `=`
175                    // (or a single value). (e.g. ignore `a != foo OR a != bar`)
176                    let first_term = terms.first();
177                    if !terms.is_empty()
178                        && terms.len() == disjunctions.len()
179                        && terms.iter().all(|term| {
180                            term.col.name() == first_term.unwrap().col.name()
181                                && term.guarantee == Guarantee::In
182                        })
183                    {
184                        builder.aggregate_multi_conjunct(
185                            first_term.unwrap().col,
186                            Guarantee::In,
187                            terms.iter().map(|term| term.lit.value()),
188                        )
189                    } else {
190                        // Handle disjunctions with conjunctions like (a = 1 AND b = 2) OR (a = 2 AND b = 3)
191                        // Extract termsets from each disjunction
192                        // if in each termset, they have same column, and the guarantee is In,
193                        // we can infer a guarantee for the column
194                        // e.g. (a = 1 AND b = 2) OR (a = 2 AND b = 3) is `a IN (1, 2) AND b IN (2, 3)`
195                        // otherwise, we can't infer a guarantee
196                        let termsets: Vec<Vec<ColOpLitOrInList>> = disjunctions
197                            .iter()
198                            .map(|expr| {
199                                split_conjunction(expr)
200                                    .into_iter()
201                                    .filter_map(ColOpLitOrInList::try_new)
202                                    .filter(|term| term.guarantee() == Guarantee::In)
203                                    .collect()
204                            })
205                            .collect();
206
207                        // Early return if any termset is empty (can't infer guarantees)
208                        if termsets.iter().any(|terms| terms.is_empty()) {
209                            return builder;
210                        }
211
212                        // Find columns that appear in all termsets
213                        let common_cols = find_common_columns(&termsets);
214                        if common_cols.is_empty() {
215                            return builder;
216                        }
217
218                        // Build guarantees for common columns
219                        let mut builder = builder;
220                        for col in common_cols {
221                            let literals: Vec<_> = termsets
222                                .iter()
223                                .filter_map(|terms| {
224                                    terms.iter().find(|term| term.col() == col).map(
225                                        |term| {
226                                            term.lits().into_iter().map(|lit| lit.value())
227                                        },
228                                    )
229                                })
230                                .flatten()
231                                .collect();
232
233                            builder = builder.aggregate_multi_conjunct(
234                                col,
235                                Guarantee::In,
236                                literals.into_iter(),
237                            );
238                        }
239
240                        builder
241                    }
242                }
243            })
244            .build()
245    }
246}
247
248impl Display for LiteralGuarantee {
249    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
250        let mut sorted_literals: Vec<_> =
251            self.literals.iter().map(|lit| lit.to_string()).collect();
252        sorted_literals.sort();
253        match self.guarantee {
254            Guarantee::In => write!(
255                f,
256                "{} in ({})",
257                self.column.name,
258                sorted_literals.join(", ")
259            ),
260            Guarantee::NotIn => write!(
261                f,
262                "{} not in ({})",
263                self.column.name,
264                sorted_literals.join(", ")
265            ),
266        }
267    }
268}
269
270/// Combines conjuncts (aka terms `AND`ed together) into [`LiteralGuarantee`]s,
271/// preserving insert order
272#[derive(Debug, Default)]
273struct GuaranteeBuilder<'a> {
274    /// List of guarantees that have been created so far
275    /// if we have determined a subsequent conjunct invalidates a guarantee
276    /// e.g. `a = foo AND a = bar` then the relevant guarantee will be None
277    guarantees: Vec<Option<LiteralGuarantee>>,
278
279    /// Key is the (column name, guarantee type)
280    /// Value is the index into `guarantees`
281    map: HashMap<(&'a crate::expressions::Column, Guarantee), usize>,
282}
283
284impl<'a> GuaranteeBuilder<'a> {
285    fn new() -> Self {
286        Default::default()
287    }
288
289    /// Aggregate a new single `AND col <op> literal` term to this builder
290    /// combining with existing guarantees if possible.
291    ///
292    /// # Examples
293    /// * `AND (a = 1)`: `a` is guaranteed to be 1
294    /// * `AND (a != 1)`: a is guaranteed to not be 1
295    fn aggregate_conjunct(self, col_op_lit: ColOpLit<'a>) -> Self {
296        self.aggregate_multi_conjunct(
297            col_op_lit.col,
298            col_op_lit.guarantee,
299            [col_op_lit.lit.value()],
300        )
301    }
302
303    /// Aggregates a new single column, multi literal term to this builder
304    /// combining with previously known guarantees if possible.
305    ///
306    /// # Examples
307    /// For the following examples, we can guarantee the expression is `true` if:
308    /// * `AND (a = 1 OR a = 2 OR a = 3)`: a is in (1, 2, or 3)
309    /// * `AND (a IN (1,2,3))`: a is in (1, 2, or 3)
310    /// * `AND (a != 1 OR a != 2 OR a != 3)`: a is not in (1, 2, or 3)
311    /// * `AND (a NOT IN (1,2,3))`: a is not in (1, 2, or 3)
312    fn aggregate_multi_conjunct(
313        mut self,
314        col: &'a crate::expressions::Column,
315        guarantee: Guarantee,
316        new_values: impl IntoIterator<Item = &'a ScalarValue>,
317    ) -> Self {
318        let key = (col, guarantee);
319        if let Some(index) = self.map.get(&key) {
320            // already have a guarantee for this column
321            let entry = &mut self.guarantees[*index];
322
323            let Some(existing) = entry else {
324                // determined the previous guarantee for this column has been
325                // invalidated, nothing to do
326                return self;
327            };
328
329            // Combine conjuncts if we have `a != foo AND a != bar`. `a = foo
330            // AND a = bar` doesn't make logical sense so we don't optimize this
331            // case
332            match existing.guarantee {
333                // knew that the column could not be a set of values
334                //
335                // For example, if we previously had `a != 5` and now we see
336                // another `AND a != 6` we know that a must not be either 5 or 6
337                // for the expression to be true
338                Guarantee::NotIn => {
339                    let new_values: HashSet<_> = new_values.into_iter().collect();
340                    existing.literals.extend(new_values.into_iter().cloned());
341                }
342                Guarantee::In => {
343                    let intersection = new_values
344                        .into_iter()
345                        .filter(|new_value| existing.literals.contains(*new_value))
346                        .collect::<Vec<_>>();
347                    // for an In guarantee, if the intersection is not empty,  we can extend the guarantee
348                    // e.g. `a IN (1,2,3) AND a IN (2,3,4)` is `a IN (2,3)`
349                    // otherwise, we invalidate the guarantee
350                    // e.g. `a IN (1,2,3) AND a IN (4,5,6)` is `a IN ()`, which is invalid
351                    if !intersection.is_empty() {
352                        existing.literals = intersection.into_iter().cloned().collect();
353                    } else {
354                        // at least one was not, so invalidate the guarantee
355                        *entry = None;
356                    }
357                }
358            }
359        } else {
360            // This is a new guarantee
361            let new_values: HashSet<_> = new_values.into_iter().collect();
362
363            let guarantee = LiteralGuarantee::new(col.name(), guarantee, new_values);
364            // add it to the list of guarantees
365            self.guarantees.push(Some(guarantee));
366            self.map.insert(key, self.guarantees.len() - 1);
367        }
368
369        self
370    }
371
372    /// Return all guarantees that have been created so far
373    fn build(self) -> Vec<LiteralGuarantee> {
374        // filter out any guarantees that have been invalidated
375        self.guarantees.into_iter().flatten().collect()
376    }
377}
378
379/// Represents a single `col [not]in literal` expression
380struct ColOpLit<'a> {
381    col: &'a crate::expressions::Column,
382    guarantee: Guarantee,
383    lit: &'a crate::expressions::Literal,
384}
385
386impl<'a> ColOpLit<'a> {
387    /// Returns Some(ColOpLit) if the expression is either:
388    /// 1. `col <op> literal`
389    /// 2. `literal <op> col`
390    /// 3. operator is `=` or `!=`
391    ///
392    /// Returns None otherwise
393    fn try_new(expr: &'a Arc<dyn PhysicalExpr>) -> Option<Self> {
394        let binary_expr = expr
395            .as_any()
396            .downcast_ref::<crate::expressions::BinaryExpr>()?;
397
398        let (left, op, right) = (
399            binary_expr.left().as_any(),
400            binary_expr.op(),
401            binary_expr.right().as_any(),
402        );
403        let guarantee = match op {
404            Operator::Eq => Guarantee::In,
405            Operator::NotEq => Guarantee::NotIn,
406            _ => return None,
407        };
408        // col <op> literal
409        if let (Some(col), Some(lit)) = (
410            left.downcast_ref::<crate::expressions::Column>(),
411            right.downcast_ref::<crate::expressions::Literal>(),
412        ) {
413            Some(Self {
414                col,
415                guarantee,
416                lit,
417            })
418        }
419        // literal <op> col
420        else if let (Some(lit), Some(col)) = (
421            left.downcast_ref::<crate::expressions::Literal>(),
422            right.downcast_ref::<crate::expressions::Column>(),
423        ) {
424            Some(Self {
425                col,
426                guarantee,
427                lit,
428            })
429        } else {
430            None
431        }
432    }
433}
434
435/// Represents a single `col [not]in literal` expression
436struct ColInList<'a> {
437    col: &'a crate::expressions::Column,
438    guarantee: Guarantee,
439    list: Vec<&'a crate::expressions::Literal>,
440}
441
442impl<'a> ColInList<'a> {
443    /// Returns Some(ColInList) if the expression is either:
444    /// 1. `col <op> (literal1, literal2, ...)`
445    /// 3. operator is `in` or `not in`
446    ///
447    /// Returns None otherwise
448    fn try_new(inlist: &'a crate::expressions::InListExpr) -> Option<Self> {
449        // Only support single-column inlist currently, multi-column inlist is not supported
450        let col = inlist
451            .expr()
452            .as_any()
453            .downcast_ref::<crate::expressions::Column>()?;
454
455        let literals = inlist
456            .list()
457            .iter()
458            .map(|e| e.as_any().downcast_ref::<crate::expressions::Literal>())
459            .collect::<Option<Vec<_>>>()?;
460
461        let guarantee = if inlist.negated() {
462            Guarantee::NotIn
463        } else {
464            Guarantee::In
465        };
466
467        Some(Self {
468            col,
469            guarantee,
470            list: literals,
471        })
472    }
473}
474
475/// Represents a single `col [not]in literal` expression or a single `col <op> literal` expression
476enum ColOpLitOrInList<'a> {
477    ColOpLit(ColOpLit<'a>),
478    ColInList(ColInList<'a>),
479}
480
481impl<'a> ColOpLitOrInList<'a> {
482    fn try_new(expr: &'a Arc<dyn PhysicalExpr>) -> Option<Self> {
483        match expr
484            .as_any()
485            .downcast_ref::<crate::expressions::InListExpr>()
486        {
487            Some(inlist) => Some(Self::ColInList(ColInList::try_new(inlist)?)),
488            None => ColOpLit::try_new(expr).map(Self::ColOpLit),
489        }
490    }
491
492    fn guarantee(&self) -> Guarantee {
493        match self {
494            Self::ColOpLit(col_op_lit) => col_op_lit.guarantee,
495            Self::ColInList(col_in_list) => col_in_list.guarantee,
496        }
497    }
498
499    fn col(&self) -> &'a crate::expressions::Column {
500        match self {
501            Self::ColOpLit(col_op_lit) => col_op_lit.col,
502            Self::ColInList(col_in_list) => col_in_list.col,
503        }
504    }
505
506    fn lits(&self) -> Vec<&'a crate::expressions::Literal> {
507        match self {
508            Self::ColOpLit(col_op_lit) => vec![col_op_lit.lit],
509            Self::ColInList(col_in_list) => col_in_list.list.clone(),
510        }
511    }
512}
513
514/// Find columns that appear in all termsets
515fn find_common_columns<'a>(
516    termsets: &[Vec<ColOpLitOrInList<'a>>],
517) -> Vec<&'a crate::expressions::Column> {
518    if termsets.is_empty() {
519        return Vec::new();
520    }
521
522    // Start with columns from the first termset
523    let mut common_cols: HashSet<_> = termsets[0].iter().map(|term| term.col()).collect();
524
525    // check if any common_col in one termset occur many times
526    // e.g. (a = 1 AND a = 2) OR (a = 2 AND b = 3), should not infer a guarantee
527    // TODO: for above case, we can infer a IN (2) AND b IN (3)
528    if common_cols.len() != termsets[0].len() {
529        return Vec::new();
530    }
531
532    // Intersect with columns from remaining termsets
533    for termset in termsets.iter().skip(1) {
534        let termset_cols: HashSet<_> = termset.iter().map(|term| term.col()).collect();
535        if termset_cols.len() != termset.len() {
536            return Vec::new();
537        }
538        common_cols = common_cols.intersection(&termset_cols).cloned().collect();
539    }
540
541    common_cols.into_iter().collect()
542}
543
544#[cfg(test)]
545mod test {
546    use std::sync::LazyLock;
547
548    use super::*;
549    use crate::planner::logical2physical;
550
551    use arrow::datatypes::{DataType, Field, Schema, SchemaRef};
552    use datafusion_expr::expr_fn::*;
553    use datafusion_expr::{lit, Expr};
554
555    use itertools::Itertools;
556
557    #[test]
558    fn test_literal() {
559        // a single literal offers no guarantee
560        test_analyze(lit(true), vec![])
561    }
562
563    #[test]
564    fn test_single() {
565        // a = "foo"
566        test_analyze(col("a").eq(lit("foo")), vec![in_guarantee("a", ["foo"])]);
567        // "foo" = a
568        test_analyze(lit("foo").eq(col("a")), vec![in_guarantee("a", ["foo"])]);
569        // a != "foo"
570        test_analyze(
571            col("a").not_eq(lit("foo")),
572            vec![not_in_guarantee("a", ["foo"])],
573        );
574        // "foo" != a
575        test_analyze(
576            lit("foo").not_eq(col("a")),
577            vec![not_in_guarantee("a", ["foo"])],
578        );
579    }
580
581    #[test]
582    fn test_conjunction_single_column() {
583        // b = 1 AND b = 2. This is impossible. Ideally this expression could be simplified to false
584        test_analyze(col("b").eq(lit(1)).and(col("b").eq(lit(2))), vec![]);
585        // b = 1 AND b != 2 . In theory, this could be simplified to `b = 1`.
586        test_analyze(
587            col("b").eq(lit(1)).and(col("b").not_eq(lit(2))),
588            vec![
589                // can only be true of b is 1 and b is not 2 (even though it is redundant)
590                in_guarantee("b", [1]),
591                not_in_guarantee("b", [2]),
592            ],
593        );
594        // b != 1 AND b = 2. In theory, this could be simplified to `b = 2`.
595        test_analyze(
596            col("b").not_eq(lit(1)).and(col("b").eq(lit(2))),
597            vec![
598                // can only be true of b is not 1 and b is 2 (even though it is redundant)
599                not_in_guarantee("b", [1]),
600                in_guarantee("b", [2]),
601            ],
602        );
603        // b != 1 AND b != 2
604        test_analyze(
605            col("b").not_eq(lit(1)).and(col("b").not_eq(lit(2))),
606            vec![not_in_guarantee("b", [1, 2])],
607        );
608        // b != 1 AND b != 2 and b != 3
609        test_analyze(
610            col("b")
611                .not_eq(lit(1))
612                .and(col("b").not_eq(lit(2)))
613                .and(col("b").not_eq(lit(3))),
614            vec![not_in_guarantee("b", [1, 2, 3])],
615        );
616        // b != 1 AND b = 2 and b != 3. Can only be true if b is 2 and b is not in (1, 3)
617        test_analyze(
618            col("b")
619                .not_eq(lit(1))
620                .and(col("b").eq(lit(2)))
621                .and(col("b").not_eq(lit(3))),
622            vec![not_in_guarantee("b", [1, 3]), in_guarantee("b", [2])],
623        );
624        // b != 1 AND b != 2 and b = 3 (in theory could determine b = 3)
625        test_analyze(
626            col("b")
627                .not_eq(lit(1))
628                .and(col("b").not_eq(lit(2)))
629                .and(col("b").eq(lit(3))),
630            vec![not_in_guarantee("b", [1, 2]), in_guarantee("b", [3])],
631        );
632        // b != 1 AND b != 2 and b > 3 (to be true, b can't be either 1 or 2
633        test_analyze(
634            col("b")
635                .not_eq(lit(1))
636                .and(col("b").not_eq(lit(2)))
637                .and(col("b").gt(lit(3))),
638            vec![not_in_guarantee("b", [1, 2])],
639        );
640    }
641
642    #[test]
643    fn test_conjunction_multi_column() {
644        // a = "foo" AND b = 1
645        test_analyze(
646            col("a").eq(lit("foo")).and(col("b").eq(lit(1))),
647            vec![
648                // should find both column guarantees
649                in_guarantee("a", ["foo"]),
650                in_guarantee("b", [1]),
651            ],
652        );
653        // a != "foo" AND b != 1
654        test_analyze(
655            col("a").not_eq(lit("foo")).and(col("b").not_eq(lit(1))),
656            // should find both column guarantees
657            vec![not_in_guarantee("a", ["foo"]), not_in_guarantee("b", [1])],
658        );
659        // a = "foo" AND a = "bar"
660        test_analyze(
661            col("a").eq(lit("foo")).and(col("a").eq(lit("bar"))),
662            // this predicate is impossible ( can't be both foo and bar),
663            vec![],
664        );
665        // a = "foo" AND b != "bar"
666        test_analyze(
667            col("a").eq(lit("foo")).and(col("a").not_eq(lit("bar"))),
668            vec![in_guarantee("a", ["foo"]), not_in_guarantee("a", ["bar"])],
669        );
670        // a != "foo" AND a != "bar"
671        test_analyze(
672            col("a").not_eq(lit("foo")).and(col("a").not_eq(lit("bar"))),
673            // know it isn't "foo" or "bar"
674            vec![not_in_guarantee("a", ["foo", "bar"])],
675        );
676        // a != "foo" AND a != "bar" and a != "baz"
677        test_analyze(
678            col("a")
679                .not_eq(lit("foo"))
680                .and(col("a").not_eq(lit("bar")))
681                .and(col("a").not_eq(lit("baz"))),
682            // know it isn't "foo" or "bar" or "baz"
683            vec![not_in_guarantee("a", ["foo", "bar", "baz"])],
684        );
685        // a = "foo" AND a = "foo"
686        let expr = col("a").eq(lit("foo"));
687        test_analyze(expr.clone().and(expr), vec![in_guarantee("a", ["foo"])]);
688        // b > 5 AND b = 10 (should get an b = 10 guarantee)
689        test_analyze(
690            col("b").gt(lit(5)).and(col("b").eq(lit(10))),
691            vec![in_guarantee("b", [10])],
692        );
693        // b > 10 AND b = 10 (this is impossible)
694        test_analyze(
695            col("b").gt(lit(10)).and(col("b").eq(lit(10))),
696            vec![
697                //  if b isn't 10, it can not be true (though the expression actually can never be true)
698                in_guarantee("b", [10]),
699            ],
700        );
701        // a != "foo" and (a != "bar" OR a != "baz")
702        test_analyze(
703            col("a")
704                .not_eq(lit("foo"))
705                .and(col("a").not_eq(lit("bar")).or(col("a").not_eq(lit("baz")))),
706            // a is not foo (we can't represent other knowledge about a)
707            vec![not_in_guarantee("a", ["foo"])],
708        );
709    }
710
711    #[test]
712    fn test_conjunction_and_disjunction_single_column() {
713        // b != 1 AND (b > 2)
714        test_analyze(
715            col("b").not_eq(lit(1)).and(col("b").gt(lit(2))),
716            vec![
717                // for the expression to be true, b can not be one
718                not_in_guarantee("b", [1]),
719            ],
720        );
721
722        // b = 1 AND (b = 2 OR b = 3). Could be simplified to false.
723        test_analyze(
724            col("b")
725                .eq(lit(1))
726                .and(col("b").eq(lit(2)).or(col("b").eq(lit(3)))),
727            vec![
728                // in theory, b must be 1 and one of 2,3 for this expression to be true
729                // which is a logical contradiction
730            ],
731        );
732    }
733
734    #[test]
735    fn test_disjunction_single_column() {
736        // b = 1 OR b = 2
737        test_analyze(
738            col("b").eq(lit(1)).or(col("b").eq(lit(2))),
739            vec![in_guarantee("b", [1, 2])],
740        );
741        // b != 1 OR b = 2
742        test_analyze(col("b").not_eq(lit(1)).or(col("b").eq(lit(2))), vec![]);
743        // b = 1 OR b != 2
744        test_analyze(col("b").eq(lit(1)).or(col("b").not_eq(lit(2))), vec![]);
745        // b != 1 OR b != 2
746        test_analyze(col("b").not_eq(lit(1)).or(col("b").not_eq(lit(2))), vec![]);
747        // b != 1 OR b != 2 OR b = 3 -- in theory could guarantee that b = 3
748        test_analyze(
749            col("b")
750                .not_eq(lit(1))
751                .or(col("b").not_eq(lit(2)))
752                .or(lit("b").eq(lit(3))),
753            vec![],
754        );
755        // b = 1 OR b = 2 OR b = 3
756        test_analyze(
757            col("b")
758                .eq(lit(1))
759                .or(col("b").eq(lit(2)))
760                .or(col("b").eq(lit(3))),
761            vec![in_guarantee("b", [1, 2, 3])],
762        );
763        // b = 1 OR b = 2 OR b > 3 -- can't guarantee that the expression is only true if a is in (1, 2)
764        test_analyze(
765            col("b")
766                .eq(lit(1))
767                .or(col("b").eq(lit(2)))
768                .or(lit("b").eq(lit(3))),
769            vec![],
770        );
771    }
772
773    #[test]
774    fn test_disjunction_multi_column() {
775        // a = "foo" OR b = 1
776        test_analyze(
777            col("a").eq(lit("foo")).or(col("b").eq(lit(1))),
778            // no can't have a single column guarantee (if a = "foo" then b != 1) etc
779            vec![],
780        );
781        // a != "foo" OR b != 1
782        test_analyze(
783            col("a").not_eq(lit("foo")).or(col("b").not_eq(lit(1))),
784            // No single column guarantee
785            vec![],
786        );
787        // a = "foo" OR a = "bar"
788        test_analyze(
789            col("a").eq(lit("foo")).or(col("a").eq(lit("bar"))),
790            vec![in_guarantee("a", ["foo", "bar"])],
791        );
792        // a = "foo" OR a = "foo"
793        test_analyze(
794            col("a").eq(lit("foo")).or(col("a").eq(lit("foo"))),
795            vec![in_guarantee("a", ["foo"])],
796        );
797        // a != "foo" OR a != "bar"
798        test_analyze(
799            col("a").not_eq(lit("foo")).or(col("a").not_eq(lit("bar"))),
800            // can't represent knowledge about a in this case
801            vec![],
802        );
803        // a = "foo" OR a = "bar" OR a = "baz"
804        test_analyze(
805            col("a")
806                .eq(lit("foo"))
807                .or(col("a").eq(lit("bar")))
808                .or(col("a").eq(lit("baz"))),
809            vec![in_guarantee("a", ["foo", "bar", "baz"])],
810        );
811        // (a = "foo" OR a = "bar") AND (a = "baz)"
812        test_analyze(
813            (col("a").eq(lit("foo")).or(col("a").eq(lit("bar"))))
814                .and(col("a").eq(lit("baz"))),
815            // this could potentially be represented as 2 constraints with a more
816            // sophisticated analysis
817            vec![],
818        );
819        // (a = "foo" OR a = "bar") AND (b = 1)
820        test_analyze(
821            (col("a").eq(lit("foo")).or(col("a").eq(lit("bar"))))
822                .and(col("b").eq(lit(1))),
823            vec![in_guarantee("a", ["foo", "bar"]), in_guarantee("b", [1])],
824        );
825        // (a = "foo" OR a = "bar") OR (b = 1)
826        test_analyze(
827            col("a")
828                .eq(lit("foo"))
829                .or(col("a").eq(lit("bar")))
830                .or(col("b").eq(lit(1))),
831            // can't represent knowledge about a or b in this case
832            vec![],
833        );
834    }
835
836    #[test]
837    fn test_single_inlist() {
838        // b IN (1, 2, 3)
839        test_analyze(
840            col("b").in_list(vec![lit(1), lit(2), lit(3)], false),
841            vec![in_guarantee("b", [1, 2, 3])],
842        );
843        // b NOT IN (1, 2, 3)
844        test_analyze(
845            col("b").in_list(vec![lit(1), lit(2), lit(3)], true),
846            vec![not_in_guarantee("b", [1, 2, 3])],
847        );
848        // b IN (1,2,3,4...24)
849        test_analyze(
850            col("b").in_list((1..25).map(lit).collect_vec(), false),
851            vec![in_guarantee("b", 1..25)],
852        );
853    }
854
855    #[test]
856    fn test_inlist_conjunction() {
857        // b IN (1, 2, 3) AND b IN (2, 3, 4)
858        test_analyze(
859            col("b")
860                .in_list(vec![lit(1), lit(2), lit(3)], false)
861                .and(col("b").in_list(vec![lit(2), lit(3), lit(4)], false)),
862            vec![in_guarantee("b", [2, 3])],
863        );
864        // b NOT IN (1, 2, 3) AND b IN (2, 3, 4)
865        test_analyze(
866            col("b")
867                .in_list(vec![lit(1), lit(2), lit(3)], true)
868                .and(col("b").in_list(vec![lit(2), lit(3), lit(4)], false)),
869            vec![
870                not_in_guarantee("b", [1, 2, 3]),
871                in_guarantee("b", [2, 3, 4]),
872            ],
873        );
874        // b NOT IN (1, 2, 3) AND b NOT IN (2, 3, 4)
875        test_analyze(
876            col("b")
877                .in_list(vec![lit(1), lit(2), lit(3)], true)
878                .and(col("b").in_list(vec![lit(2), lit(3), lit(4)], true)),
879            vec![not_in_guarantee("b", [1, 2, 3, 4])],
880        );
881        // b IN (1, 2, 3) AND b = 4
882        test_analyze(
883            col("b")
884                .in_list(vec![lit(1), lit(2), lit(3)], false)
885                .and(col("b").eq(lit(4))),
886            vec![],
887        );
888        // b IN (1, 2, 3) AND b = 2
889        test_analyze(
890            col("b")
891                .in_list(vec![lit(1), lit(2), lit(3)], false)
892                .and(col("b").eq(lit(2))),
893            vec![in_guarantee("b", [2])],
894        );
895        // b IN (1, 2, 3) AND b != 2
896        test_analyze(
897            col("b")
898                .in_list(vec![lit(1), lit(2), lit(3)], false)
899                .and(col("b").not_eq(lit(2))),
900            vec![in_guarantee("b", [1, 2, 3]), not_in_guarantee("b", [2])],
901        );
902        // b NOT IN (1, 2, 3) AND b != 4
903        test_analyze(
904            col("b")
905                .in_list(vec![lit(1), lit(2), lit(3)], true)
906                .and(col("b").not_eq(lit(4))),
907            vec![not_in_guarantee("b", [1, 2, 3, 4])],
908        );
909        // b NOT IN (1, 2, 3) AND b != 2
910        test_analyze(
911            col("b")
912                .in_list(vec![lit(1), lit(2), lit(3)], true)
913                .and(col("b").not_eq(lit(2))),
914            vec![not_in_guarantee("b", [1, 2, 3])],
915        );
916    }
917
918    #[test]
919    fn test_inlist_with_disjunction() {
920        // b IN (1, 2, 3) AND (b = 3 OR b = 4)
921        test_analyze(
922            col("b")
923                .in_list(vec![lit(1), lit(2), lit(3)], false)
924                .and(col("b").eq(lit(3)).or(col("b").eq(lit(4)))),
925            vec![in_guarantee("b", [3])],
926        );
927        // b IN (1, 2, 3) AND (b = 4 OR b = 5)
928        test_analyze(
929            col("b")
930                .in_list(vec![lit(1), lit(2), lit(3)], false)
931                .and(col("b").eq(lit(4)).or(col("b").eq(lit(5)))),
932            vec![],
933        );
934        // b NOT IN (1, 2, 3) AND (b = 3 OR b = 4)
935        test_analyze(
936            col("b")
937                .in_list(vec![lit(1), lit(2), lit(3)], true)
938                .and(col("b").eq(lit(3)).or(col("b").eq(lit(4)))),
939            vec![not_in_guarantee("b", [1, 2, 3]), in_guarantee("b", [3, 4])],
940        );
941        // b IN (1, 2, 3) OR b = 2
942        test_analyze(
943            col("b")
944                .in_list(vec![lit(1), lit(2), lit(3)], false)
945                .or(col("b").eq(lit(2))),
946            vec![in_guarantee("b", [1, 2, 3])],
947        );
948        // b IN (1, 2, 3) OR b != 3
949        test_analyze(
950            col("b")
951                .in_list(vec![lit(1), lit(2), lit(3)], false)
952                .or(col("b").not_eq(lit(3))),
953            vec![],
954        );
955    }
956
957    #[test]
958    fn test_disjunction_and_conjunction_multi_column() {
959        // (a = "foo" AND b = 1) OR (a = "bar" AND b = 2)
960        test_analyze(
961            (col("a").eq(lit("foo")).and(col("b").eq(lit(1))))
962                .or(col("a").eq(lit("bar")).and(col("b").eq(lit(2)))),
963            vec![in_guarantee("a", ["foo", "bar"]), in_guarantee("b", [1, 2])],
964        );
965        // (a = "foo" AND b = 1) OR (a = "bar" AND b = 2) OR (b = 3)
966        test_analyze(
967            (col("a").eq(lit("foo")).and(col("b").eq(lit(1))))
968                .or(col("a").eq(lit("bar")).and(col("b").eq(lit(2))))
969                .or(col("b").eq(lit(3))),
970            vec![in_guarantee("b", [1, 2, 3])],
971        );
972        // (a = "foo" AND b = 1) OR (a = "bar" AND b = 2) OR (c = 3)
973        test_analyze(
974            (col("a").eq(lit("foo")).and(col("b").eq(lit(1))))
975                .or(col("a").eq(lit("bar")).and(col("b").eq(lit(2))))
976                .or(col("c").eq(lit(3))),
977            vec![],
978        );
979        // (a = "foo" AND b > 1) OR (a = "bar" AND b = 2)
980        test_analyze(
981            (col("a").eq(lit("foo")).and(col("b").gt(lit(1))))
982                .or(col("a").eq(lit("bar")).and(col("b").eq(lit(2)))),
983            vec![in_guarantee("a", ["foo", "bar"])],
984        );
985        // (a = "foo" AND b = 1) OR (b = 1 AND c = 2) OR (c = 3 AND a = "bar")
986        test_analyze(
987            (col("a").eq(lit("foo")).and(col("b").eq(lit(1))))
988                .or(col("b").eq(lit(1)).and(col("c").eq(lit(2))))
989                .or(col("c").eq(lit(3)).and(col("a").eq(lit("bar")))),
990            vec![],
991        );
992        // (a = "foo" AND a = "bar") OR (a = "good" AND b = 1)
993        // TODO: this should be `a IN ("good") AND b IN (1)`
994        test_analyze(
995            (col("a").eq(lit("foo")).and(col("a").eq(lit("bar"))))
996                .or(col("a").eq(lit("good")).and(col("b").eq(lit(1)))),
997            vec![],
998        );
999        // (a = "foo" AND a = "foo") OR (a = "good" AND b = 1)
1000        // TODO: this should be `a IN ("foo", "good")`
1001        test_analyze(
1002            (col("a").eq(lit("foo")).and(col("a").eq(lit("foo"))))
1003                .or(col("a").eq(lit("good")).and(col("b").eq(lit(1)))),
1004            vec![],
1005        );
1006        // (a = "foo" AND b = 3) OR (b = 4 AND b = 1) OR (b = 2 AND a = "bar")
1007        test_analyze(
1008            (col("a").eq(lit("foo")).and(col("b").eq(lit(3))))
1009                .or(col("b").eq(lit(4)).and(col("b").eq(lit(1))))
1010                .or(col("b").eq(lit(2)).and(col("a").eq(lit("bar")))),
1011            vec![],
1012        );
1013        // (b = 1 AND b > 3) OR (a = "foo" AND b = 4)
1014        test_analyze(
1015            (col("b").eq(lit(1)).and(col("b").gt(lit(3))))
1016                .or(col("a").eq(lit("foo")).and(col("b").eq(lit(4)))),
1017            // if b isn't 1 or 4, it can not be true (though the expression actually can never be true)
1018            vec![in_guarantee("b", [1, 4])],
1019        );
1020        // (a = "foo" AND b = 1) OR (a != "bar" AND b = 2)
1021        test_analyze(
1022            (col("a").eq(lit("foo")).and(col("b").eq(lit(1))))
1023                .or(col("a").not_eq(lit("bar")).and(col("b").eq(lit(2)))),
1024            vec![in_guarantee("b", [1, 2])],
1025        );
1026        // (a = "foo" AND b = 1) OR (a LIKE "%bar" AND b = 2)
1027        test_analyze(
1028            (col("a").eq(lit("foo")).and(col("b").eq(lit(1))))
1029                .or(col("a").like(lit("%bar")).and(col("b").eq(lit(2)))),
1030            vec![in_guarantee("b", [1, 2])],
1031        );
1032        // (a IN ("foo", "bar") AND b = 5) OR (a IN ("foo", "bar") AND b = 6)
1033        test_analyze(
1034            (col("a")
1035                .in_list(vec![lit("foo"), lit("bar")], false)
1036                .and(col("b").eq(lit(5))))
1037            .or(col("a")
1038                .in_list(vec![lit("foo"), lit("bar")], false)
1039                .and(col("b").eq(lit(6)))),
1040            vec![in_guarantee("a", ["foo", "bar"]), in_guarantee("b", [5, 6])],
1041        );
1042        // (a IN ("foo", "bar") AND b = 5) OR (a IN ("foo") AND b = 6)
1043        test_analyze(
1044            (col("a")
1045                .in_list(vec![lit("foo"), lit("bar")], false)
1046                .and(col("b").eq(lit(5))))
1047            .or(col("a")
1048                .in_list(vec![lit("foo")], false)
1049                .and(col("b").eq(lit(6)))),
1050            vec![in_guarantee("a", ["foo", "bar"]), in_guarantee("b", [5, 6])],
1051        );
1052        // (a NOT IN ("foo", "bar") AND b = 5) OR (a NOT IN ("foo") AND b = 6)
1053        test_analyze(
1054            (col("a")
1055                .in_list(vec![lit("foo"), lit("bar")], true)
1056                .and(col("b").eq(lit(5))))
1057            .or(col("a")
1058                .in_list(vec![lit("foo")], true)
1059                .and(col("b").eq(lit(6)))),
1060            vec![in_guarantee("b", [5, 6])],
1061        );
1062    }
1063
1064    /// Tests that analyzing expr results in the expected guarantees
1065    fn test_analyze(expr: Expr, expected: Vec<LiteralGuarantee>) {
1066        println!("Begin analyze of {expr}");
1067        let schema = schema();
1068        let physical_expr = logical2physical(&expr, &schema);
1069
1070        let actual = LiteralGuarantee::analyze(&physical_expr)
1071            .into_iter()
1072            .sorted_by_key(|g| g.column.name().to_string())
1073            .collect::<Vec<_>>();
1074        assert_eq!(
1075            expected, actual,
1076            "expr: {expr}\
1077               \n\nexpected: {expected:#?}\
1078               \n\nactual: {actual:#?}\
1079               \n\nexpr: {expr:#?}\
1080               \n\nphysical_expr: {physical_expr:#?}"
1081        );
1082    }
1083
1084    /// Guarantee that the expression is true if the column is one of the specified values
1085    fn in_guarantee<'a, I, S>(column: &str, literals: I) -> LiteralGuarantee
1086    where
1087        I: IntoIterator<Item = S>,
1088        S: Into<ScalarValue> + 'a,
1089    {
1090        let literals: Vec<_> = literals.into_iter().map(|s| s.into()).collect();
1091        LiteralGuarantee::new(column, Guarantee::In, literals.iter())
1092    }
1093
1094    /// Guarantee that the expression is true if the column is NOT any of the specified values
1095    fn not_in_guarantee<'a, I, S>(column: &str, literals: I) -> LiteralGuarantee
1096    where
1097        I: IntoIterator<Item = S>,
1098        S: Into<ScalarValue> + 'a,
1099    {
1100        let literals: Vec<_> = literals.into_iter().map(|s| s.into()).collect();
1101        LiteralGuarantee::new(column, Guarantee::NotIn, literals.iter())
1102    }
1103
1104    // Schema for testing
1105    fn schema() -> SchemaRef {
1106        static SCHEMA: LazyLock<SchemaRef> = LazyLock::new(|| {
1107            Arc::new(Schema::new(vec![
1108                Field::new("a", DataType::Utf8, false),
1109                Field::new("b", DataType::Int32, false),
1110                Field::new("c", DataType::Int32, false),
1111            ]))
1112        });
1113        Arc::clone(&SCHEMA)
1114    }
1115}