Skip to main content

datafusion_physical_expr/utils/
guarantee.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! [`LiteralGuarantee`] predicate analysis to determine if a column is a
19//! constant.
20
21use crate::utils::split_disjunction;
22use crate::{PhysicalExpr, split_conjunction};
23use datafusion_common::{Column, HashMap, ScalarValue};
24use datafusion_expr::Operator;
25use std::collections::HashSet;
26use std::fmt::{self, Display, Formatter};
27use std::sync::Arc;
28
29/// Represents a guarantee that must be true for a boolean expression to
30/// evaluate to `true`.
31///
32/// The guarantee takes the form of a column and a set of literal (constant)
33/// [`ScalarValue`]s. For the expression to evaluate to `true`, the column *must
34/// satisfy* the guarantee(s).
35///
36/// To satisfy the guarantee, depending on [`Guarantee`], the values in the
37/// column must either:
38///
39/// 1. be ONLY one of that set
40/// 2. NOT be ANY of that set
41///
42/// # Uses `LiteralGuarantee`s
43///
44/// `LiteralGuarantee`s can be used to simplify filter expressions and skip data
45/// files (e.g. row groups in parquet files) by proving expressions can not
46/// possibly evaluate to `true`. For example, if we have a guarantee that `a`
47/// must be in (`1`) for a filter to evaluate to `true`, then we can skip any
48/// partition where we know that `a` never has the value of `1`.
49///
50/// **Important**: If a `LiteralGuarantee` is not satisfied, the relevant
51/// expression is *guaranteed* to evaluate to `false` or `null`. **However**,
52/// the opposite does not hold. Even if all `LiteralGuarantee`s are satisfied,
53/// that does **not** guarantee that the predicate will actually evaluate to
54/// `true`: it may still evaluate to `true`, `false` or `null`.
55///
56/// # Creating `LiteralGuarantee`s
57///
58/// Use [`LiteralGuarantee::analyze`] to extract literal guarantees from a
59/// filter predicate.
60///
61/// # Details
62/// A guarantee can be one of two forms:
63///
64/// 1. The column must be one the values for the predicate to be `true`. If the
65///    column takes on any other value, the predicate can not evaluate to `true`.
66///    For example,
67///    `(a = 1)`, `(a = 1 OR a = 2)` or `a IN (1, 2, 3)`
68///
69/// 2. The column must NOT be one of the values for the predicate to be `true`.
70///    If the column can ONLY take one of these values, the predicate can not
71///    evaluate to `true`. For example,
72///    `(a != 1)`, `(a != 1 AND a != 2)` or `a NOT IN (1, 2, 3)`
73#[derive(Debug, Clone, PartialEq)]
74pub struct LiteralGuarantee {
75    pub column: Column,
76    pub guarantee: Guarantee,
77    pub literals: HashSet<ScalarValue>,
78}
79
80/// What is guaranteed about the values for a [`LiteralGuarantee`]?
81#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
82pub enum Guarantee {
83    /// Guarantee that the expression is `true` if `column` is one of the values. If
84    /// `column` is not one of the values, the expression can not be `true`.
85    In,
86    /// Guarantee that the expression is `true` if `column` is not ANY of the
87    /// values. If `column` only takes one of these values, the expression can
88    /// not be `true`.
89    NotIn,
90}
91
92impl LiteralGuarantee {
93    /// Create a new instance of the guarantee if the provided operator is
94    /// supported. Returns None otherwise. See [`LiteralGuarantee::analyze`] to
95    /// create these structures from an predicate (boolean expression).
96    #[allow(clippy::allow_attributes, clippy::mutable_key_type)] // ScalarValue has interior mutability but is intentionally used as hash key
97    fn new<'a>(
98        column_name: impl Into<String>,
99        guarantee: Guarantee,
100        literals: impl IntoIterator<Item = &'a ScalarValue>,
101    ) -> Self {
102        let literals: HashSet<_> = literals.into_iter().cloned().collect();
103
104        Self {
105            column: Column::from_name(column_name),
106            guarantee,
107            literals,
108        }
109    }
110
111    /// Return a list of [`LiteralGuarantee`]s that must be satisfied for `expr`
112    /// to evaluate to `true`.
113    ///
114    /// If more than one `LiteralGuarantee` is returned, they must **all** hold
115    /// for the expression to possibly be `true`. If any is not satisfied, the
116    /// expression is guaranteed to be `null` or `false`.
117    ///
118    /// # Notes:
119    /// 1. `expr` must be a boolean expression or inlist expression.
120    /// 2. `expr` is not simplified prior to analysis.
121    pub fn analyze(expr: &Arc<dyn PhysicalExpr>) -> Vec<LiteralGuarantee> {
122        // split conjunction: <expr> AND <expr> AND ...
123        split_conjunction(expr)
124            .into_iter()
125            // for an `AND` conjunction to be true, all terms individually must be true
126            .fold(GuaranteeBuilder::new(), |builder, expr| {
127                if let Some(cel) = ColOpLit::try_new(expr) {
128                    builder.aggregate_conjunct(&cel)
129                } else if let Some(inlist) =
130                    expr.downcast_ref::<crate::expressions::InListExpr>()
131                {
132                    if let Some(inlist) = ColInList::try_new(inlist) {
133                        builder.aggregate_multi_conjunct(
134                            inlist.col,
135                            inlist.guarantee,
136                            inlist.list.iter().map(|lit| lit.value()),
137                        )
138                    } else {
139                        builder
140                    }
141                } else {
142                    // split disjunction: <expr> OR <expr> OR ...
143                    let disjunctions = split_disjunction(expr);
144
145                    // We are trying to add a guarantee that a column must be
146                    // in/not in a particular set of values for the expression
147                    // to evaluate to true.
148                    //
149                    // A disjunction is true, if at least one of the terms is be
150                    // true.
151                    //
152                    // Thus, we can infer a guarantee if all terms are of the
153                    // form `(col <op> literal) OR (col <op> literal) OR ...`.
154                    //
155                    // For example, we can infer that `a = 1 OR a = 2 OR a = 3`
156                    // is guaranteed to be true ONLY if a is in (`1`, `2` or `3`).
157                    //
158                    // However, for something like  `a = 1 OR a = 2 OR a < 0` we
159                    // **can't** guarantee that the predicate is only true if a
160                    // is in (`1`, `2`), as it could also be true if `a` were less
161                    // than zero.
162                    let terms = disjunctions
163                        .iter()
164                        .filter_map(|expr| ColOpLit::try_new(expr))
165                        .collect::<Vec<_>>();
166
167                    // if all terms are 'col <op> literal' with the same column
168                    // and operation we can infer any guarantees
169                    //
170                    // For those like (a != foo AND (a != bar OR a != baz)).
171                    // We can't combine the (a != bar OR a != baz) part, but
172                    // it also doesn't invalidate our knowledge that a !=
173                    // foo is required for the expression to be true.
174                    // So we can only create a multi value guarantee for `=`
175                    // (or a single value). (e.g. ignore `a != foo OR a != bar`)
176                    let first_term = terms.first();
177                    if !terms.is_empty()
178                        && terms.len() == disjunctions.len()
179                        && terms.iter().all(|term| {
180                            term.col.name() == first_term.unwrap().col.name()
181                                && term.guarantee == Guarantee::In
182                        })
183                    {
184                        builder.aggregate_multi_conjunct(
185                            first_term.unwrap().col,
186                            Guarantee::In,
187                            terms.iter().map(|term| term.lit.value()),
188                        )
189                    } else {
190                        // Handle disjunctions with conjunctions like (a = 1 AND b = 2) OR (a = 2 AND b = 3)
191                        // Extract termsets from each disjunction
192                        // if in each termset, they have same column, and the guarantee is In,
193                        // we can infer a guarantee for the column
194                        // e.g. (a = 1 AND b = 2) OR (a = 2 AND b = 3) is `a IN (1, 2) AND b IN (2, 3)`
195                        // otherwise, we can't infer a guarantee
196                        let termsets: Vec<Vec<ColOpLitOrInList>> = disjunctions
197                            .iter()
198                            .map(|expr| {
199                                split_conjunction(expr)
200                                    .into_iter()
201                                    .filter_map(ColOpLitOrInList::try_new)
202                                    .filter(|term| term.guarantee() == Guarantee::In)
203                                    .collect()
204                            })
205                            .collect();
206
207                        // Early return if any termset is empty (can't infer guarantees)
208                        if termsets.iter().any(|terms| terms.is_empty()) {
209                            return builder;
210                        }
211
212                        // Find columns that appear in all termsets
213                        let common_cols = find_common_columns(&termsets);
214                        if common_cols.is_empty() {
215                            return builder;
216                        }
217
218                        // Build guarantees for common columns
219                        let mut builder = builder;
220                        for col in common_cols {
221                            let literals: Vec<_> = termsets
222                                .iter()
223                                .filter_map(|terms| {
224                                    terms.iter().find(|term| term.col() == col).map(
225                                        |term| {
226                                            term.lits().into_iter().map(|lit| lit.value())
227                                        },
228                                    )
229                                })
230                                .flatten()
231                                .collect();
232
233                            builder = builder.aggregate_multi_conjunct(
234                                col,
235                                Guarantee::In,
236                                literals,
237                            );
238                        }
239
240                        builder
241                    }
242                }
243            })
244            .build()
245    }
246}
247
248impl Display for LiteralGuarantee {
249    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
250        let mut sorted_literals: Vec<_> =
251            self.literals.iter().map(|lit| lit.to_string()).collect();
252        sorted_literals.sort();
253        match self.guarantee {
254            Guarantee::In => write!(
255                f,
256                "{} in ({})",
257                self.column.name,
258                sorted_literals.join(", ")
259            ),
260            Guarantee::NotIn => write!(
261                f,
262                "{} not in ({})",
263                self.column.name,
264                sorted_literals.join(", ")
265            ),
266        }
267    }
268}
269
270/// Combines conjuncts (aka terms `AND`ed together) into [`LiteralGuarantee`]s,
271/// preserving insert order
272#[derive(Debug, Default)]
273struct GuaranteeBuilder<'a> {
274    /// List of guarantees that have been created so far
275    /// if we have determined a subsequent conjunct invalidates a guarantee
276    /// e.g. `a = foo AND a = bar` then the relevant guarantee will be None
277    guarantees: Vec<Option<LiteralGuarantee>>,
278
279    /// Key is the (column name, guarantee type)
280    /// Value is the index into `guarantees`
281    map: HashMap<(&'a crate::expressions::Column, Guarantee), usize>,
282}
283
284impl<'a> GuaranteeBuilder<'a> {
285    fn new() -> Self {
286        Default::default()
287    }
288
289    /// Aggregate a new single `AND col <op> literal` term to this builder
290    /// combining with existing guarantees if possible.
291    ///
292    /// # Examples
293    /// * `AND (a = 1)`: `a` is guaranteed to be 1
294    /// * `AND (a != 1)`: a is guaranteed to not be 1
295    fn aggregate_conjunct(self, col_op_lit: &ColOpLit<'a>) -> Self {
296        self.aggregate_multi_conjunct(
297            col_op_lit.col,
298            col_op_lit.guarantee,
299            [col_op_lit.lit.value()],
300        )
301    }
302
303    /// Aggregates a new single column, multi literal term to this builder
304    /// combining with previously known guarantees if possible.
305    ///
306    /// # Examples
307    /// For the following examples, we can guarantee the expression is `true` if:
308    /// * `AND (a = 1 OR a = 2 OR a = 3)`: a is in (1, 2, or 3)
309    /// * `AND (a IN (1,2,3))`: a is in (1, 2, or 3)
310    /// * `AND (a != 1 OR a != 2 OR a != 3)`: a is not in (1, 2, or 3)
311    /// * `AND (a NOT IN (1,2,3))`: a is not in (1, 2, or 3)
312    #[allow(clippy::allow_attributes, clippy::mutable_key_type)] // ScalarValue has interior mutability but is intentionally used as hash key
313    fn aggregate_multi_conjunct(
314        mut self,
315        col: &'a crate::expressions::Column,
316        guarantee: Guarantee,
317        new_values: impl IntoIterator<Item = &'a ScalarValue>,
318    ) -> Self {
319        let key = (col, guarantee);
320        if let Some(index) = self.map.get(&key) {
321            // already have a guarantee for this column
322            let entry = &mut self.guarantees[*index];
323
324            let Some(existing) = entry else {
325                // determined the previous guarantee for this column has been
326                // invalidated, nothing to do
327                return self;
328            };
329
330            // Combine conjuncts if we have `a != foo AND a != bar`. `a = foo
331            // AND a = bar` doesn't make logical sense so we don't optimize this
332            // case
333            match existing.guarantee {
334                // knew that the column could not be a set of values
335                //
336                // For example, if we previously had `a != 5` and now we see
337                // another `AND a != 6` we know that a must not be either 5 or 6
338                // for the expression to be true
339                Guarantee::NotIn => {
340                    let new_values: HashSet<_> = new_values.into_iter().collect();
341                    existing.literals.extend(new_values.into_iter().cloned());
342                }
343                Guarantee::In => {
344                    let intersection = new_values
345                        .into_iter()
346                        .filter(|new_value| existing.literals.contains(*new_value))
347                        .collect::<Vec<_>>();
348                    // for an In guarantee, if the intersection is not empty,  we can extend the guarantee
349                    // e.g. `a IN (1,2,3) AND a IN (2,3,4)` is `a IN (2,3)`
350                    // otherwise, we invalidate the guarantee
351                    // e.g. `a IN (1,2,3) AND a IN (4,5,6)` is `a IN ()`, which is invalid
352                    if !intersection.is_empty() {
353                        existing.literals = intersection.into_iter().cloned().collect();
354                    } else {
355                        // at least one was not, so invalidate the guarantee
356                        *entry = None;
357                    }
358                }
359            }
360        } else {
361            // This is a new guarantee
362            let new_values: HashSet<_> = new_values.into_iter().collect();
363
364            let guarantee = LiteralGuarantee::new(col.name(), guarantee, new_values);
365            // add it to the list of guarantees
366            self.guarantees.push(Some(guarantee));
367            self.map.insert(key, self.guarantees.len() - 1);
368        }
369
370        self
371    }
372
373    /// Return all guarantees that have been created so far
374    fn build(self) -> Vec<LiteralGuarantee> {
375        // filter out any guarantees that have been invalidated
376        self.guarantees.into_iter().flatten().collect()
377    }
378}
379
380/// Represents a single `col [not]in literal` expression
381struct ColOpLit<'a> {
382    col: &'a crate::expressions::Column,
383    guarantee: Guarantee,
384    lit: &'a crate::expressions::Literal,
385}
386
387impl<'a> ColOpLit<'a> {
388    /// Returns Some(ColOpLit) if the expression is either:
389    /// 1. `col <op> literal`
390    /// 2. `literal <op> col`
391    /// 3. operator is `=` or `!=`
392    ///
393    /// Returns None otherwise
394    fn try_new(expr: &'a Arc<dyn PhysicalExpr>) -> Option<Self> {
395        let binary_expr = expr.downcast_ref::<crate::expressions::BinaryExpr>()?;
396
397        let (left, op, right) =
398            (binary_expr.left(), binary_expr.op(), binary_expr.right());
399        let guarantee = match op {
400            Operator::Eq => Guarantee::In,
401            Operator::NotEq => Guarantee::NotIn,
402            _ => return None,
403        };
404        // col <op> literal
405        if let (Some(col), Some(lit)) = (
406            left.downcast_ref::<crate::expressions::Column>(),
407            right.downcast_ref::<crate::expressions::Literal>(),
408        ) {
409            Some(Self {
410                col,
411                guarantee,
412                lit,
413            })
414        }
415        // literal <op> col
416        else if let (Some(lit), Some(col)) = (
417            left.downcast_ref::<crate::expressions::Literal>(),
418            right.downcast_ref::<crate::expressions::Column>(),
419        ) {
420            Some(Self {
421                col,
422                guarantee,
423                lit,
424            })
425        } else {
426            None
427        }
428    }
429}
430
431/// Represents a single `col [not]in literal` expression
432struct ColInList<'a> {
433    col: &'a crate::expressions::Column,
434    guarantee: Guarantee,
435    list: Vec<&'a crate::expressions::Literal>,
436}
437
438impl<'a> ColInList<'a> {
439    /// Returns Some(ColInList) if the expression is either:
440    /// 1. `col <op> (literal1, literal2, ...)`
441    /// 3. operator is `in` or `not in`
442    ///
443    /// Returns None otherwise
444    fn try_new(inlist: &'a crate::expressions::InListExpr) -> Option<Self> {
445        // Only support single-column inlist currently, multi-column inlist is not supported
446        let col = inlist.expr().downcast_ref::<crate::expressions::Column>()?;
447
448        let literals = inlist
449            .list()
450            .iter()
451            .map(|e| e.downcast_ref::<crate::expressions::Literal>())
452            .collect::<Option<Vec<_>>>()?;
453
454        let guarantee = if inlist.negated() {
455            Guarantee::NotIn
456        } else {
457            Guarantee::In
458        };
459
460        Some(Self {
461            col,
462            guarantee,
463            list: literals,
464        })
465    }
466}
467
468/// Represents a single `col [not]in literal` expression or a single `col <op> literal` expression
469enum ColOpLitOrInList<'a> {
470    ColOpLit(ColOpLit<'a>),
471    ColInList(ColInList<'a>),
472}
473
474impl<'a> ColOpLitOrInList<'a> {
475    fn try_new(expr: &'a Arc<dyn PhysicalExpr>) -> Option<Self> {
476        match expr.downcast_ref::<crate::expressions::InListExpr>() {
477            Some(inlist) => Some(Self::ColInList(ColInList::try_new(inlist)?)),
478            None => ColOpLit::try_new(expr).map(Self::ColOpLit),
479        }
480    }
481
482    fn guarantee(&self) -> Guarantee {
483        match self {
484            Self::ColOpLit(col_op_lit) => col_op_lit.guarantee,
485            Self::ColInList(col_in_list) => col_in_list.guarantee,
486        }
487    }
488
489    fn col(&self) -> &'a crate::expressions::Column {
490        match self {
491            Self::ColOpLit(col_op_lit) => col_op_lit.col,
492            Self::ColInList(col_in_list) => col_in_list.col,
493        }
494    }
495
496    fn lits(&self) -> Vec<&'a crate::expressions::Literal> {
497        match self {
498            Self::ColOpLit(col_op_lit) => vec![col_op_lit.lit],
499            Self::ColInList(col_in_list) => col_in_list.list.clone(),
500        }
501    }
502}
503
504/// Find columns that appear in all termsets
505fn find_common_columns<'a>(
506    termsets: &[Vec<ColOpLitOrInList<'a>>],
507) -> Vec<&'a crate::expressions::Column> {
508    if termsets.is_empty() {
509        return Vec::new();
510    }
511
512    // Start with columns from the first termset
513    let mut common_cols: HashSet<_> = termsets[0].iter().map(|term| term.col()).collect();
514
515    // check if any common_col in one termset occur many times
516    // e.g. (a = 1 AND a = 2) OR (a = 2 AND b = 3), should not infer a guarantee
517    // TODO: for above case, we can infer a IN (2) AND b IN (3)
518    if common_cols.len() != termsets[0].len() {
519        return Vec::new();
520    }
521
522    // Intersect with columns from remaining termsets
523    for termset in termsets.iter().skip(1) {
524        let termset_cols: HashSet<_> = termset.iter().map(|term| term.col()).collect();
525        if termset_cols.len() != termset.len() {
526            return Vec::new();
527        }
528        common_cols = common_cols.intersection(&termset_cols).cloned().collect();
529    }
530
531    common_cols.into_iter().collect()
532}
533
534#[cfg(test)]
535mod test {
536    use std::sync::LazyLock;
537
538    use super::*;
539    use crate::planner::logical2physical;
540
541    use arrow::datatypes::{DataType, Field, Schema, SchemaRef};
542    use datafusion_expr::expr_fn::*;
543    use datafusion_expr::{Expr, lit};
544
545    use itertools::Itertools;
546
547    #[test]
548    fn test_literal() {
549        // a single literal offers no guarantee
550        test_analyze(lit(true), vec![])
551    }
552
553    #[test]
554    fn test_single() {
555        // a = "foo"
556        test_analyze(col("a").eq(lit("foo")), vec![in_guarantee("a", ["foo"])]);
557        // "foo" = a
558        test_analyze(lit("foo").eq(col("a")), vec![in_guarantee("a", ["foo"])]);
559        // a != "foo"
560        test_analyze(
561            col("a").not_eq(lit("foo")),
562            vec![not_in_guarantee("a", ["foo"])],
563        );
564        // "foo" != a
565        test_analyze(
566            lit("foo").not_eq(col("a")),
567            vec![not_in_guarantee("a", ["foo"])],
568        );
569    }
570
571    #[test]
572    fn test_conjunction_single_column() {
573        // b = 1 AND b = 2. This is impossible. Ideally this expression could be simplified to false
574        test_analyze(col("b").eq(lit(1)).and(col("b").eq(lit(2))), vec![]);
575        // b = 1 AND b != 2 . In theory, this could be simplified to `b = 1`.
576        test_analyze(
577            col("b").eq(lit(1)).and(col("b").not_eq(lit(2))),
578            vec![
579                // can only be true of b is 1 and b is not 2 (even though it is redundant)
580                in_guarantee("b", [1]),
581                not_in_guarantee("b", [2]),
582            ],
583        );
584        // b != 1 AND b = 2. In theory, this could be simplified to `b = 2`.
585        test_analyze(
586            col("b").not_eq(lit(1)).and(col("b").eq(lit(2))),
587            vec![
588                // can only be true of b is not 1 and b is 2 (even though it is redundant)
589                not_in_guarantee("b", [1]),
590                in_guarantee("b", [2]),
591            ],
592        );
593        // b != 1 AND b != 2
594        test_analyze(
595            col("b").not_eq(lit(1)).and(col("b").not_eq(lit(2))),
596            vec![not_in_guarantee("b", [1, 2])],
597        );
598        // b != 1 AND b != 2 and b != 3
599        test_analyze(
600            col("b")
601                .not_eq(lit(1))
602                .and(col("b").not_eq(lit(2)))
603                .and(col("b").not_eq(lit(3))),
604            vec![not_in_guarantee("b", [1, 2, 3])],
605        );
606        // b != 1 AND b = 2 and b != 3. Can only be true if b is 2 and b is not in (1, 3)
607        test_analyze(
608            col("b")
609                .not_eq(lit(1))
610                .and(col("b").eq(lit(2)))
611                .and(col("b").not_eq(lit(3))),
612            vec![not_in_guarantee("b", [1, 3]), in_guarantee("b", [2])],
613        );
614        // b != 1 AND b != 2 and b = 3 (in theory could determine b = 3)
615        test_analyze(
616            col("b")
617                .not_eq(lit(1))
618                .and(col("b").not_eq(lit(2)))
619                .and(col("b").eq(lit(3))),
620            vec![not_in_guarantee("b", [1, 2]), in_guarantee("b", [3])],
621        );
622        // b != 1 AND b != 2 and b > 3 (to be true, b can't be either 1 or 2
623        test_analyze(
624            col("b")
625                .not_eq(lit(1))
626                .and(col("b").not_eq(lit(2)))
627                .and(col("b").gt(lit(3))),
628            vec![not_in_guarantee("b", [1, 2])],
629        );
630    }
631
632    #[test]
633    fn test_conjunction_multi_column() {
634        // a = "foo" AND b = 1
635        test_analyze(
636            col("a").eq(lit("foo")).and(col("b").eq(lit(1))),
637            vec![
638                // should find both column guarantees
639                in_guarantee("a", ["foo"]),
640                in_guarantee("b", [1]),
641            ],
642        );
643        // a != "foo" AND b != 1
644        test_analyze(
645            col("a").not_eq(lit("foo")).and(col("b").not_eq(lit(1))),
646            // should find both column guarantees
647            vec![not_in_guarantee("a", ["foo"]), not_in_guarantee("b", [1])],
648        );
649        // a = "foo" AND a = "bar"
650        test_analyze(
651            col("a").eq(lit("foo")).and(col("a").eq(lit("bar"))),
652            // this predicate is impossible ( can't be both foo and bar),
653            vec![],
654        );
655        // a = "foo" AND b != "bar"
656        test_analyze(
657            col("a").eq(lit("foo")).and(col("a").not_eq(lit("bar"))),
658            vec![in_guarantee("a", ["foo"]), not_in_guarantee("a", ["bar"])],
659        );
660        // a != "foo" AND a != "bar"
661        test_analyze(
662            col("a").not_eq(lit("foo")).and(col("a").not_eq(lit("bar"))),
663            // know it isn't "foo" or "bar"
664            vec![not_in_guarantee("a", ["foo", "bar"])],
665        );
666        // a != "foo" AND a != "bar" and a != "baz"
667        test_analyze(
668            col("a")
669                .not_eq(lit("foo"))
670                .and(col("a").not_eq(lit("bar")))
671                .and(col("a").not_eq(lit("baz"))),
672            // know it isn't "foo" or "bar" or "baz"
673            vec![not_in_guarantee("a", ["foo", "bar", "baz"])],
674        );
675        // a = "foo" AND a = "foo"
676        let expr = col("a").eq(lit("foo"));
677        test_analyze(expr.clone().and(expr), vec![in_guarantee("a", ["foo"])]);
678        // b > 5 AND b = 10 (should get an b = 10 guarantee)
679        test_analyze(
680            col("b").gt(lit(5)).and(col("b").eq(lit(10))),
681            vec![in_guarantee("b", [10])],
682        );
683        // b > 10 AND b = 10 (this is impossible)
684        test_analyze(
685            col("b").gt(lit(10)).and(col("b").eq(lit(10))),
686            vec![
687                //  if b isn't 10, it can not be true (though the expression actually can never be true)
688                in_guarantee("b", [10]),
689            ],
690        );
691        // a != "foo" and (a != "bar" OR a != "baz")
692        test_analyze(
693            col("a")
694                .not_eq(lit("foo"))
695                .and(col("a").not_eq(lit("bar")).or(col("a").not_eq(lit("baz")))),
696            // a is not foo (we can't represent other knowledge about a)
697            vec![not_in_guarantee("a", ["foo"])],
698        );
699    }
700
701    #[test]
702    fn test_conjunction_and_disjunction_single_column() {
703        // b != 1 AND (b > 2)
704        test_analyze(
705            col("b").not_eq(lit(1)).and(col("b").gt(lit(2))),
706            vec![
707                // for the expression to be true, b can not be one
708                not_in_guarantee("b", [1]),
709            ],
710        );
711
712        // b = 1 AND (b = 2 OR b = 3). Could be simplified to false.
713        test_analyze(
714            col("b")
715                .eq(lit(1))
716                .and(col("b").eq(lit(2)).or(col("b").eq(lit(3)))),
717            vec![
718                // in theory, b must be 1 and one of 2,3 for this expression to be true
719                // which is a logical contradiction
720            ],
721        );
722    }
723
724    #[test]
725    fn test_disjunction_single_column() {
726        // b = 1 OR b = 2
727        test_analyze(
728            col("b").eq(lit(1)).or(col("b").eq(lit(2))),
729            vec![in_guarantee("b", [1, 2])],
730        );
731        // b != 1 OR b = 2
732        test_analyze(col("b").not_eq(lit(1)).or(col("b").eq(lit(2))), vec![]);
733        // b = 1 OR b != 2
734        test_analyze(col("b").eq(lit(1)).or(col("b").not_eq(lit(2))), vec![]);
735        // b != 1 OR b != 2
736        test_analyze(col("b").not_eq(lit(1)).or(col("b").not_eq(lit(2))), vec![]);
737        // b != 1 OR b != 2 OR b = 3 -- in theory could guarantee that b = 3
738        test_analyze(
739            col("b")
740                .not_eq(lit(1))
741                .or(col("b").not_eq(lit(2)))
742                .or(lit("b").eq(lit(3))),
743            vec![],
744        );
745        // b = 1 OR b = 2 OR b = 3
746        test_analyze(
747            col("b")
748                .eq(lit(1))
749                .or(col("b").eq(lit(2)))
750                .or(col("b").eq(lit(3))),
751            vec![in_guarantee("b", [1, 2, 3])],
752        );
753        // b = 1 OR b = 2 OR b > 3 -- can't guarantee that the expression is only true if a is in (1, 2)
754        test_analyze(
755            col("b")
756                .eq(lit(1))
757                .or(col("b").eq(lit(2)))
758                .or(lit("b").eq(lit(3))),
759            vec![],
760        );
761    }
762
763    #[test]
764    fn test_disjunction_multi_column() {
765        // a = "foo" OR b = 1
766        test_analyze(
767            col("a").eq(lit("foo")).or(col("b").eq(lit(1))),
768            // no can't have a single column guarantee (if a = "foo" then b != 1) etc
769            vec![],
770        );
771        // a != "foo" OR b != 1
772        test_analyze(
773            col("a").not_eq(lit("foo")).or(col("b").not_eq(lit(1))),
774            // No single column guarantee
775            vec![],
776        );
777        // a = "foo" OR a = "bar"
778        test_analyze(
779            col("a").eq(lit("foo")).or(col("a").eq(lit("bar"))),
780            vec![in_guarantee("a", ["foo", "bar"])],
781        );
782        // a = "foo" OR a = "foo"
783        test_analyze(
784            col("a").eq(lit("foo")).or(col("a").eq(lit("foo"))),
785            vec![in_guarantee("a", ["foo"])],
786        );
787        // a != "foo" OR a != "bar"
788        test_analyze(
789            col("a").not_eq(lit("foo")).or(col("a").not_eq(lit("bar"))),
790            // can't represent knowledge about a in this case
791            vec![],
792        );
793        // a = "foo" OR a = "bar" OR a = "baz"
794        test_analyze(
795            col("a")
796                .eq(lit("foo"))
797                .or(col("a").eq(lit("bar")))
798                .or(col("a").eq(lit("baz"))),
799            vec![in_guarantee("a", ["foo", "bar", "baz"])],
800        );
801        // (a = "foo" OR a = "bar") AND (a = "baz)"
802        test_analyze(
803            (col("a").eq(lit("foo")).or(col("a").eq(lit("bar"))))
804                .and(col("a").eq(lit("baz"))),
805            // this could potentially be represented as 2 constraints with a more
806            // sophisticated analysis
807            vec![],
808        );
809        // (a = "foo" OR a = "bar") AND (b = 1)
810        test_analyze(
811            (col("a").eq(lit("foo")).or(col("a").eq(lit("bar"))))
812                .and(col("b").eq(lit(1))),
813            vec![in_guarantee("a", ["foo", "bar"]), in_guarantee("b", [1])],
814        );
815        // (a = "foo" OR a = "bar") OR (b = 1)
816        test_analyze(
817            col("a")
818                .eq(lit("foo"))
819                .or(col("a").eq(lit("bar")))
820                .or(col("b").eq(lit(1))),
821            // can't represent knowledge about a or b in this case
822            vec![],
823        );
824    }
825
826    #[test]
827    fn test_single_inlist() {
828        // b IN (1, 2, 3)
829        test_analyze(
830            col("b").in_list(vec![lit(1), lit(2), lit(3)], false),
831            vec![in_guarantee("b", [1, 2, 3])],
832        );
833        // b NOT IN (1, 2, 3)
834        test_analyze(
835            col("b").in_list(vec![lit(1), lit(2), lit(3)], true),
836            vec![not_in_guarantee("b", [1, 2, 3])],
837        );
838        // b IN (1,2,3,4...24)
839        test_analyze(
840            col("b").in_list((1..25).map(lit).collect_vec(), false),
841            vec![in_guarantee("b", 1..25)],
842        );
843    }
844
845    #[test]
846    fn test_inlist_conjunction() {
847        // b IN (1, 2, 3) AND b IN (2, 3, 4)
848        test_analyze(
849            col("b")
850                .in_list(vec![lit(1), lit(2), lit(3)], false)
851                .and(col("b").in_list(vec![lit(2), lit(3), lit(4)], false)),
852            vec![in_guarantee("b", [2, 3])],
853        );
854        // b NOT IN (1, 2, 3) AND b IN (2, 3, 4)
855        test_analyze(
856            col("b")
857                .in_list(vec![lit(1), lit(2), lit(3)], true)
858                .and(col("b").in_list(vec![lit(2), lit(3), lit(4)], false)),
859            vec![
860                not_in_guarantee("b", [1, 2, 3]),
861                in_guarantee("b", [2, 3, 4]),
862            ],
863        );
864        // b NOT IN (1, 2, 3) AND b NOT IN (2, 3, 4)
865        test_analyze(
866            col("b")
867                .in_list(vec![lit(1), lit(2), lit(3)], true)
868                .and(col("b").in_list(vec![lit(2), lit(3), lit(4)], true)),
869            vec![not_in_guarantee("b", [1, 2, 3, 4])],
870        );
871        // b IN (1, 2, 3) AND b = 4
872        test_analyze(
873            col("b")
874                .in_list(vec![lit(1), lit(2), lit(3)], false)
875                .and(col("b").eq(lit(4))),
876            vec![],
877        );
878        // b IN (1, 2, 3) AND b = 2
879        test_analyze(
880            col("b")
881                .in_list(vec![lit(1), lit(2), lit(3)], false)
882                .and(col("b").eq(lit(2))),
883            vec![in_guarantee("b", [2])],
884        );
885        // b IN (1, 2, 3) AND b != 2
886        test_analyze(
887            col("b")
888                .in_list(vec![lit(1), lit(2), lit(3)], false)
889                .and(col("b").not_eq(lit(2))),
890            vec![in_guarantee("b", [1, 2, 3]), not_in_guarantee("b", [2])],
891        );
892        // b NOT IN (1, 2, 3) AND b != 4
893        test_analyze(
894            col("b")
895                .in_list(vec![lit(1), lit(2), lit(3)], true)
896                .and(col("b").not_eq(lit(4))),
897            vec![not_in_guarantee("b", [1, 2, 3, 4])],
898        );
899        // b NOT IN (1, 2, 3) AND b != 2
900        test_analyze(
901            col("b")
902                .in_list(vec![lit(1), lit(2), lit(3)], true)
903                .and(col("b").not_eq(lit(2))),
904            vec![not_in_guarantee("b", [1, 2, 3])],
905        );
906    }
907
908    #[test]
909    fn test_inlist_with_disjunction() {
910        // b IN (1, 2, 3) AND (b = 3 OR b = 4)
911        test_analyze(
912            col("b")
913                .in_list(vec![lit(1), lit(2), lit(3)], false)
914                .and(col("b").eq(lit(3)).or(col("b").eq(lit(4)))),
915            vec![in_guarantee("b", [3])],
916        );
917        // b IN (1, 2, 3) AND (b = 4 OR b = 5)
918        test_analyze(
919            col("b")
920                .in_list(vec![lit(1), lit(2), lit(3)], false)
921                .and(col("b").eq(lit(4)).or(col("b").eq(lit(5)))),
922            vec![],
923        );
924        // b NOT IN (1, 2, 3) AND (b = 3 OR b = 4)
925        test_analyze(
926            col("b")
927                .in_list(vec![lit(1), lit(2), lit(3)], true)
928                .and(col("b").eq(lit(3)).or(col("b").eq(lit(4)))),
929            vec![not_in_guarantee("b", [1, 2, 3]), in_guarantee("b", [3, 4])],
930        );
931        // b IN (1, 2, 3) OR b = 2
932        test_analyze(
933            col("b")
934                .in_list(vec![lit(1), lit(2), lit(3)], false)
935                .or(col("b").eq(lit(2))),
936            vec![in_guarantee("b", [1, 2, 3])],
937        );
938        // b IN (1, 2, 3) OR b != 3
939        test_analyze(
940            col("b")
941                .in_list(vec![lit(1), lit(2), lit(3)], false)
942                .or(col("b").not_eq(lit(3))),
943            vec![],
944        );
945    }
946
947    #[test]
948    fn test_disjunction_and_conjunction_multi_column() {
949        // (a = "foo" AND b = 1) OR (a = "bar" AND b = 2)
950        test_analyze(
951            (col("a").eq(lit("foo")).and(col("b").eq(lit(1))))
952                .or(col("a").eq(lit("bar")).and(col("b").eq(lit(2)))),
953            vec![in_guarantee("a", ["foo", "bar"]), in_guarantee("b", [1, 2])],
954        );
955        // (a = "foo" AND b = 1) OR (a = "bar" AND b = 2) OR (b = 3)
956        test_analyze(
957            (col("a").eq(lit("foo")).and(col("b").eq(lit(1))))
958                .or(col("a").eq(lit("bar")).and(col("b").eq(lit(2))))
959                .or(col("b").eq(lit(3))),
960            vec![in_guarantee("b", [1, 2, 3])],
961        );
962        // (a = "foo" AND b = 1) OR (a = "bar" AND b = 2) OR (c = 3)
963        test_analyze(
964            (col("a").eq(lit("foo")).and(col("b").eq(lit(1))))
965                .or(col("a").eq(lit("bar")).and(col("b").eq(lit(2))))
966                .or(col("c").eq(lit(3))),
967            vec![],
968        );
969        // (a = "foo" AND b > 1) OR (a = "bar" AND b = 2)
970        test_analyze(
971            (col("a").eq(lit("foo")).and(col("b").gt(lit(1))))
972                .or(col("a").eq(lit("bar")).and(col("b").eq(lit(2)))),
973            vec![in_guarantee("a", ["foo", "bar"])],
974        );
975        // (a = "foo" AND b = 1) OR (b = 1 AND c = 2) OR (c = 3 AND a = "bar")
976        test_analyze(
977            (col("a").eq(lit("foo")).and(col("b").eq(lit(1))))
978                .or(col("b").eq(lit(1)).and(col("c").eq(lit(2))))
979                .or(col("c").eq(lit(3)).and(col("a").eq(lit("bar")))),
980            vec![],
981        );
982        // (a = "foo" AND a = "bar") OR (a = "good" AND b = 1)
983        // TODO: this should be `a IN ("good") AND b IN (1)`
984        test_analyze(
985            (col("a").eq(lit("foo")).and(col("a").eq(lit("bar"))))
986                .or(col("a").eq(lit("good")).and(col("b").eq(lit(1)))),
987            vec![],
988        );
989        // (a = "foo" AND a = "foo") OR (a = "good" AND b = 1)
990        // TODO: this should be `a IN ("foo", "good")`
991        test_analyze(
992            (col("a").eq(lit("foo")).and(col("a").eq(lit("foo"))))
993                .or(col("a").eq(lit("good")).and(col("b").eq(lit(1)))),
994            vec![],
995        );
996        // (a = "foo" AND b = 3) OR (b = 4 AND b = 1) OR (b = 2 AND a = "bar")
997        test_analyze(
998            (col("a").eq(lit("foo")).and(col("b").eq(lit(3))))
999                .or(col("b").eq(lit(4)).and(col("b").eq(lit(1))))
1000                .or(col("b").eq(lit(2)).and(col("a").eq(lit("bar")))),
1001            vec![],
1002        );
1003        // (b = 1 AND b > 3) OR (a = "foo" AND b = 4)
1004        test_analyze(
1005            (col("b").eq(lit(1)).and(col("b").gt(lit(3))))
1006                .or(col("a").eq(lit("foo")).and(col("b").eq(lit(4)))),
1007            // if b isn't 1 or 4, it can not be true (though the expression actually can never be true)
1008            vec![in_guarantee("b", [1, 4])],
1009        );
1010        // (a = "foo" AND b = 1) OR (a != "bar" AND b = 2)
1011        test_analyze(
1012            (col("a").eq(lit("foo")).and(col("b").eq(lit(1))))
1013                .or(col("a").not_eq(lit("bar")).and(col("b").eq(lit(2)))),
1014            vec![in_guarantee("b", [1, 2])],
1015        );
1016        // (a = "foo" AND b = 1) OR (a LIKE "%bar" AND b = 2)
1017        test_analyze(
1018            (col("a").eq(lit("foo")).and(col("b").eq(lit(1))))
1019                .or(col("a").like(lit("%bar")).and(col("b").eq(lit(2)))),
1020            vec![in_guarantee("b", [1, 2])],
1021        );
1022        // (a IN ("foo", "bar") AND b = 5) OR (a IN ("foo", "bar") AND b = 6)
1023        test_analyze(
1024            (col("a")
1025                .in_list(vec![lit("foo"), lit("bar")], false)
1026                .and(col("b").eq(lit(5))))
1027            .or(col("a")
1028                .in_list(vec![lit("foo"), lit("bar")], false)
1029                .and(col("b").eq(lit(6)))),
1030            vec![in_guarantee("a", ["foo", "bar"]), in_guarantee("b", [5, 6])],
1031        );
1032        // (a IN ("foo", "bar") AND b = 5) OR (a IN ("foo") AND b = 6)
1033        test_analyze(
1034            (col("a")
1035                .in_list(vec![lit("foo"), lit("bar")], false)
1036                .and(col("b").eq(lit(5))))
1037            .or(col("a")
1038                .in_list(vec![lit("foo")], false)
1039                .and(col("b").eq(lit(6)))),
1040            vec![in_guarantee("a", ["foo", "bar"]), in_guarantee("b", [5, 6])],
1041        );
1042        // (a NOT IN ("foo", "bar") AND b = 5) OR (a NOT IN ("foo") AND b = 6)
1043        test_analyze(
1044            (col("a")
1045                .in_list(vec![lit("foo"), lit("bar")], true)
1046                .and(col("b").eq(lit(5))))
1047            .or(col("a")
1048                .in_list(vec![lit("foo")], true)
1049                .and(col("b").eq(lit(6)))),
1050            vec![in_guarantee("b", [5, 6])],
1051        );
1052    }
1053
1054    /// Tests that analyzing expr results in the expected guarantees
1055    fn test_analyze(expr: Expr, expected: Vec<LiteralGuarantee>) {
1056        println!("Begin analyze of {expr}");
1057        let schema = schema();
1058        let physical_expr = logical2physical(&expr, &schema);
1059
1060        let actual = LiteralGuarantee::analyze(&physical_expr)
1061            .into_iter()
1062            .sorted_by_key(|g| g.column.name().to_string())
1063            .collect::<Vec<_>>();
1064        assert_eq!(
1065            expected, actual,
1066            "expr: {expr}\
1067               \n\nexpected: {expected:#?}\
1068               \n\nactual: {actual:#?}\
1069               \n\nexpr: {expr:#?}\
1070               \n\nphysical_expr: {physical_expr:#?}"
1071        );
1072    }
1073
1074    /// Guarantee that the expression is true if the column is one of the specified values
1075    fn in_guarantee<'a, I, S>(column: &str, literals: I) -> LiteralGuarantee
1076    where
1077        I: IntoIterator<Item = S>,
1078        S: Into<ScalarValue> + 'a,
1079    {
1080        let literals: Vec<_> = literals.into_iter().map(|s| s.into()).collect();
1081        LiteralGuarantee::new(column, Guarantee::In, literals.iter())
1082    }
1083
1084    /// Guarantee that the expression is true if the column is NOT any of the specified values
1085    fn not_in_guarantee<'a, I, S>(column: &str, literals: I) -> LiteralGuarantee
1086    where
1087        I: IntoIterator<Item = S>,
1088        S: Into<ScalarValue> + 'a,
1089    {
1090        let literals: Vec<_> = literals.into_iter().map(|s| s.into()).collect();
1091        LiteralGuarantee::new(column, Guarantee::NotIn, literals.iter())
1092    }
1093
1094    // Schema for testing
1095    fn schema() -> SchemaRef {
1096        static SCHEMA: LazyLock<SchemaRef> = LazyLock::new(|| {
1097            Arc::new(Schema::new(vec![
1098                Field::new("a", DataType::Utf8, false),
1099                Field::new("b", DataType::Int32, false),
1100                Field::new("c", DataType::Int32, false),
1101            ]))
1102        });
1103        Arc::clone(&SCHEMA)
1104    }
1105}