datafusion_physical_expr/utils/guarantee.rs
1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! [`LiteralGuarantee`] predicate analysis to determine if a column is a
19//! constant.
20
21use crate::utils::split_disjunction;
22use crate::{split_conjunction, PhysicalExpr};
23use datafusion_common::{Column, HashMap, ScalarValue};
24use datafusion_expr::Operator;
25use std::collections::HashSet;
26use std::fmt::{self, Display, Formatter};
27use std::sync::Arc;
28
29/// Represents a guarantee that must be true for a boolean expression to
30/// evaluate to `true`.
31///
32/// The guarantee takes the form of a column and a set of literal (constant)
33/// [`ScalarValue`]s. For the expression to evaluate to `true`, the column *must
34/// satisfy* the guarantee(s).
35///
36/// To satisfy the guarantee, depending on [`Guarantee`], the values in the
37/// column must either:
38///
39/// 1. be ONLY one of that set
40/// 2. NOT be ANY of that set
41///
42/// # Uses `LiteralGuarantee`s
43///
44/// `LiteralGuarantee`s can be used to simplify filter expressions and skip data
45/// files (e.g. row groups in parquet files) by proving expressions can not
46/// possibly evaluate to `true`. For example, if we have a guarantee that `a`
47/// must be in (`1`) for a filter to evaluate to `true`, then we can skip any
48/// partition where we know that `a` never has the value of `1`.
49///
50/// **Important**: If a `LiteralGuarantee` is not satisfied, the relevant
51/// expression is *guaranteed* to evaluate to `false` or `null`. **However**,
52/// the opposite does not hold. Even if all `LiteralGuarantee`s are satisfied,
53/// that does **not** guarantee that the predicate will actually evaluate to
54/// `true`: it may still evaluate to `true`, `false` or `null`.
55///
56/// # Creating `LiteralGuarantee`s
57///
58/// Use [`LiteralGuarantee::analyze`] to extract literal guarantees from a
59/// filter predicate.
60///
61/// # Details
62/// A guarantee can be one of two forms:
63///
64/// 1. The column must be one the values for the predicate to be `true`. If the
65/// column takes on any other value, the predicate can not evaluate to `true`.
66/// For example,
67/// `(a = 1)`, `(a = 1 OR a = 2)` or `a IN (1, 2, 3)`
68///
69/// 2. The column must NOT be one of the values for the predicate to be `true`.
70/// If the column can ONLY take one of these values, the predicate can not
71/// evaluate to `true`. For example,
72/// `(a != 1)`, `(a != 1 AND a != 2)` or `a NOT IN (1, 2, 3)`
73#[derive(Debug, Clone, PartialEq)]
74pub struct LiteralGuarantee {
75 pub column: Column,
76 pub guarantee: Guarantee,
77 pub literals: HashSet<ScalarValue>,
78}
79
80/// What is guaranteed about the values for a [`LiteralGuarantee`]?
81#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
82pub enum Guarantee {
83 /// Guarantee that the expression is `true` if `column` is one of the values. If
84 /// `column` is not one of the values, the expression can not be `true`.
85 In,
86 /// Guarantee that the expression is `true` if `column` is not ANY of the
87 /// values. If `column` only takes one of these values, the expression can
88 /// not be `true`.
89 NotIn,
90}
91
92impl LiteralGuarantee {
93 /// Create a new instance of the guarantee if the provided operator is
94 /// supported. Returns None otherwise. See [`LiteralGuarantee::analyze`] to
95 /// create these structures from an predicate (boolean expression).
96 fn new<'a>(
97 column_name: impl Into<String>,
98 guarantee: Guarantee,
99 literals: impl IntoIterator<Item = &'a ScalarValue>,
100 ) -> Self {
101 let literals: HashSet<_> = literals.into_iter().cloned().collect();
102
103 Self {
104 column: Column::from_name(column_name),
105 guarantee,
106 literals,
107 }
108 }
109
110 /// Return a list of [`LiteralGuarantee`]s that must be satisfied for `expr`
111 /// to evaluate to `true`.
112 ///
113 /// If more than one `LiteralGuarantee` is returned, they must **all** hold
114 /// for the expression to possibly be `true`. If any is not satisfied, the
115 /// expression is guaranteed to be `null` or `false`.
116 ///
117 /// # Notes:
118 /// 1. `expr` must be a boolean expression or inlist expression.
119 /// 2. `expr` is not simplified prior to analysis.
120 pub fn analyze(expr: &Arc<dyn PhysicalExpr>) -> Vec<LiteralGuarantee> {
121 // split conjunction: <expr> AND <expr> AND ...
122 split_conjunction(expr)
123 .into_iter()
124 // for an `AND` conjunction to be true, all terms individually must be true
125 .fold(GuaranteeBuilder::new(), |builder, expr| {
126 if let Some(cel) = ColOpLit::try_new(expr) {
127 builder.aggregate_conjunct(cel)
128 } else if let Some(inlist) = expr
129 .as_any()
130 .downcast_ref::<crate::expressions::InListExpr>()
131 {
132 if let Some(inlist) = ColInList::try_new(inlist) {
133 builder.aggregate_multi_conjunct(
134 inlist.col,
135 inlist.guarantee,
136 inlist.list.iter().map(|lit| lit.value()),
137 )
138 } else {
139 builder
140 }
141 } else {
142 // split disjunction: <expr> OR <expr> OR ...
143 let disjunctions = split_disjunction(expr);
144
145 // We are trying to add a guarantee that a column must be
146 // in/not in a particular set of values for the expression
147 // to evaluate to true.
148 //
149 // A disjunction is true, if at least one of the terms is be
150 // true.
151 //
152 // Thus, we can infer a guarantee if all terms are of the
153 // form `(col <op> literal) OR (col <op> literal) OR ...`.
154 //
155 // For example, we can infer that `a = 1 OR a = 2 OR a = 3`
156 // is guaranteed to be true ONLY if a is in (`1`, `2` or `3`).
157 //
158 // However, for something like `a = 1 OR a = 2 OR a < 0` we
159 // **can't** guarantee that the predicate is only true if a
160 // is in (`1`, `2`), as it could also be true if `a` were less
161 // than zero.
162 let terms = disjunctions
163 .iter()
164 .filter_map(|expr| ColOpLit::try_new(expr))
165 .collect::<Vec<_>>();
166
167 // if all terms are 'col <op> literal' with the same column
168 // and operation we can infer any guarantees
169 //
170 // For those like (a != foo AND (a != bar OR a != baz)).
171 // We can't combine the (a != bar OR a != baz) part, but
172 // it also doesn't invalidate our knowledge that a !=
173 // foo is required for the expression to be true.
174 // So we can only create a multi value guarantee for `=`
175 // (or a single value). (e.g. ignore `a != foo OR a != bar`)
176 let first_term = terms.first();
177 if !terms.is_empty()
178 && terms.len() == disjunctions.len()
179 && terms.iter().all(|term| {
180 term.col.name() == first_term.unwrap().col.name()
181 && term.guarantee == Guarantee::In
182 })
183 {
184 builder.aggregate_multi_conjunct(
185 first_term.unwrap().col,
186 Guarantee::In,
187 terms.iter().map(|term| term.lit.value()),
188 )
189 } else {
190 // Handle disjunctions with conjunctions like (a = 1 AND b = 2) OR (a = 2 AND b = 3)
191 // Extract termsets from each disjunction
192 // if in each termset, they have same column, and the guarantee is In,
193 // we can infer a guarantee for the column
194 // e.g. (a = 1 AND b = 2) OR (a = 2 AND b = 3) is `a IN (1, 2) AND b IN (2, 3)`
195 // otherwise, we can't infer a guarantee
196 let termsets: Vec<Vec<ColOpLitOrInList>> = disjunctions
197 .iter()
198 .map(|expr| {
199 split_conjunction(expr)
200 .into_iter()
201 .filter_map(ColOpLitOrInList::try_new)
202 .filter(|term| term.guarantee() == Guarantee::In)
203 .collect()
204 })
205 .collect();
206
207 // Early return if any termset is empty (can't infer guarantees)
208 if termsets.iter().any(|terms| terms.is_empty()) {
209 return builder;
210 }
211
212 // Find columns that appear in all termsets
213 let common_cols = find_common_columns(&termsets);
214 if common_cols.is_empty() {
215 return builder;
216 }
217
218 // Build guarantees for common columns
219 let mut builder = builder;
220 for col in common_cols {
221 let literals: Vec<_> = termsets
222 .iter()
223 .filter_map(|terms| {
224 terms.iter().find(|term| term.col() == col).map(
225 |term| {
226 term.lits().into_iter().map(|lit| lit.value())
227 },
228 )
229 })
230 .flatten()
231 .collect();
232
233 builder = builder.aggregate_multi_conjunct(
234 col,
235 Guarantee::In,
236 literals.into_iter(),
237 );
238 }
239
240 builder
241 }
242 }
243 })
244 .build()
245 }
246}
247
248impl Display for LiteralGuarantee {
249 fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
250 let mut sorted_literals: Vec<_> =
251 self.literals.iter().map(|lit| lit.to_string()).collect();
252 sorted_literals.sort();
253 match self.guarantee {
254 Guarantee::In => write!(
255 f,
256 "{} in ({})",
257 self.column.name,
258 sorted_literals.join(", ")
259 ),
260 Guarantee::NotIn => write!(
261 f,
262 "{} not in ({})",
263 self.column.name,
264 sorted_literals.join(", ")
265 ),
266 }
267 }
268}
269
270/// Combines conjuncts (aka terms `AND`ed together) into [`LiteralGuarantee`]s,
271/// preserving insert order
272#[derive(Debug, Default)]
273struct GuaranteeBuilder<'a> {
274 /// List of guarantees that have been created so far
275 /// if we have determined a subsequent conjunct invalidates a guarantee
276 /// e.g. `a = foo AND a = bar` then the relevant guarantee will be None
277 guarantees: Vec<Option<LiteralGuarantee>>,
278
279 /// Key is the (column name, guarantee type)
280 /// Value is the index into `guarantees`
281 map: HashMap<(&'a crate::expressions::Column, Guarantee), usize>,
282}
283
284impl<'a> GuaranteeBuilder<'a> {
285 fn new() -> Self {
286 Default::default()
287 }
288
289 /// Aggregate a new single `AND col <op> literal` term to this builder
290 /// combining with existing guarantees if possible.
291 ///
292 /// # Examples
293 /// * `AND (a = 1)`: `a` is guaranteed to be 1
294 /// * `AND (a != 1)`: a is guaranteed to not be 1
295 fn aggregate_conjunct(self, col_op_lit: ColOpLit<'a>) -> Self {
296 self.aggregate_multi_conjunct(
297 col_op_lit.col,
298 col_op_lit.guarantee,
299 [col_op_lit.lit.value()],
300 )
301 }
302
303 /// Aggregates a new single column, multi literal term to this builder
304 /// combining with previously known guarantees if possible.
305 ///
306 /// # Examples
307 /// For the following examples, we can guarantee the expression is `true` if:
308 /// * `AND (a = 1 OR a = 2 OR a = 3)`: a is in (1, 2, or 3)
309 /// * `AND (a IN (1,2,3))`: a is in (1, 2, or 3)
310 /// * `AND (a != 1 OR a != 2 OR a != 3)`: a is not in (1, 2, or 3)
311 /// * `AND (a NOT IN (1,2,3))`: a is not in (1, 2, or 3)
312 fn aggregate_multi_conjunct(
313 mut self,
314 col: &'a crate::expressions::Column,
315 guarantee: Guarantee,
316 new_values: impl IntoIterator<Item = &'a ScalarValue>,
317 ) -> Self {
318 let key = (col, guarantee);
319 if let Some(index) = self.map.get(&key) {
320 // already have a guarantee for this column
321 let entry = &mut self.guarantees[*index];
322
323 let Some(existing) = entry else {
324 // determined the previous guarantee for this column has been
325 // invalidated, nothing to do
326 return self;
327 };
328
329 // Combine conjuncts if we have `a != foo AND a != bar`. `a = foo
330 // AND a = bar` doesn't make logical sense so we don't optimize this
331 // case
332 match existing.guarantee {
333 // knew that the column could not be a set of values
334 //
335 // For example, if we previously had `a != 5` and now we see
336 // another `AND a != 6` we know that a must not be either 5 or 6
337 // for the expression to be true
338 Guarantee::NotIn => {
339 let new_values: HashSet<_> = new_values.into_iter().collect();
340 existing.literals.extend(new_values.into_iter().cloned());
341 }
342 Guarantee::In => {
343 let intersection = new_values
344 .into_iter()
345 .filter(|new_value| existing.literals.contains(*new_value))
346 .collect::<Vec<_>>();
347 // for an In guarantee, if the intersection is not empty, we can extend the guarantee
348 // e.g. `a IN (1,2,3) AND a IN (2,3,4)` is `a IN (2,3)`
349 // otherwise, we invalidate the guarantee
350 // e.g. `a IN (1,2,3) AND a IN (4,5,6)` is `a IN ()`, which is invalid
351 if !intersection.is_empty() {
352 existing.literals = intersection.into_iter().cloned().collect();
353 } else {
354 // at least one was not, so invalidate the guarantee
355 *entry = None;
356 }
357 }
358 }
359 } else {
360 // This is a new guarantee
361 let new_values: HashSet<_> = new_values.into_iter().collect();
362
363 let guarantee = LiteralGuarantee::new(col.name(), guarantee, new_values);
364 // add it to the list of guarantees
365 self.guarantees.push(Some(guarantee));
366 self.map.insert(key, self.guarantees.len() - 1);
367 }
368
369 self
370 }
371
372 /// Return all guarantees that have been created so far
373 fn build(self) -> Vec<LiteralGuarantee> {
374 // filter out any guarantees that have been invalidated
375 self.guarantees.into_iter().flatten().collect()
376 }
377}
378
379/// Represents a single `col [not]in literal` expression
380struct ColOpLit<'a> {
381 col: &'a crate::expressions::Column,
382 guarantee: Guarantee,
383 lit: &'a crate::expressions::Literal,
384}
385
386impl<'a> ColOpLit<'a> {
387 /// Returns Some(ColOpLit) if the expression is either:
388 /// 1. `col <op> literal`
389 /// 2. `literal <op> col`
390 /// 3. operator is `=` or `!=`
391 ///
392 /// Returns None otherwise
393 fn try_new(expr: &'a Arc<dyn PhysicalExpr>) -> Option<Self> {
394 let binary_expr = expr
395 .as_any()
396 .downcast_ref::<crate::expressions::BinaryExpr>()?;
397
398 let (left, op, right) = (
399 binary_expr.left().as_any(),
400 binary_expr.op(),
401 binary_expr.right().as_any(),
402 );
403 let guarantee = match op {
404 Operator::Eq => Guarantee::In,
405 Operator::NotEq => Guarantee::NotIn,
406 _ => return None,
407 };
408 // col <op> literal
409 if let (Some(col), Some(lit)) = (
410 left.downcast_ref::<crate::expressions::Column>(),
411 right.downcast_ref::<crate::expressions::Literal>(),
412 ) {
413 Some(Self {
414 col,
415 guarantee,
416 lit,
417 })
418 }
419 // literal <op> col
420 else if let (Some(lit), Some(col)) = (
421 left.downcast_ref::<crate::expressions::Literal>(),
422 right.downcast_ref::<crate::expressions::Column>(),
423 ) {
424 Some(Self {
425 col,
426 guarantee,
427 lit,
428 })
429 } else {
430 None
431 }
432 }
433}
434
435/// Represents a single `col [not]in literal` expression
436struct ColInList<'a> {
437 col: &'a crate::expressions::Column,
438 guarantee: Guarantee,
439 list: Vec<&'a crate::expressions::Literal>,
440}
441
442impl<'a> ColInList<'a> {
443 /// Returns Some(ColInList) if the expression is either:
444 /// 1. `col <op> (literal1, literal2, ...)`
445 /// 3. operator is `in` or `not in`
446 ///
447 /// Returns None otherwise
448 fn try_new(inlist: &'a crate::expressions::InListExpr) -> Option<Self> {
449 // Only support single-column inlist currently, multi-column inlist is not supported
450 let col = inlist
451 .expr()
452 .as_any()
453 .downcast_ref::<crate::expressions::Column>()?;
454
455 let literals = inlist
456 .list()
457 .iter()
458 .map(|e| e.as_any().downcast_ref::<crate::expressions::Literal>())
459 .collect::<Option<Vec<_>>>()?;
460
461 let guarantee = if inlist.negated() {
462 Guarantee::NotIn
463 } else {
464 Guarantee::In
465 };
466
467 Some(Self {
468 col,
469 guarantee,
470 list: literals,
471 })
472 }
473}
474
475/// Represents a single `col [not]in literal` expression or a single `col <op> literal` expression
476enum ColOpLitOrInList<'a> {
477 ColOpLit(ColOpLit<'a>),
478 ColInList(ColInList<'a>),
479}
480
481impl<'a> ColOpLitOrInList<'a> {
482 fn try_new(expr: &'a Arc<dyn PhysicalExpr>) -> Option<Self> {
483 match expr
484 .as_any()
485 .downcast_ref::<crate::expressions::InListExpr>()
486 {
487 Some(inlist) => Some(Self::ColInList(ColInList::try_new(inlist)?)),
488 None => ColOpLit::try_new(expr).map(Self::ColOpLit),
489 }
490 }
491
492 fn guarantee(&self) -> Guarantee {
493 match self {
494 Self::ColOpLit(col_op_lit) => col_op_lit.guarantee,
495 Self::ColInList(col_in_list) => col_in_list.guarantee,
496 }
497 }
498
499 fn col(&self) -> &'a crate::expressions::Column {
500 match self {
501 Self::ColOpLit(col_op_lit) => col_op_lit.col,
502 Self::ColInList(col_in_list) => col_in_list.col,
503 }
504 }
505
506 fn lits(&self) -> Vec<&'a crate::expressions::Literal> {
507 match self {
508 Self::ColOpLit(col_op_lit) => vec![col_op_lit.lit],
509 Self::ColInList(col_in_list) => col_in_list.list.clone(),
510 }
511 }
512}
513
514/// Find columns that appear in all termsets
515fn find_common_columns<'a>(
516 termsets: &[Vec<ColOpLitOrInList<'a>>],
517) -> Vec<&'a crate::expressions::Column> {
518 if termsets.is_empty() {
519 return Vec::new();
520 }
521
522 // Start with columns from the first termset
523 let mut common_cols: HashSet<_> = termsets[0].iter().map(|term| term.col()).collect();
524
525 // check if any common_col in one termset occur many times
526 // e.g. (a = 1 AND a = 2) OR (a = 2 AND b = 3), should not infer a guarantee
527 // TODO: for above case, we can infer a IN (2) AND b IN (3)
528 if common_cols.len() != termsets[0].len() {
529 return Vec::new();
530 }
531
532 // Intersect with columns from remaining termsets
533 for termset in termsets.iter().skip(1) {
534 let termset_cols: HashSet<_> = termset.iter().map(|term| term.col()).collect();
535 if termset_cols.len() != termset.len() {
536 return Vec::new();
537 }
538 common_cols = common_cols.intersection(&termset_cols).cloned().collect();
539 }
540
541 common_cols.into_iter().collect()
542}
543
544#[cfg(test)]
545mod test {
546 use std::sync::LazyLock;
547
548 use super::*;
549 use crate::planner::logical2physical;
550
551 use arrow::datatypes::{DataType, Field, Schema, SchemaRef};
552 use datafusion_expr::expr_fn::*;
553 use datafusion_expr::{lit, Expr};
554
555 use itertools::Itertools;
556
557 #[test]
558 fn test_literal() {
559 // a single literal offers no guarantee
560 test_analyze(lit(true), vec![])
561 }
562
563 #[test]
564 fn test_single() {
565 // a = "foo"
566 test_analyze(col("a").eq(lit("foo")), vec![in_guarantee("a", ["foo"])]);
567 // "foo" = a
568 test_analyze(lit("foo").eq(col("a")), vec![in_guarantee("a", ["foo"])]);
569 // a != "foo"
570 test_analyze(
571 col("a").not_eq(lit("foo")),
572 vec![not_in_guarantee("a", ["foo"])],
573 );
574 // "foo" != a
575 test_analyze(
576 lit("foo").not_eq(col("a")),
577 vec![not_in_guarantee("a", ["foo"])],
578 );
579 }
580
581 #[test]
582 fn test_conjunction_single_column() {
583 // b = 1 AND b = 2. This is impossible. Ideally this expression could be simplified to false
584 test_analyze(col("b").eq(lit(1)).and(col("b").eq(lit(2))), vec![]);
585 // b = 1 AND b != 2 . In theory, this could be simplified to `b = 1`.
586 test_analyze(
587 col("b").eq(lit(1)).and(col("b").not_eq(lit(2))),
588 vec![
589 // can only be true of b is 1 and b is not 2 (even though it is redundant)
590 in_guarantee("b", [1]),
591 not_in_guarantee("b", [2]),
592 ],
593 );
594 // b != 1 AND b = 2. In theory, this could be simplified to `b = 2`.
595 test_analyze(
596 col("b").not_eq(lit(1)).and(col("b").eq(lit(2))),
597 vec![
598 // can only be true of b is not 1 and b is 2 (even though it is redundant)
599 not_in_guarantee("b", [1]),
600 in_guarantee("b", [2]),
601 ],
602 );
603 // b != 1 AND b != 2
604 test_analyze(
605 col("b").not_eq(lit(1)).and(col("b").not_eq(lit(2))),
606 vec![not_in_guarantee("b", [1, 2])],
607 );
608 // b != 1 AND b != 2 and b != 3
609 test_analyze(
610 col("b")
611 .not_eq(lit(1))
612 .and(col("b").not_eq(lit(2)))
613 .and(col("b").not_eq(lit(3))),
614 vec![not_in_guarantee("b", [1, 2, 3])],
615 );
616 // b != 1 AND b = 2 and b != 3. Can only be true if b is 2 and b is not in (1, 3)
617 test_analyze(
618 col("b")
619 .not_eq(lit(1))
620 .and(col("b").eq(lit(2)))
621 .and(col("b").not_eq(lit(3))),
622 vec![not_in_guarantee("b", [1, 3]), in_guarantee("b", [2])],
623 );
624 // b != 1 AND b != 2 and b = 3 (in theory could determine b = 3)
625 test_analyze(
626 col("b")
627 .not_eq(lit(1))
628 .and(col("b").not_eq(lit(2)))
629 .and(col("b").eq(lit(3))),
630 vec![not_in_guarantee("b", [1, 2]), in_guarantee("b", [3])],
631 );
632 // b != 1 AND b != 2 and b > 3 (to be true, b can't be either 1 or 2
633 test_analyze(
634 col("b")
635 .not_eq(lit(1))
636 .and(col("b").not_eq(lit(2)))
637 .and(col("b").gt(lit(3))),
638 vec![not_in_guarantee("b", [1, 2])],
639 );
640 }
641
642 #[test]
643 fn test_conjunction_multi_column() {
644 // a = "foo" AND b = 1
645 test_analyze(
646 col("a").eq(lit("foo")).and(col("b").eq(lit(1))),
647 vec![
648 // should find both column guarantees
649 in_guarantee("a", ["foo"]),
650 in_guarantee("b", [1]),
651 ],
652 );
653 // a != "foo" AND b != 1
654 test_analyze(
655 col("a").not_eq(lit("foo")).and(col("b").not_eq(lit(1))),
656 // should find both column guarantees
657 vec![not_in_guarantee("a", ["foo"]), not_in_guarantee("b", [1])],
658 );
659 // a = "foo" AND a = "bar"
660 test_analyze(
661 col("a").eq(lit("foo")).and(col("a").eq(lit("bar"))),
662 // this predicate is impossible ( can't be both foo and bar),
663 vec![],
664 );
665 // a = "foo" AND b != "bar"
666 test_analyze(
667 col("a").eq(lit("foo")).and(col("a").not_eq(lit("bar"))),
668 vec![in_guarantee("a", ["foo"]), not_in_guarantee("a", ["bar"])],
669 );
670 // a != "foo" AND a != "bar"
671 test_analyze(
672 col("a").not_eq(lit("foo")).and(col("a").not_eq(lit("bar"))),
673 // know it isn't "foo" or "bar"
674 vec![not_in_guarantee("a", ["foo", "bar"])],
675 );
676 // a != "foo" AND a != "bar" and a != "baz"
677 test_analyze(
678 col("a")
679 .not_eq(lit("foo"))
680 .and(col("a").not_eq(lit("bar")))
681 .and(col("a").not_eq(lit("baz"))),
682 // know it isn't "foo" or "bar" or "baz"
683 vec![not_in_guarantee("a", ["foo", "bar", "baz"])],
684 );
685 // a = "foo" AND a = "foo"
686 let expr = col("a").eq(lit("foo"));
687 test_analyze(expr.clone().and(expr), vec![in_guarantee("a", ["foo"])]);
688 // b > 5 AND b = 10 (should get an b = 10 guarantee)
689 test_analyze(
690 col("b").gt(lit(5)).and(col("b").eq(lit(10))),
691 vec![in_guarantee("b", [10])],
692 );
693 // b > 10 AND b = 10 (this is impossible)
694 test_analyze(
695 col("b").gt(lit(10)).and(col("b").eq(lit(10))),
696 vec![
697 // if b isn't 10, it can not be true (though the expression actually can never be true)
698 in_guarantee("b", [10]),
699 ],
700 );
701 // a != "foo" and (a != "bar" OR a != "baz")
702 test_analyze(
703 col("a")
704 .not_eq(lit("foo"))
705 .and(col("a").not_eq(lit("bar")).or(col("a").not_eq(lit("baz")))),
706 // a is not foo (we can't represent other knowledge about a)
707 vec![not_in_guarantee("a", ["foo"])],
708 );
709 }
710
711 #[test]
712 fn test_conjunction_and_disjunction_single_column() {
713 // b != 1 AND (b > 2)
714 test_analyze(
715 col("b").not_eq(lit(1)).and(col("b").gt(lit(2))),
716 vec![
717 // for the expression to be true, b can not be one
718 not_in_guarantee("b", [1]),
719 ],
720 );
721
722 // b = 1 AND (b = 2 OR b = 3). Could be simplified to false.
723 test_analyze(
724 col("b")
725 .eq(lit(1))
726 .and(col("b").eq(lit(2)).or(col("b").eq(lit(3)))),
727 vec![
728 // in theory, b must be 1 and one of 2,3 for this expression to be true
729 // which is a logical contradiction
730 ],
731 );
732 }
733
734 #[test]
735 fn test_disjunction_single_column() {
736 // b = 1 OR b = 2
737 test_analyze(
738 col("b").eq(lit(1)).or(col("b").eq(lit(2))),
739 vec![in_guarantee("b", [1, 2])],
740 );
741 // b != 1 OR b = 2
742 test_analyze(col("b").not_eq(lit(1)).or(col("b").eq(lit(2))), vec![]);
743 // b = 1 OR b != 2
744 test_analyze(col("b").eq(lit(1)).or(col("b").not_eq(lit(2))), vec![]);
745 // b != 1 OR b != 2
746 test_analyze(col("b").not_eq(lit(1)).or(col("b").not_eq(lit(2))), vec![]);
747 // b != 1 OR b != 2 OR b = 3 -- in theory could guarantee that b = 3
748 test_analyze(
749 col("b")
750 .not_eq(lit(1))
751 .or(col("b").not_eq(lit(2)))
752 .or(lit("b").eq(lit(3))),
753 vec![],
754 );
755 // b = 1 OR b = 2 OR b = 3
756 test_analyze(
757 col("b")
758 .eq(lit(1))
759 .or(col("b").eq(lit(2)))
760 .or(col("b").eq(lit(3))),
761 vec![in_guarantee("b", [1, 2, 3])],
762 );
763 // b = 1 OR b = 2 OR b > 3 -- can't guarantee that the expression is only true if a is in (1, 2)
764 test_analyze(
765 col("b")
766 .eq(lit(1))
767 .or(col("b").eq(lit(2)))
768 .or(lit("b").eq(lit(3))),
769 vec![],
770 );
771 }
772
773 #[test]
774 fn test_disjunction_multi_column() {
775 // a = "foo" OR b = 1
776 test_analyze(
777 col("a").eq(lit("foo")).or(col("b").eq(lit(1))),
778 // no can't have a single column guarantee (if a = "foo" then b != 1) etc
779 vec![],
780 );
781 // a != "foo" OR b != 1
782 test_analyze(
783 col("a").not_eq(lit("foo")).or(col("b").not_eq(lit(1))),
784 // No single column guarantee
785 vec![],
786 );
787 // a = "foo" OR a = "bar"
788 test_analyze(
789 col("a").eq(lit("foo")).or(col("a").eq(lit("bar"))),
790 vec![in_guarantee("a", ["foo", "bar"])],
791 );
792 // a = "foo" OR a = "foo"
793 test_analyze(
794 col("a").eq(lit("foo")).or(col("a").eq(lit("foo"))),
795 vec![in_guarantee("a", ["foo"])],
796 );
797 // a != "foo" OR a != "bar"
798 test_analyze(
799 col("a").not_eq(lit("foo")).or(col("a").not_eq(lit("bar"))),
800 // can't represent knowledge about a in this case
801 vec![],
802 );
803 // a = "foo" OR a = "bar" OR a = "baz"
804 test_analyze(
805 col("a")
806 .eq(lit("foo"))
807 .or(col("a").eq(lit("bar")))
808 .or(col("a").eq(lit("baz"))),
809 vec![in_guarantee("a", ["foo", "bar", "baz"])],
810 );
811 // (a = "foo" OR a = "bar") AND (a = "baz)"
812 test_analyze(
813 (col("a").eq(lit("foo")).or(col("a").eq(lit("bar"))))
814 .and(col("a").eq(lit("baz"))),
815 // this could potentially be represented as 2 constraints with a more
816 // sophisticated analysis
817 vec![],
818 );
819 // (a = "foo" OR a = "bar") AND (b = 1)
820 test_analyze(
821 (col("a").eq(lit("foo")).or(col("a").eq(lit("bar"))))
822 .and(col("b").eq(lit(1))),
823 vec![in_guarantee("a", ["foo", "bar"]), in_guarantee("b", [1])],
824 );
825 // (a = "foo" OR a = "bar") OR (b = 1)
826 test_analyze(
827 col("a")
828 .eq(lit("foo"))
829 .or(col("a").eq(lit("bar")))
830 .or(col("b").eq(lit(1))),
831 // can't represent knowledge about a or b in this case
832 vec![],
833 );
834 }
835
836 #[test]
837 fn test_single_inlist() {
838 // b IN (1, 2, 3)
839 test_analyze(
840 col("b").in_list(vec![lit(1), lit(2), lit(3)], false),
841 vec![in_guarantee("b", [1, 2, 3])],
842 );
843 // b NOT IN (1, 2, 3)
844 test_analyze(
845 col("b").in_list(vec![lit(1), lit(2), lit(3)], true),
846 vec![not_in_guarantee("b", [1, 2, 3])],
847 );
848 // b IN (1,2,3,4...24)
849 test_analyze(
850 col("b").in_list((1..25).map(lit).collect_vec(), false),
851 vec![in_guarantee("b", 1..25)],
852 );
853 }
854
855 #[test]
856 fn test_inlist_conjunction() {
857 // b IN (1, 2, 3) AND b IN (2, 3, 4)
858 test_analyze(
859 col("b")
860 .in_list(vec![lit(1), lit(2), lit(3)], false)
861 .and(col("b").in_list(vec![lit(2), lit(3), lit(4)], false)),
862 vec![in_guarantee("b", [2, 3])],
863 );
864 // b NOT IN (1, 2, 3) AND b IN (2, 3, 4)
865 test_analyze(
866 col("b")
867 .in_list(vec![lit(1), lit(2), lit(3)], true)
868 .and(col("b").in_list(vec![lit(2), lit(3), lit(4)], false)),
869 vec![
870 not_in_guarantee("b", [1, 2, 3]),
871 in_guarantee("b", [2, 3, 4]),
872 ],
873 );
874 // b NOT IN (1, 2, 3) AND b NOT IN (2, 3, 4)
875 test_analyze(
876 col("b")
877 .in_list(vec![lit(1), lit(2), lit(3)], true)
878 .and(col("b").in_list(vec![lit(2), lit(3), lit(4)], true)),
879 vec![not_in_guarantee("b", [1, 2, 3, 4])],
880 );
881 // b IN (1, 2, 3) AND b = 4
882 test_analyze(
883 col("b")
884 .in_list(vec![lit(1), lit(2), lit(3)], false)
885 .and(col("b").eq(lit(4))),
886 vec![],
887 );
888 // b IN (1, 2, 3) AND b = 2
889 test_analyze(
890 col("b")
891 .in_list(vec![lit(1), lit(2), lit(3)], false)
892 .and(col("b").eq(lit(2))),
893 vec![in_guarantee("b", [2])],
894 );
895 // b IN (1, 2, 3) AND b != 2
896 test_analyze(
897 col("b")
898 .in_list(vec![lit(1), lit(2), lit(3)], false)
899 .and(col("b").not_eq(lit(2))),
900 vec![in_guarantee("b", [1, 2, 3]), not_in_guarantee("b", [2])],
901 );
902 // b NOT IN (1, 2, 3) AND b != 4
903 test_analyze(
904 col("b")
905 .in_list(vec![lit(1), lit(2), lit(3)], true)
906 .and(col("b").not_eq(lit(4))),
907 vec![not_in_guarantee("b", [1, 2, 3, 4])],
908 );
909 // b NOT IN (1, 2, 3) AND b != 2
910 test_analyze(
911 col("b")
912 .in_list(vec![lit(1), lit(2), lit(3)], true)
913 .and(col("b").not_eq(lit(2))),
914 vec![not_in_guarantee("b", [1, 2, 3])],
915 );
916 }
917
918 #[test]
919 fn test_inlist_with_disjunction() {
920 // b IN (1, 2, 3) AND (b = 3 OR b = 4)
921 test_analyze(
922 col("b")
923 .in_list(vec![lit(1), lit(2), lit(3)], false)
924 .and(col("b").eq(lit(3)).or(col("b").eq(lit(4)))),
925 vec![in_guarantee("b", [3])],
926 );
927 // b IN (1, 2, 3) AND (b = 4 OR b = 5)
928 test_analyze(
929 col("b")
930 .in_list(vec![lit(1), lit(2), lit(3)], false)
931 .and(col("b").eq(lit(4)).or(col("b").eq(lit(5)))),
932 vec![],
933 );
934 // b NOT IN (1, 2, 3) AND (b = 3 OR b = 4)
935 test_analyze(
936 col("b")
937 .in_list(vec![lit(1), lit(2), lit(3)], true)
938 .and(col("b").eq(lit(3)).or(col("b").eq(lit(4)))),
939 vec![not_in_guarantee("b", [1, 2, 3]), in_guarantee("b", [3, 4])],
940 );
941 // b IN (1, 2, 3) OR b = 2
942 test_analyze(
943 col("b")
944 .in_list(vec![lit(1), lit(2), lit(3)], false)
945 .or(col("b").eq(lit(2))),
946 vec![in_guarantee("b", [1, 2, 3])],
947 );
948 // b IN (1, 2, 3) OR b != 3
949 test_analyze(
950 col("b")
951 .in_list(vec![lit(1), lit(2), lit(3)], false)
952 .or(col("b").not_eq(lit(3))),
953 vec![],
954 );
955 }
956
957 #[test]
958 fn test_disjunction_and_conjunction_multi_column() {
959 // (a = "foo" AND b = 1) OR (a = "bar" AND b = 2)
960 test_analyze(
961 (col("a").eq(lit("foo")).and(col("b").eq(lit(1))))
962 .or(col("a").eq(lit("bar")).and(col("b").eq(lit(2)))),
963 vec![in_guarantee("a", ["foo", "bar"]), in_guarantee("b", [1, 2])],
964 );
965 // (a = "foo" AND b = 1) OR (a = "bar" AND b = 2) OR (b = 3)
966 test_analyze(
967 (col("a").eq(lit("foo")).and(col("b").eq(lit(1))))
968 .or(col("a").eq(lit("bar")).and(col("b").eq(lit(2))))
969 .or(col("b").eq(lit(3))),
970 vec![in_guarantee("b", [1, 2, 3])],
971 );
972 // (a = "foo" AND b = 1) OR (a = "bar" AND b = 2) OR (c = 3)
973 test_analyze(
974 (col("a").eq(lit("foo")).and(col("b").eq(lit(1))))
975 .or(col("a").eq(lit("bar")).and(col("b").eq(lit(2))))
976 .or(col("c").eq(lit(3))),
977 vec![],
978 );
979 // (a = "foo" AND b > 1) OR (a = "bar" AND b = 2)
980 test_analyze(
981 (col("a").eq(lit("foo")).and(col("b").gt(lit(1))))
982 .or(col("a").eq(lit("bar")).and(col("b").eq(lit(2)))),
983 vec![in_guarantee("a", ["foo", "bar"])],
984 );
985 // (a = "foo" AND b = 1) OR (b = 1 AND c = 2) OR (c = 3 AND a = "bar")
986 test_analyze(
987 (col("a").eq(lit("foo")).and(col("b").eq(lit(1))))
988 .or(col("b").eq(lit(1)).and(col("c").eq(lit(2))))
989 .or(col("c").eq(lit(3)).and(col("a").eq(lit("bar")))),
990 vec![],
991 );
992 // (a = "foo" AND a = "bar") OR (a = "good" AND b = 1)
993 // TODO: this should be `a IN ("good") AND b IN (1)`
994 test_analyze(
995 (col("a").eq(lit("foo")).and(col("a").eq(lit("bar"))))
996 .or(col("a").eq(lit("good")).and(col("b").eq(lit(1)))),
997 vec![],
998 );
999 // (a = "foo" AND a = "foo") OR (a = "good" AND b = 1)
1000 // TODO: this should be `a IN ("foo", "good")`
1001 test_analyze(
1002 (col("a").eq(lit("foo")).and(col("a").eq(lit("foo"))))
1003 .or(col("a").eq(lit("good")).and(col("b").eq(lit(1)))),
1004 vec![],
1005 );
1006 // (a = "foo" AND b = 3) OR (b = 4 AND b = 1) OR (b = 2 AND a = "bar")
1007 test_analyze(
1008 (col("a").eq(lit("foo")).and(col("b").eq(lit(3))))
1009 .or(col("b").eq(lit(4)).and(col("b").eq(lit(1))))
1010 .or(col("b").eq(lit(2)).and(col("a").eq(lit("bar")))),
1011 vec![],
1012 );
1013 // (b = 1 AND b > 3) OR (a = "foo" AND b = 4)
1014 test_analyze(
1015 (col("b").eq(lit(1)).and(col("b").gt(lit(3))))
1016 .or(col("a").eq(lit("foo")).and(col("b").eq(lit(4)))),
1017 // if b isn't 1 or 4, it can not be true (though the expression actually can never be true)
1018 vec![in_guarantee("b", [1, 4])],
1019 );
1020 // (a = "foo" AND b = 1) OR (a != "bar" AND b = 2)
1021 test_analyze(
1022 (col("a").eq(lit("foo")).and(col("b").eq(lit(1))))
1023 .or(col("a").not_eq(lit("bar")).and(col("b").eq(lit(2)))),
1024 vec![in_guarantee("b", [1, 2])],
1025 );
1026 // (a = "foo" AND b = 1) OR (a LIKE "%bar" AND b = 2)
1027 test_analyze(
1028 (col("a").eq(lit("foo")).and(col("b").eq(lit(1))))
1029 .or(col("a").like(lit("%bar")).and(col("b").eq(lit(2)))),
1030 vec![in_guarantee("b", [1, 2])],
1031 );
1032 // (a IN ("foo", "bar") AND b = 5) OR (a IN ("foo", "bar") AND b = 6)
1033 test_analyze(
1034 (col("a")
1035 .in_list(vec![lit("foo"), lit("bar")], false)
1036 .and(col("b").eq(lit(5))))
1037 .or(col("a")
1038 .in_list(vec![lit("foo"), lit("bar")], false)
1039 .and(col("b").eq(lit(6)))),
1040 vec![in_guarantee("a", ["foo", "bar"]), in_guarantee("b", [5, 6])],
1041 );
1042 // (a IN ("foo", "bar") AND b = 5) OR (a IN ("foo") AND b = 6)
1043 test_analyze(
1044 (col("a")
1045 .in_list(vec![lit("foo"), lit("bar")], false)
1046 .and(col("b").eq(lit(5))))
1047 .or(col("a")
1048 .in_list(vec![lit("foo")], false)
1049 .and(col("b").eq(lit(6)))),
1050 vec![in_guarantee("a", ["foo", "bar"]), in_guarantee("b", [5, 6])],
1051 );
1052 // (a NOT IN ("foo", "bar") AND b = 5) OR (a NOT IN ("foo") AND b = 6)
1053 test_analyze(
1054 (col("a")
1055 .in_list(vec![lit("foo"), lit("bar")], true)
1056 .and(col("b").eq(lit(5))))
1057 .or(col("a")
1058 .in_list(vec![lit("foo")], true)
1059 .and(col("b").eq(lit(6)))),
1060 vec![in_guarantee("b", [5, 6])],
1061 );
1062 }
1063
1064 /// Tests that analyzing expr results in the expected guarantees
1065 fn test_analyze(expr: Expr, expected: Vec<LiteralGuarantee>) {
1066 println!("Begin analyze of {expr}");
1067 let schema = schema();
1068 let physical_expr = logical2physical(&expr, &schema);
1069
1070 let actual = LiteralGuarantee::analyze(&physical_expr)
1071 .into_iter()
1072 .sorted_by_key(|g| g.column.name().to_string())
1073 .collect::<Vec<_>>();
1074 assert_eq!(
1075 expected, actual,
1076 "expr: {expr}\
1077 \n\nexpected: {expected:#?}\
1078 \n\nactual: {actual:#?}\
1079 \n\nexpr: {expr:#?}\
1080 \n\nphysical_expr: {physical_expr:#?}"
1081 );
1082 }
1083
1084 /// Guarantee that the expression is true if the column is one of the specified values
1085 fn in_guarantee<'a, I, S>(column: &str, literals: I) -> LiteralGuarantee
1086 where
1087 I: IntoIterator<Item = S>,
1088 S: Into<ScalarValue> + 'a,
1089 {
1090 let literals: Vec<_> = literals.into_iter().map(|s| s.into()).collect();
1091 LiteralGuarantee::new(column, Guarantee::In, literals.iter())
1092 }
1093
1094 /// Guarantee that the expression is true if the column is NOT any of the specified values
1095 fn not_in_guarantee<'a, I, S>(column: &str, literals: I) -> LiteralGuarantee
1096 where
1097 I: IntoIterator<Item = S>,
1098 S: Into<ScalarValue> + 'a,
1099 {
1100 let literals: Vec<_> = literals.into_iter().map(|s| s.into()).collect();
1101 LiteralGuarantee::new(column, Guarantee::NotIn, literals.iter())
1102 }
1103
1104 // Schema for testing
1105 fn schema() -> SchemaRef {
1106 static SCHEMA: LazyLock<SchemaRef> = LazyLock::new(|| {
1107 Arc::new(Schema::new(vec![
1108 Field::new("a", DataType::Utf8, false),
1109 Field::new("b", DataType::Int32, false),
1110 Field::new("c", DataType::Int32, false),
1111 ]))
1112 });
1113 Arc::clone(&SCHEMA)
1114 }
1115}