datafusion_common/
stats.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! This module provides data structures to represent statistics
19
20use std::fmt::{self, Debug, Display};
21
22use crate::{Result, ScalarValue};
23
24use crate::error::_plan_err;
25use arrow::datatypes::{DataType, Schema, SchemaRef};
26
/// Represents a value with a degree of certainty. `Precision` is used to
/// propagate information about the precision of statistical values.
///
/// The [`Default`] value is [`Precision::Absent`].
#[derive(Clone, PartialEq, Eq, Default, Copy)]
pub enum Precision<T: Debug + Clone + PartialEq + Eq + PartialOrd> {
    /// The exact value is known
    Exact(T),
    /// The value is not known exactly, but is likely close to this value
    Inexact(T),
    /// Nothing is known about the value
    #[default]
    Absent,
}
39
40impl<T: Debug + Clone + PartialEq + Eq + PartialOrd> Precision<T> {
41    /// If we have some value (exact or inexact), it returns that value.
42    /// Otherwise, it returns `None`.
43    pub fn get_value(&self) -> Option<&T> {
44        match self {
45            Precision::Exact(value) | Precision::Inexact(value) => Some(value),
46            Precision::Absent => None,
47        }
48    }
49
50    /// Transform the value in this [`Precision`] object, if one exists, using
51    /// the given function. Preserves the exactness state.
52    pub fn map<U, F>(self, f: F) -> Precision<U>
53    where
54        F: Fn(T) -> U,
55        U: Debug + Clone + PartialEq + Eq + PartialOrd,
56    {
57        match self {
58            Precision::Exact(val) => Precision::Exact(f(val)),
59            Precision::Inexact(val) => Precision::Inexact(f(val)),
60            _ => Precision::<U>::Absent,
61        }
62    }
63
64    /// Returns `Some(true)` if we have an exact value, `Some(false)` if we
65    /// have an inexact value, and `None` if there is no value.
66    pub fn is_exact(&self) -> Option<bool> {
67        match self {
68            Precision::Exact(_) => Some(true),
69            Precision::Inexact(_) => Some(false),
70            _ => None,
71        }
72    }
73
74    /// Returns the maximum of two (possibly inexact) values, conservatively
75    /// propagating exactness information. If one of the input values is
76    /// [`Precision::Absent`], the result is `Absent` too.
77    pub fn max(&self, other: &Precision<T>) -> Precision<T> {
78        match (self, other) {
79            (Precision::Exact(a), Precision::Exact(b)) => {
80                Precision::Exact(if a >= b { a.clone() } else { b.clone() })
81            }
82            (Precision::Inexact(a), Precision::Exact(b))
83            | (Precision::Exact(a), Precision::Inexact(b))
84            | (Precision::Inexact(a), Precision::Inexact(b)) => {
85                Precision::Inexact(if a >= b { a.clone() } else { b.clone() })
86            }
87            (_, _) => Precision::Absent,
88        }
89    }
90
91    /// Returns the minimum of two (possibly inexact) values, conservatively
92    /// propagating exactness information. If one of the input values is
93    /// [`Precision::Absent`], the result is `Absent` too.
94    pub fn min(&self, other: &Precision<T>) -> Precision<T> {
95        match (self, other) {
96            (Precision::Exact(a), Precision::Exact(b)) => {
97                Precision::Exact(if a >= b { b.clone() } else { a.clone() })
98            }
99            (Precision::Inexact(a), Precision::Exact(b))
100            | (Precision::Exact(a), Precision::Inexact(b))
101            | (Precision::Inexact(a), Precision::Inexact(b)) => {
102                Precision::Inexact(if a >= b { b.clone() } else { a.clone() })
103            }
104            (_, _) => Precision::Absent,
105        }
106    }
107
108    /// Demotes the precision state from exact to inexact (if present).
109    pub fn to_inexact(self) -> Self {
110        match self {
111            Precision::Exact(value) => Precision::Inexact(value),
112            _ => self,
113        }
114    }
115}
116
117impl Precision<usize> {
118    /// Calculates the sum of two (possibly inexact) [`usize`] values,
119    /// conservatively propagating exactness information. If one of the input
120    /// values is [`Precision::Absent`], the result is `Absent` too.
121    pub fn add(&self, other: &Precision<usize>) -> Precision<usize> {
122        match (self, other) {
123            (Precision::Exact(a), Precision::Exact(b)) => Precision::Exact(a + b),
124            (Precision::Inexact(a), Precision::Exact(b))
125            | (Precision::Exact(a), Precision::Inexact(b))
126            | (Precision::Inexact(a), Precision::Inexact(b)) => Precision::Inexact(a + b),
127            (_, _) => Precision::Absent,
128        }
129    }
130
131    /// Calculates the difference of two (possibly inexact) [`usize`] values,
132    /// conservatively propagating exactness information. If one of the input
133    /// values is [`Precision::Absent`], the result is `Absent` too.
134    pub fn sub(&self, other: &Precision<usize>) -> Precision<usize> {
135        match (self, other) {
136            (Precision::Exact(a), Precision::Exact(b)) => Precision::Exact(a - b),
137            (Precision::Inexact(a), Precision::Exact(b))
138            | (Precision::Exact(a), Precision::Inexact(b))
139            | (Precision::Inexact(a), Precision::Inexact(b)) => Precision::Inexact(a - b),
140            (_, _) => Precision::Absent,
141        }
142    }
143
144    /// Calculates the multiplication of two (possibly inexact) [`usize`] values,
145    /// conservatively propagating exactness information. If one of the input
146    /// values is [`Precision::Absent`], the result is `Absent` too.
147    pub fn multiply(&self, other: &Precision<usize>) -> Precision<usize> {
148        match (self, other) {
149            (Precision::Exact(a), Precision::Exact(b)) => Precision::Exact(a * b),
150            (Precision::Inexact(a), Precision::Exact(b))
151            | (Precision::Exact(a), Precision::Inexact(b))
152            | (Precision::Inexact(a), Precision::Inexact(b)) => Precision::Inexact(a * b),
153            (_, _) => Precision::Absent,
154        }
155    }
156
157    /// Return the estimate of applying a filter with estimated selectivity
158    /// `selectivity` to this Precision. A selectivity of `1.0` means that all
159    /// rows are selected. A selectivity of `0.5` means half the rows are
160    /// selected. Will always return inexact statistics.
161    pub fn with_estimated_selectivity(self, selectivity: f64) -> Self {
162        self.map(|v| ((v as f64 * selectivity).ceil()) as usize)
163            .to_inexact()
164    }
165}
166
167impl Precision<ScalarValue> {
168    /// Calculates the sum of two (possibly inexact) [`ScalarValue`] values,
169    /// conservatively propagating exactness information. If one of the input
170    /// values is [`Precision::Absent`], the result is `Absent` too.
171    pub fn add(&self, other: &Precision<ScalarValue>) -> Precision<ScalarValue> {
172        match (self, other) {
173            (Precision::Exact(a), Precision::Exact(b)) => {
174                a.add(b).map(Precision::Exact).unwrap_or(Precision::Absent)
175            }
176            (Precision::Inexact(a), Precision::Exact(b))
177            | (Precision::Exact(a), Precision::Inexact(b))
178            | (Precision::Inexact(a), Precision::Inexact(b)) => a
179                .add(b)
180                .map(Precision::Inexact)
181                .unwrap_or(Precision::Absent),
182            (_, _) => Precision::Absent,
183        }
184    }
185
186    /// Calculates the difference of two (possibly inexact) [`ScalarValue`] values,
187    /// conservatively propagating exactness information. If one of the input
188    /// values is [`Precision::Absent`], the result is `Absent` too.
189    pub fn sub(&self, other: &Precision<ScalarValue>) -> Precision<ScalarValue> {
190        match (self, other) {
191            (Precision::Exact(a), Precision::Exact(b)) => {
192                a.sub(b).map(Precision::Exact).unwrap_or(Precision::Absent)
193            }
194            (Precision::Inexact(a), Precision::Exact(b))
195            | (Precision::Exact(a), Precision::Inexact(b))
196            | (Precision::Inexact(a), Precision::Inexact(b)) => a
197                .sub(b)
198                .map(Precision::Inexact)
199                .unwrap_or(Precision::Absent),
200            (_, _) => Precision::Absent,
201        }
202    }
203
204    /// Calculates the multiplication of two (possibly inexact) [`ScalarValue`] values,
205    /// conservatively propagating exactness information. If one of the input
206    /// values is [`Precision::Absent`], the result is `Absent` too.
207    pub fn multiply(&self, other: &Precision<ScalarValue>) -> Precision<ScalarValue> {
208        match (self, other) {
209            (Precision::Exact(a), Precision::Exact(b)) => a
210                .mul_checked(b)
211                .map(Precision::Exact)
212                .unwrap_or(Precision::Absent),
213            (Precision::Inexact(a), Precision::Exact(b))
214            | (Precision::Exact(a), Precision::Inexact(b))
215            | (Precision::Inexact(a), Precision::Inexact(b)) => a
216                .mul_checked(b)
217                .map(Precision::Inexact)
218                .unwrap_or(Precision::Absent),
219            (_, _) => Precision::Absent,
220        }
221    }
222
223    /// Casts the value to the given data type, propagating exactness information.
224    pub fn cast_to(&self, data_type: &DataType) -> Result<Precision<ScalarValue>> {
225        match self {
226            Precision::Exact(value) => value.cast_to(data_type).map(Precision::Exact),
227            Precision::Inexact(value) => value.cast_to(data_type).map(Precision::Inexact),
228            Precision::Absent => Ok(Precision::Absent),
229        }
230    }
231}
232
233impl<T: Debug + Clone + PartialEq + Eq + PartialOrd> Debug for Precision<T> {
234    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
235        match self {
236            Precision::Exact(inner) => write!(f, "Exact({inner:?})"),
237            Precision::Inexact(inner) => write!(f, "Inexact({inner:?})"),
238            Precision::Absent => write!(f, "Absent"),
239        }
240    }
241}
242
243impl<T: Debug + Clone + PartialEq + Eq + PartialOrd> Display for Precision<T> {
244    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
245        match self {
246            Precision::Exact(inner) => write!(f, "Exact({inner:?})"),
247            Precision::Inexact(inner) => write!(f, "Inexact({inner:?})"),
248            Precision::Absent => write!(f, "Absent"),
249        }
250    }
251}
252
253impl From<Precision<usize>> for Precision<ScalarValue> {
254    fn from(value: Precision<usize>) -> Self {
255        match value {
256            Precision::Exact(v) => Precision::Exact(ScalarValue::UInt64(Some(v as u64))),
257            Precision::Inexact(v) => {
258                Precision::Inexact(ScalarValue::UInt64(Some(v as u64)))
259            }
260            Precision::Absent => Precision::Absent,
261        }
262    }
263}
264
/// Statistics for a relation.
///
/// Fields are optional and can be inexact because the sources
/// sometimes provide approximate estimates for performance reasons
/// and the output of transformations is not always predictable.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Statistics {
    /// The number of table rows.
    pub num_rows: Precision<usize>,
    /// Total bytes of the table rows.
    pub total_byte_size: Precision<usize>,
    /// Statistics on a column level.
    ///
    /// It must contain a [`ColumnStatistics`] for each field in the schema of
    /// the table to which the [`Statistics`] refer.
    pub column_statistics: Vec<ColumnStatistics>,
}
281
282impl Default for Statistics {
283    /// Returns a new [`Statistics`] instance with all fields set to unknown
284    /// and no columns.
285    fn default() -> Self {
286        Self {
287            num_rows: Precision::Absent,
288            total_byte_size: Precision::Absent,
289            column_statistics: vec![],
290        }
291    }
292}
293
294impl Statistics {
295    /// Returns a [`Statistics`] instance for the given schema by assigning
296    /// unknown statistics to each column in the schema.
297    pub fn new_unknown(schema: &Schema) -> Self {
298        Self {
299            num_rows: Precision::Absent,
300            total_byte_size: Precision::Absent,
301            column_statistics: Statistics::unknown_column(schema),
302        }
303    }
304
305    /// Returns an unbounded `ColumnStatistics` for each field in the schema.
306    pub fn unknown_column(schema: &Schema) -> Vec<ColumnStatistics> {
307        schema
308            .fields()
309            .iter()
310            .map(|_| ColumnStatistics::new_unknown())
311            .collect()
312    }
313
314    /// Set the number of rows
315    pub fn with_num_rows(mut self, num_rows: Precision<usize>) -> Self {
316        self.num_rows = num_rows;
317        self
318    }
319
320    /// Set the total size, in bytes
321    pub fn with_total_byte_size(mut self, total_byte_size: Precision<usize>) -> Self {
322        self.total_byte_size = total_byte_size;
323        self
324    }
325
326    /// Add a column to the column statistics
327    pub fn add_column_statistics(mut self, column_stats: ColumnStatistics) -> Self {
328        self.column_statistics.push(column_stats);
329        self
330    }
331
332    /// If the exactness of a [`Statistics`] instance is lost, this function relaxes
333    /// the exactness of all information by converting them [`Precision::Inexact`].
334    pub fn to_inexact(mut self) -> Self {
335        self.num_rows = self.num_rows.to_inexact();
336        self.total_byte_size = self.total_byte_size.to_inexact();
337        self.column_statistics = self
338            .column_statistics
339            .into_iter()
340            .map(|s| s.to_inexact())
341            .collect();
342        self
343    }
344
345    /// Project the statistics to the given column indices.
346    ///
347    /// For example, if we had statistics for columns `{"a", "b", "c"}`,
348    /// projecting to `vec![2, 1]` would return statistics for columns `{"c",
349    /// "b"}`.
350    pub fn project(mut self, projection: Option<&Vec<usize>>) -> Self {
351        let Some(projection) = projection else {
352            return self;
353        };
354
355        #[allow(clippy::large_enum_variant)]
356        enum Slot {
357            /// The column is taken and put into the specified statistics location
358            Taken(usize),
359            /// The original columns is present
360            Present(ColumnStatistics),
361        }
362
363        // Convert to Vec<Slot> so we can avoid copying the statistics
364        let mut columns: Vec<_> = std::mem::take(&mut self.column_statistics)
365            .into_iter()
366            .map(Slot::Present)
367            .collect();
368
369        for idx in projection {
370            let next_idx = self.column_statistics.len();
371            let slot = std::mem::replace(
372                columns.get_mut(*idx).expect("projection out of bounds"),
373                Slot::Taken(next_idx),
374            );
375            match slot {
376                // The column was there, so just move it
377                Slot::Present(col) => self.column_statistics.push(col),
378                // The column was taken, so copy from the previous location
379                Slot::Taken(prev_idx) => self
380                    .column_statistics
381                    .push(self.column_statistics[prev_idx].clone()),
382            }
383        }
384
385        self
386    }
387
388    /// Calculates the statistics after applying `fetch` and `skip` operations.
389    ///
390    /// Here, `self` denotes per-partition statistics. Use the `n_partitions`
391    /// parameter to compute global statistics in a multi-partition setting.
392    pub fn with_fetch(
393        mut self,
394        schema: SchemaRef,
395        fetch: Option<usize>,
396        skip: usize,
397        n_partitions: usize,
398    ) -> Result<Self> {
399        let fetch_val = fetch.unwrap_or(usize::MAX);
400
401        self.num_rows = match self {
402            Statistics {
403                num_rows: Precision::Exact(nr),
404                ..
405            }
406            | Statistics {
407                num_rows: Precision::Inexact(nr),
408                ..
409            } => {
410                // Here, the inexact case gives us an upper bound on the number of rows.
411                if nr <= skip {
412                    // All input data will be skipped:
413                    Precision::Exact(0)
414                } else if nr <= fetch_val && skip == 0 {
415                    // If the input does not reach the `fetch` globally, and `skip`
416                    // is zero (meaning the input and output are identical), return
417                    // input stats as is.
418                    // TODO: Can input stats still be used, but adjusted, when `skip`
419                    //       is non-zero?
420                    return Ok(self);
421                } else if nr - skip <= fetch_val {
422                    // After `skip` input rows are skipped, the remaining rows are
423                    // less than or equal to the `fetch` values, so `num_rows` must
424                    // equal the remaining rows.
425                    check_num_rows(
426                        (nr - skip).checked_mul(n_partitions),
427                        // We know that we have an estimate for the number of rows:
428                        self.num_rows.is_exact().unwrap(),
429                    )
430                } else {
431                    // At this point we know that we were given a `fetch` value
432                    // as the `None` case would go into the branch above. Since
433                    // the input has more rows than `fetch + skip`, the number
434                    // of rows will be the `fetch`, but we won't be able to
435                    // predict the other statistics.
436                    check_num_rows(
437                        fetch_val.checked_mul(n_partitions),
438                        // We know that we have an estimate for the number of rows:
439                        self.num_rows.is_exact().unwrap(),
440                    )
441                }
442            }
443            Statistics {
444                num_rows: Precision::Absent,
445                ..
446            } => check_num_rows(fetch.and_then(|v| v.checked_mul(n_partitions)), false),
447        };
448        self.column_statistics = Statistics::unknown_column(&schema);
449        self.total_byte_size = Precision::Absent;
450        Ok(self)
451    }
452
453    /// Summarize zero or more statistics into a single `Statistics` instance.
454    ///
455    /// The method assumes that all statistics are for the same schema.
456    /// If not, maybe you can call `SchemaMapper::map_column_statistics` to make them consistent.
457    ///
458    /// Returns an error if the statistics do not match the specified schemas.
459    pub fn try_merge_iter<'a, I>(items: I, schema: &Schema) -> Result<Statistics>
460    where
461        I: IntoIterator<Item = &'a Statistics>,
462    {
463        let mut items = items.into_iter();
464
465        let Some(init) = items.next() else {
466            return Ok(Statistics::new_unknown(schema));
467        };
468        items.try_fold(init.clone(), |acc: Statistics, item_stats: &Statistics| {
469            acc.try_merge(item_stats)
470        })
471    }
472
473    /// Merge this Statistics value with another Statistics value.
474    ///
475    /// Returns an error if the statistics do not match (different schemas).
476    ///
477    /// # Example
478    /// ```
479    /// # use datafusion_common::{ColumnStatistics, ScalarValue, Statistics};
480    /// # use arrow::datatypes::{Field, Schema, DataType};
481    /// # use datafusion_common::stats::Precision;
482    /// let stats1 = Statistics::default()
483    ///   .with_num_rows(Precision::Exact(1))
484    ///   .with_total_byte_size(Precision::Exact(2))
485    ///   .add_column_statistics(ColumnStatistics::new_unknown()
486    ///      .with_null_count(Precision::Exact(3))
487    ///      .with_min_value(Precision::Exact(ScalarValue::from(4)))
488    ///      .with_max_value(Precision::Exact(ScalarValue::from(5)))
489    ///   );
490    ///
491    /// let stats2 = Statistics::default()
492    ///   .with_num_rows(Precision::Exact(10))
493    ///   .with_total_byte_size(Precision::Inexact(20))
494    ///   .add_column_statistics(ColumnStatistics::new_unknown()
495    ///       // absent null count
496    ///      .with_min_value(Precision::Exact(ScalarValue::from(40)))
497    ///      .with_max_value(Precision::Exact(ScalarValue::from(50)))
498    ///   );
499    ///
500    /// let merged_stats = stats1.try_merge(&stats2).unwrap();
501    /// let expected_stats = Statistics::default()
502    ///   .with_num_rows(Precision::Exact(11))
503    ///   .with_total_byte_size(Precision::Inexact(22)) // inexact in stats2 --> inexact
504    ///   .add_column_statistics(
505    ///     ColumnStatistics::new_unknown()
506    ///       .with_null_count(Precision::Absent) // missing from stats2 --> absent
507    ///       .with_min_value(Precision::Exact(ScalarValue::from(4)))
508    ///       .with_max_value(Precision::Exact(ScalarValue::from(50)))
509    ///   );
510    ///
511    /// assert_eq!(merged_stats, expected_stats)
512    /// ```
513    pub fn try_merge(self, other: &Statistics) -> Result<Self> {
514        let Self {
515            mut num_rows,
516            mut total_byte_size,
517            mut column_statistics,
518        } = self;
519
520        // Accumulate statistics for subsequent items
521        num_rows = num_rows.add(&other.num_rows);
522        total_byte_size = total_byte_size.add(&other.total_byte_size);
523
524        if column_statistics.len() != other.column_statistics.len() {
525            return _plan_err!(
526                "Cannot merge statistics with different number of columns: {} vs {}",
527                column_statistics.len(),
528                other.column_statistics.len()
529            );
530        }
531
532        for (item_col_stats, col_stats) in other
533            .column_statistics
534            .iter()
535            .zip(column_statistics.iter_mut())
536        {
537            col_stats.null_count = col_stats.null_count.add(&item_col_stats.null_count);
538            col_stats.max_value = col_stats.max_value.max(&item_col_stats.max_value);
539            col_stats.min_value = col_stats.min_value.min(&item_col_stats.min_value);
540            col_stats.sum_value = col_stats.sum_value.add(&item_col_stats.sum_value);
541            col_stats.distinct_count = Precision::Absent;
542        }
543
544        Ok(Statistics {
545            num_rows,
546            total_byte_size,
547            column_statistics,
548        })
549    }
550}
551
552/// Creates an estimate of the number of rows in the output using the given
553/// optional value and exactness flag.
554fn check_num_rows(value: Option<usize>, is_exact: bool) -> Precision<usize> {
555    if let Some(value) = value {
556        if is_exact {
557            Precision::Exact(value)
558        } else {
559            // If the input stats are inexact, so are the output stats.
560            Precision::Inexact(value)
561        }
562    } else {
563        // If the estimate is not available (e.g. due to an overflow), we can
564        // not produce a reliable estimate.
565        Precision::Absent
566    }
567}
568
569impl Display for Statistics {
570    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
571        // string of column statistics
572        let column_stats = self
573            .column_statistics
574            .iter()
575            .enumerate()
576            .map(|(i, cs)| {
577                let s = format!("(Col[{i}]:");
578                let s = if cs.min_value != Precision::Absent {
579                    format!("{} Min={}", s, cs.min_value)
580                } else {
581                    s
582                };
583                let s = if cs.max_value != Precision::Absent {
584                    format!("{} Max={}", s, cs.max_value)
585                } else {
586                    s
587                };
588                let s = if cs.sum_value != Precision::Absent {
589                    format!("{} Sum={}", s, cs.sum_value)
590                } else {
591                    s
592                };
593                let s = if cs.null_count != Precision::Absent {
594                    format!("{} Null={}", s, cs.null_count)
595                } else {
596                    s
597                };
598                let s = if cs.distinct_count != Precision::Absent {
599                    format!("{} Distinct={}", s, cs.distinct_count)
600                } else {
601                    s
602                };
603
604                s + ")"
605            })
606            .collect::<Vec<_>>()
607            .join(",");
608
609        write!(
610            f,
611            "Rows={}, Bytes={}, [{}]",
612            self.num_rows, self.total_byte_size, column_stats
613        )?;
614
615        Ok(())
616    }
617}
618
/// Statistics for a column within a relation.
///
/// Every field is a [`Precision`], so each statistic can independently be
/// exact, inexact or absent. The derived [`Default`] has every field absent.
#[derive(Clone, Debug, PartialEq, Eq, Default)]
pub struct ColumnStatistics {
    /// Number of null values on column
    pub null_count: Precision<usize>,
    /// Maximum value of column
    pub max_value: Precision<ScalarValue>,
    /// Minimum value of column
    pub min_value: Precision<ScalarValue>,
    /// Sum value of a column
    pub sum_value: Precision<ScalarValue>,
    /// Number of distinct values
    pub distinct_count: Precision<usize>,
}
633
634impl ColumnStatistics {
635    /// Column contains a single non null value (e.g constant).
636    pub fn is_singleton(&self) -> bool {
637        match (&self.min_value, &self.max_value) {
638            // Min and max values are the same and not infinity.
639            (Precision::Exact(min), Precision::Exact(max)) => {
640                !min.is_null() && !max.is_null() && (min == max)
641            }
642            (_, _) => false,
643        }
644    }
645
646    /// Returns a [`ColumnStatistics`] instance having all [`Precision::Absent`] parameters.
647    pub fn new_unknown() -> Self {
648        Self {
649            null_count: Precision::Absent,
650            max_value: Precision::Absent,
651            min_value: Precision::Absent,
652            sum_value: Precision::Absent,
653            distinct_count: Precision::Absent,
654        }
655    }
656
657    /// Set the null count
658    pub fn with_null_count(mut self, null_count: Precision<usize>) -> Self {
659        self.null_count = null_count;
660        self
661    }
662
663    /// Set the max value
664    pub fn with_max_value(mut self, max_value: Precision<ScalarValue>) -> Self {
665        self.max_value = max_value;
666        self
667    }
668
669    /// Set the min value
670    pub fn with_min_value(mut self, min_value: Precision<ScalarValue>) -> Self {
671        self.min_value = min_value;
672        self
673    }
674
675    /// Set the sum value
676    pub fn with_sum_value(mut self, sum_value: Precision<ScalarValue>) -> Self {
677        self.sum_value = sum_value;
678        self
679    }
680
681    /// Set the distinct count
682    pub fn with_distinct_count(mut self, distinct_count: Precision<usize>) -> Self {
683        self.distinct_count = distinct_count;
684        self
685    }
686
687    /// If the exactness of a [`ColumnStatistics`] instance is lost, this
688    /// function relaxes the exactness of all information by converting them
689    /// [`Precision::Inexact`].
690    pub fn to_inexact(mut self) -> Self {
691        self.null_count = self.null_count.to_inexact();
692        self.max_value = self.max_value.to_inexact();
693        self.min_value = self.min_value.to_inexact();
694        self.sum_value = self.sum_value.to_inexact();
695        self.distinct_count = self.distinct_count.to_inexact();
696        self
697    }
698}
699
700#[cfg(test)]
701mod tests {
702    use super::*;
703    use crate::assert_contains;
704    use arrow::datatypes::Field;
705    use std::sync::Arc;
706
707    #[test]
708    fn test_get_value() {
709        let exact_precision = Precision::Exact(42);
710        let inexact_precision = Precision::Inexact(23);
711        let absent_precision = Precision::<i32>::Absent;
712
713        assert_eq!(*exact_precision.get_value().unwrap(), 42);
714        assert_eq!(*inexact_precision.get_value().unwrap(), 23);
715        assert_eq!(absent_precision.get_value(), None);
716    }
717
718    #[test]
719    fn test_map() {
720        let exact_precision = Precision::Exact(42);
721        let inexact_precision = Precision::Inexact(23);
722        let absent_precision = Precision::Absent;
723
724        let squared = |x| x * x;
725
726        assert_eq!(exact_precision.map(squared), Precision::Exact(1764));
727        assert_eq!(inexact_precision.map(squared), Precision::Inexact(529));
728        assert_eq!(absent_precision.map(squared), Precision::Absent);
729    }
730
731    #[test]
732    fn test_is_exact() {
733        let exact_precision = Precision::Exact(42);
734        let inexact_precision = Precision::Inexact(23);
735        let absent_precision = Precision::<i32>::Absent;
736
737        assert_eq!(exact_precision.is_exact(), Some(true));
738        assert_eq!(inexact_precision.is_exact(), Some(false));
739        assert_eq!(absent_precision.is_exact(), None);
740    }
741
742    #[test]
743    fn test_max() {
744        let precision1 = Precision::Exact(42);
745        let precision2 = Precision::Inexact(23);
746        let precision3 = Precision::Exact(30);
747        let absent_precision = Precision::Absent;
748
749        assert_eq!(precision1.max(&precision2), Precision::Inexact(42));
750        assert_eq!(precision1.max(&precision3), Precision::Exact(42));
751        assert_eq!(precision2.max(&precision3), Precision::Inexact(30));
752        assert_eq!(precision1.max(&absent_precision), Precision::Absent);
753    }
754
755    #[test]
756    fn test_min() {
757        let precision1 = Precision::Exact(42);
758        let precision2 = Precision::Inexact(23);
759        let precision3 = Precision::Exact(30);
760        let absent_precision = Precision::Absent;
761
762        assert_eq!(precision1.min(&precision2), Precision::Inexact(23));
763        assert_eq!(precision1.min(&precision3), Precision::Exact(30));
764        assert_eq!(precision2.min(&precision3), Precision::Inexact(23));
765        assert_eq!(precision1.min(&absent_precision), Precision::Absent);
766    }
767
768    #[test]
769    fn test_to_inexact() {
770        let exact_precision = Precision::Exact(42);
771        let inexact_precision = Precision::Inexact(42);
772        let absent_precision = Precision::<i32>::Absent;
773
774        assert_eq!(exact_precision.to_inexact(), inexact_precision);
775        assert_eq!(inexact_precision.to_inexact(), inexact_precision);
776        assert_eq!(absent_precision.to_inexact(), absent_precision);
777    }
778
779    #[test]
780    fn test_add() {
781        let precision1 = Precision::Exact(42);
782        let precision2 = Precision::Inexact(23);
783        let precision3 = Precision::Exact(30);
784        let absent_precision = Precision::Absent;
785
786        assert_eq!(precision1.add(&precision2), Precision::Inexact(65));
787        assert_eq!(precision1.add(&precision3), Precision::Exact(72));
788        assert_eq!(precision2.add(&precision3), Precision::Inexact(53));
789        assert_eq!(precision1.add(&absent_precision), Precision::Absent);
790    }
791
792    #[test]
793    fn test_add_scalar() {
794        let precision = Precision::Exact(ScalarValue::Int32(Some(42)));
795
796        assert_eq!(
797            precision.add(&Precision::Exact(ScalarValue::Int32(Some(23)))),
798            Precision::Exact(ScalarValue::Int32(Some(65))),
799        );
800        assert_eq!(
801            precision.add(&Precision::Inexact(ScalarValue::Int32(Some(23)))),
802            Precision::Inexact(ScalarValue::Int32(Some(65))),
803        );
804        assert_eq!(
805            precision.add(&Precision::Exact(ScalarValue::Int32(None))),
806            // As per behavior of ScalarValue::add
807            Precision::Exact(ScalarValue::Int32(None)),
808        );
809        assert_eq!(precision.add(&Precision::Absent), Precision::Absent);
810    }
811
812    #[test]
813    fn test_sub() {
814        let precision1 = Precision::Exact(42);
815        let precision2 = Precision::Inexact(23);
816        let precision3 = Precision::Exact(30);
817        let absent_precision = Precision::Absent;
818
819        assert_eq!(precision1.sub(&precision2), Precision::Inexact(19));
820        assert_eq!(precision1.sub(&precision3), Precision::Exact(12));
821        assert_eq!(precision1.sub(&absent_precision), Precision::Absent);
822    }
823
824    #[test]
825    fn test_sub_scalar() {
826        let precision = Precision::Exact(ScalarValue::Int32(Some(42)));
827
828        assert_eq!(
829            precision.sub(&Precision::Exact(ScalarValue::Int32(Some(23)))),
830            Precision::Exact(ScalarValue::Int32(Some(19))),
831        );
832        assert_eq!(
833            precision.sub(&Precision::Inexact(ScalarValue::Int32(Some(23)))),
834            Precision::Inexact(ScalarValue::Int32(Some(19))),
835        );
836        assert_eq!(
837            precision.sub(&Precision::Exact(ScalarValue::Int32(None))),
838            // As per behavior of ScalarValue::sub
839            Precision::Exact(ScalarValue::Int32(None)),
840        );
841        assert_eq!(precision.sub(&Precision::Absent), Precision::Absent);
842    }
843
844    #[test]
845    fn test_multiply() {
846        let precision1 = Precision::Exact(6);
847        let precision2 = Precision::Inexact(3);
848        let precision3 = Precision::Exact(5);
849        let absent_precision = Precision::Absent;
850
851        assert_eq!(precision1.multiply(&precision2), Precision::Inexact(18));
852        assert_eq!(precision1.multiply(&precision3), Precision::Exact(30));
853        assert_eq!(precision2.multiply(&precision3), Precision::Inexact(15));
854        assert_eq!(precision1.multiply(&absent_precision), Precision::Absent);
855    }
856
857    #[test]
858    fn test_multiply_scalar() {
859        let precision = Precision::Exact(ScalarValue::Int32(Some(6)));
860
861        assert_eq!(
862            precision.multiply(&Precision::Exact(ScalarValue::Int32(Some(5)))),
863            Precision::Exact(ScalarValue::Int32(Some(30))),
864        );
865        assert_eq!(
866            precision.multiply(&Precision::Inexact(ScalarValue::Int32(Some(5)))),
867            Precision::Inexact(ScalarValue::Int32(Some(30))),
868        );
869        assert_eq!(
870            precision.multiply(&Precision::Exact(ScalarValue::Int32(None))),
871            // As per behavior of ScalarValue::mul_checked
872            Precision::Exact(ScalarValue::Int32(None)),
873        );
874        assert_eq!(precision.multiply(&Precision::Absent), Precision::Absent);
875    }
876
877    #[test]
878    fn test_cast_to() {
879        // Valid
880        assert_eq!(
881            Precision::Exact(ScalarValue::Int32(Some(42)))
882                .cast_to(&DataType::Int64)
883                .unwrap(),
884            Precision::Exact(ScalarValue::Int64(Some(42))),
885        );
886        assert_eq!(
887            Precision::Inexact(ScalarValue::Int32(Some(42)))
888                .cast_to(&DataType::Int64)
889                .unwrap(),
890            Precision::Inexact(ScalarValue::Int64(Some(42))),
891        );
892        // Null
893        assert_eq!(
894            Precision::Exact(ScalarValue::Int32(None))
895                .cast_to(&DataType::Int64)
896                .unwrap(),
897            Precision::Exact(ScalarValue::Int64(None)),
898        );
899        // Overflow returns error
900        assert!(Precision::Exact(ScalarValue::Int32(Some(256)))
901            .cast_to(&DataType::Int8)
902            .is_err());
903    }
904
905    #[test]
906    fn test_precision_cloning() {
907        // Precision<usize> is copy
908        let precision: Precision<usize> = Precision::Exact(42);
909        let p2 = precision;
910        assert_eq!(precision, p2);
911
912        // Precision<ScalarValue> is not copy (requires .clone())
913        let precision: Precision<ScalarValue> =
914            Precision::Exact(ScalarValue::Int64(Some(42)));
915        // Clippy would complain about this if it were Copy
916        #[allow(clippy::redundant_clone)]
917        let p2 = precision.clone();
918        assert_eq!(precision, p2);
919    }
920
921    #[test]
922    fn test_project_none() {
923        let projection = None;
924        let stats = make_stats(vec![10, 20, 30]).project(projection.as_ref());
925        assert_eq!(stats, make_stats(vec![10, 20, 30]));
926    }
927
928    #[test]
929    fn test_project_empty() {
930        let projection = Some(vec![]);
931        let stats = make_stats(vec![10, 20, 30]).project(projection.as_ref());
932        assert_eq!(stats, make_stats(vec![]));
933    }
934
935    #[test]
936    fn test_project_swap() {
937        let projection = Some(vec![2, 1]);
938        let stats = make_stats(vec![10, 20, 30]).project(projection.as_ref());
939        assert_eq!(stats, make_stats(vec![30, 20]));
940    }
941
942    #[test]
943    fn test_project_repeated() {
944        let projection = Some(vec![1, 2, 1, 1, 0, 2]);
945        let stats = make_stats(vec![10, 20, 30]).project(projection.as_ref());
946        assert_eq!(stats, make_stats(vec![20, 30, 20, 20, 10, 30]));
947    }
948
949    // Make a Statistics structure with the specified null counts for each column
950    fn make_stats(counts: impl IntoIterator<Item = usize>) -> Statistics {
951        Statistics {
952            num_rows: Precision::Exact(42),
953            total_byte_size: Precision::Exact(500),
954            column_statistics: counts.into_iter().map(col_stats_i64).collect(),
955        }
956    }
957
958    fn col_stats_i64(null_count: usize) -> ColumnStatistics {
959        ColumnStatistics {
960            null_count: Precision::Exact(null_count),
961            max_value: Precision::Exact(ScalarValue::Int64(Some(42))),
962            min_value: Precision::Exact(ScalarValue::Int64(Some(64))),
963            sum_value: Precision::Exact(ScalarValue::Int64(Some(4600))),
964            distinct_count: Precision::Exact(100),
965        }
966    }
967
968    #[test]
969    fn test_try_merge_basic() {
970        // Create a schema with two columns
971        let schema = Arc::new(Schema::new(vec![
972            Field::new("col1", DataType::Int32, false),
973            Field::new("col2", DataType::Int32, false),
974        ]));
975
976        // Create items with statistics
977        let stats1 = Statistics {
978            num_rows: Precision::Exact(10),
979            total_byte_size: Precision::Exact(100),
980            column_statistics: vec![
981                ColumnStatistics {
982                    null_count: Precision::Exact(1),
983                    max_value: Precision::Exact(ScalarValue::Int32(Some(100))),
984                    min_value: Precision::Exact(ScalarValue::Int32(Some(1))),
985                    sum_value: Precision::Exact(ScalarValue::Int32(Some(500))),
986                    distinct_count: Precision::Absent,
987                },
988                ColumnStatistics {
989                    null_count: Precision::Exact(2),
990                    max_value: Precision::Exact(ScalarValue::Int32(Some(200))),
991                    min_value: Precision::Exact(ScalarValue::Int32(Some(10))),
992                    sum_value: Precision::Exact(ScalarValue::Int32(Some(1000))),
993                    distinct_count: Precision::Absent,
994                },
995            ],
996        };
997
998        let stats2 = Statistics {
999            num_rows: Precision::Exact(15),
1000            total_byte_size: Precision::Exact(150),
1001            column_statistics: vec![
1002                ColumnStatistics {
1003                    null_count: Precision::Exact(2),
1004                    max_value: Precision::Exact(ScalarValue::Int32(Some(120))),
1005                    min_value: Precision::Exact(ScalarValue::Int32(Some(-10))),
1006                    sum_value: Precision::Exact(ScalarValue::Int32(Some(600))),
1007                    distinct_count: Precision::Absent,
1008                },
1009                ColumnStatistics {
1010                    null_count: Precision::Exact(3),
1011                    max_value: Precision::Exact(ScalarValue::Int32(Some(180))),
1012                    min_value: Precision::Exact(ScalarValue::Int32(Some(5))),
1013                    sum_value: Precision::Exact(ScalarValue::Int32(Some(1200))),
1014                    distinct_count: Precision::Absent,
1015                },
1016            ],
1017        };
1018
1019        let items = vec![stats1, stats2];
1020
1021        let summary_stats = Statistics::try_merge_iter(&items, &schema).unwrap();
1022
1023        // Verify the results
1024        assert_eq!(summary_stats.num_rows, Precision::Exact(25)); // 10 + 15
1025        assert_eq!(summary_stats.total_byte_size, Precision::Exact(250)); // 100 + 150
1026
1027        // Verify column statistics
1028        let col1_stats = &summary_stats.column_statistics[0];
1029        assert_eq!(col1_stats.null_count, Precision::Exact(3)); // 1 + 2
1030        assert_eq!(
1031            col1_stats.max_value,
1032            Precision::Exact(ScalarValue::Int32(Some(120)))
1033        );
1034        assert_eq!(
1035            col1_stats.min_value,
1036            Precision::Exact(ScalarValue::Int32(Some(-10)))
1037        );
1038        assert_eq!(
1039            col1_stats.sum_value,
1040            Precision::Exact(ScalarValue::Int32(Some(1100)))
1041        ); // 500 + 600
1042
1043        let col2_stats = &summary_stats.column_statistics[1];
1044        assert_eq!(col2_stats.null_count, Precision::Exact(5)); // 2 + 3
1045        assert_eq!(
1046            col2_stats.max_value,
1047            Precision::Exact(ScalarValue::Int32(Some(200)))
1048        );
1049        assert_eq!(
1050            col2_stats.min_value,
1051            Precision::Exact(ScalarValue::Int32(Some(5)))
1052        );
1053        assert_eq!(
1054            col2_stats.sum_value,
1055            Precision::Exact(ScalarValue::Int32(Some(2200)))
1056        ); // 1000 + 1200
1057    }
1058
1059    #[test]
1060    fn test_try_merge_mixed_precision() {
1061        // Create a schema with one column
1062        let schema = Arc::new(Schema::new(vec![Field::new(
1063            "col1",
1064            DataType::Int32,
1065            false,
1066        )]));
1067
1068        // Create items with different precision levels
1069        let stats1 = Statistics {
1070            num_rows: Precision::Exact(10),
1071            total_byte_size: Precision::Inexact(100),
1072            column_statistics: vec![ColumnStatistics {
1073                null_count: Precision::Exact(1),
1074                max_value: Precision::Exact(ScalarValue::Int32(Some(100))),
1075                min_value: Precision::Inexact(ScalarValue::Int32(Some(1))),
1076                sum_value: Precision::Exact(ScalarValue::Int32(Some(500))),
1077                distinct_count: Precision::Absent,
1078            }],
1079        };
1080
1081        let stats2 = Statistics {
1082            num_rows: Precision::Inexact(15),
1083            total_byte_size: Precision::Exact(150),
1084            column_statistics: vec![ColumnStatistics {
1085                null_count: Precision::Inexact(2),
1086                max_value: Precision::Inexact(ScalarValue::Int32(Some(120))),
1087                min_value: Precision::Exact(ScalarValue::Int32(Some(-10))),
1088                sum_value: Precision::Absent,
1089                distinct_count: Precision::Absent,
1090            }],
1091        };
1092
1093        let items = vec![stats1, stats2];
1094
1095        let summary_stats = Statistics::try_merge_iter(&items, &schema).unwrap();
1096
1097        assert_eq!(summary_stats.num_rows, Precision::Inexact(25));
1098        assert_eq!(summary_stats.total_byte_size, Precision::Inexact(250));
1099
1100        let col_stats = &summary_stats.column_statistics[0];
1101        assert_eq!(col_stats.null_count, Precision::Inexact(3));
1102        assert_eq!(
1103            col_stats.max_value,
1104            Precision::Inexact(ScalarValue::Int32(Some(120)))
1105        );
1106        assert_eq!(
1107            col_stats.min_value,
1108            Precision::Inexact(ScalarValue::Int32(Some(-10)))
1109        );
1110        assert!(matches!(col_stats.sum_value, Precision::Absent));
1111    }
1112
1113    #[test]
1114    fn test_try_merge_empty() {
1115        let schema = Arc::new(Schema::new(vec![Field::new(
1116            "col1",
1117            DataType::Int32,
1118            false,
1119        )]));
1120
1121        // Empty collection
1122        let items: Vec<Statistics> = vec![];
1123
1124        let summary_stats = Statistics::try_merge_iter(&items, &schema).unwrap();
1125
1126        // Verify default values for empty collection
1127        assert_eq!(summary_stats.num_rows, Precision::Absent);
1128        assert_eq!(summary_stats.total_byte_size, Precision::Absent);
1129        assert_eq!(summary_stats.column_statistics.len(), 1);
1130        assert_eq!(
1131            summary_stats.column_statistics[0].null_count,
1132            Precision::Absent
1133        );
1134    }
1135
1136    #[test]
1137    fn test_try_merge_mismatched_size() {
1138        // Create a schema with one column
1139        let schema = Arc::new(Schema::new(vec![Field::new(
1140            "col1",
1141            DataType::Int32,
1142            false,
1143        )]));
1144
1145        // No column statistics
1146        let stats1 = Statistics::default();
1147
1148        let stats2 =
1149            Statistics::default().add_column_statistics(ColumnStatistics::new_unknown());
1150
1151        let items = vec![stats1, stats2];
1152
1153        let e = Statistics::try_merge_iter(&items, &schema).unwrap_err();
1154        assert_contains!(e.to_string(), "Error during planning: Cannot merge statistics with different number of columns: 0 vs 1");
1155    }
1156
1157    #[test]
1158    fn test_try_merge_distinct_count_absent() {
1159        // Create statistics with known distinct counts
1160        let stats1 = Statistics::default()
1161            .with_num_rows(Precision::Exact(10))
1162            .with_total_byte_size(Precision::Exact(100))
1163            .add_column_statistics(
1164                ColumnStatistics::new_unknown()
1165                    .with_null_count(Precision::Exact(0))
1166                    .with_min_value(Precision::Exact(ScalarValue::Int32(Some(1))))
1167                    .with_max_value(Precision::Exact(ScalarValue::Int32(Some(10))))
1168                    .with_distinct_count(Precision::Exact(5)),
1169            );
1170
1171        let stats2 = Statistics::default()
1172            .with_num_rows(Precision::Exact(15))
1173            .with_total_byte_size(Precision::Exact(150))
1174            .add_column_statistics(
1175                ColumnStatistics::new_unknown()
1176                    .with_null_count(Precision::Exact(0))
1177                    .with_min_value(Precision::Exact(ScalarValue::Int32(Some(5))))
1178                    .with_max_value(Precision::Exact(ScalarValue::Int32(Some(20))))
1179                    .with_distinct_count(Precision::Exact(7)),
1180            );
1181
1182        // Merge statistics
1183        let merged_stats = stats1.try_merge(&stats2).unwrap();
1184
1185        // Verify the results
1186        assert_eq!(merged_stats.num_rows, Precision::Exact(25));
1187        assert_eq!(merged_stats.total_byte_size, Precision::Exact(250));
1188
1189        let col_stats = &merged_stats.column_statistics[0];
1190        assert_eq!(col_stats.null_count, Precision::Exact(0));
1191        assert_eq!(
1192            col_stats.min_value,
1193            Precision::Exact(ScalarValue::Int32(Some(1)))
1194        );
1195        assert_eq!(
1196            col_stats.max_value,
1197            Precision::Exact(ScalarValue::Int32(Some(20)))
1198        );
1199        // Distinct count should be Absent after merge
1200        assert_eq!(col_stats.distinct_count, Precision::Absent);
1201    }
1202}