datafusion_common/
stats.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! This module provides data structures to represent statistics
19
20use std::fmt::{self, Debug, Display};
21
22use crate::{Result, ScalarValue};
23
24use crate::error::_plan_err;
25use arrow::datatypes::{DataType, Schema, SchemaRef};
26
/// Represents a value with a degree of certainty. `Precision` is used to
/// propagate information about the precision of statistical values.
///
/// The default value is [`Precision::Absent`].
#[derive(Clone, PartialEq, Eq, Default, Copy)]
pub enum Precision<T: Debug + Clone + PartialEq + Eq + PartialOrd> {
    /// The exact value is known
    Exact(T),
    /// The value is not known exactly, but is likely close to this value
    Inexact(T),
    /// Nothing is known about the value
    #[default]
    Absent,
}
39
40impl<T: Debug + Clone + PartialEq + Eq + PartialOrd> Precision<T> {
41    /// If we have some value (exact or inexact), it returns that value.
42    /// Otherwise, it returns `None`.
43    pub fn get_value(&self) -> Option<&T> {
44        match self {
45            Precision::Exact(value) | Precision::Inexact(value) => Some(value),
46            Precision::Absent => None,
47        }
48    }
49
50    /// Transform the value in this [`Precision`] object, if one exists, using
51    /// the given function. Preserves the exactness state.
52    pub fn map<U, F>(self, f: F) -> Precision<U>
53    where
54        F: Fn(T) -> U,
55        U: Debug + Clone + PartialEq + Eq + PartialOrd,
56    {
57        match self {
58            Precision::Exact(val) => Precision::Exact(f(val)),
59            Precision::Inexact(val) => Precision::Inexact(f(val)),
60            _ => Precision::<U>::Absent,
61        }
62    }
63
64    /// Returns `Some(true)` if we have an exact value, `Some(false)` if we
65    /// have an inexact value, and `None` if there is no value.
66    pub fn is_exact(&self) -> Option<bool> {
67        match self {
68            Precision::Exact(_) => Some(true),
69            Precision::Inexact(_) => Some(false),
70            _ => None,
71        }
72    }
73
74    /// Returns the maximum of two (possibly inexact) values, conservatively
75    /// propagating exactness information. If one of the input values is
76    /// [`Precision::Absent`], the result is `Absent` too.
77    pub fn max(&self, other: &Precision<T>) -> Precision<T> {
78        match (self, other) {
79            (Precision::Exact(a), Precision::Exact(b)) => {
80                Precision::Exact(if a >= b { a.clone() } else { b.clone() })
81            }
82            (Precision::Inexact(a), Precision::Exact(b))
83            | (Precision::Exact(a), Precision::Inexact(b))
84            | (Precision::Inexact(a), Precision::Inexact(b)) => {
85                Precision::Inexact(if a >= b { a.clone() } else { b.clone() })
86            }
87            (_, _) => Precision::Absent,
88        }
89    }
90
91    /// Returns the minimum of two (possibly inexact) values, conservatively
92    /// propagating exactness information. If one of the input values is
93    /// [`Precision::Absent`], the result is `Absent` too.
94    pub fn min(&self, other: &Precision<T>) -> Precision<T> {
95        match (self, other) {
96            (Precision::Exact(a), Precision::Exact(b)) => {
97                Precision::Exact(if a >= b { b.clone() } else { a.clone() })
98            }
99            (Precision::Inexact(a), Precision::Exact(b))
100            | (Precision::Exact(a), Precision::Inexact(b))
101            | (Precision::Inexact(a), Precision::Inexact(b)) => {
102                Precision::Inexact(if a >= b { b.clone() } else { a.clone() })
103            }
104            (_, _) => Precision::Absent,
105        }
106    }
107
108    /// Demotes the precision state from exact to inexact (if present).
109    pub fn to_inexact(self) -> Self {
110        match self {
111            Precision::Exact(value) => Precision::Inexact(value),
112            _ => self,
113        }
114    }
115}
116
117impl Precision<usize> {
118    /// Calculates the sum of two (possibly inexact) [`usize`] values,
119    /// conservatively propagating exactness information. If one of the input
120    /// values is [`Precision::Absent`], the result is `Absent` too.
121    pub fn add(&self, other: &Precision<usize>) -> Precision<usize> {
122        match (self, other) {
123            (Precision::Exact(a), Precision::Exact(b)) => Precision::Exact(a + b),
124            (Precision::Inexact(a), Precision::Exact(b))
125            | (Precision::Exact(a), Precision::Inexact(b))
126            | (Precision::Inexact(a), Precision::Inexact(b)) => Precision::Inexact(a + b),
127            (_, _) => Precision::Absent,
128        }
129    }
130
131    /// Calculates the difference of two (possibly inexact) [`usize`] values,
132    /// conservatively propagating exactness information. If one of the input
133    /// values is [`Precision::Absent`], the result is `Absent` too.
134    pub fn sub(&self, other: &Precision<usize>) -> Precision<usize> {
135        match (self, other) {
136            (Precision::Exact(a), Precision::Exact(b)) => Precision::Exact(a - b),
137            (Precision::Inexact(a), Precision::Exact(b))
138            | (Precision::Exact(a), Precision::Inexact(b))
139            | (Precision::Inexact(a), Precision::Inexact(b)) => Precision::Inexact(a - b),
140            (_, _) => Precision::Absent,
141        }
142    }
143
144    /// Calculates the multiplication of two (possibly inexact) [`usize`] values,
145    /// conservatively propagating exactness information. If one of the input
146    /// values is [`Precision::Absent`], the result is `Absent` too.
147    pub fn multiply(&self, other: &Precision<usize>) -> Precision<usize> {
148        match (self, other) {
149            (Precision::Exact(a), Precision::Exact(b)) => Precision::Exact(a * b),
150            (Precision::Inexact(a), Precision::Exact(b))
151            | (Precision::Exact(a), Precision::Inexact(b))
152            | (Precision::Inexact(a), Precision::Inexact(b)) => Precision::Inexact(a * b),
153            (_, _) => Precision::Absent,
154        }
155    }
156
157    /// Return the estimate of applying a filter with estimated selectivity
158    /// `selectivity` to this Precision. A selectivity of `1.0` means that all
159    /// rows are selected. A selectivity of `0.5` means half the rows are
160    /// selected. Will always return inexact statistics.
161    pub fn with_estimated_selectivity(self, selectivity: f64) -> Self {
162        self.map(|v| ((v as f64 * selectivity).ceil()) as usize)
163            .to_inexact()
164    }
165}
166
167impl Precision<ScalarValue> {
168    /// Calculates the sum of two (possibly inexact) [`ScalarValue`] values,
169    /// conservatively propagating exactness information. If one of the input
170    /// values is [`Precision::Absent`], the result is `Absent` too.
171    pub fn add(&self, other: &Precision<ScalarValue>) -> Precision<ScalarValue> {
172        match (self, other) {
173            (Precision::Exact(a), Precision::Exact(b)) => {
174                a.add(b).map(Precision::Exact).unwrap_or(Precision::Absent)
175            }
176            (Precision::Inexact(a), Precision::Exact(b))
177            | (Precision::Exact(a), Precision::Inexact(b))
178            | (Precision::Inexact(a), Precision::Inexact(b)) => a
179                .add(b)
180                .map(Precision::Inexact)
181                .unwrap_or(Precision::Absent),
182            (_, _) => Precision::Absent,
183        }
184    }
185
186    /// Calculates the difference of two (possibly inexact) [`ScalarValue`] values,
187    /// conservatively propagating exactness information. If one of the input
188    /// values is [`Precision::Absent`], the result is `Absent` too.
189    pub fn sub(&self, other: &Precision<ScalarValue>) -> Precision<ScalarValue> {
190        match (self, other) {
191            (Precision::Exact(a), Precision::Exact(b)) => {
192                a.sub(b).map(Precision::Exact).unwrap_or(Precision::Absent)
193            }
194            (Precision::Inexact(a), Precision::Exact(b))
195            | (Precision::Exact(a), Precision::Inexact(b))
196            | (Precision::Inexact(a), Precision::Inexact(b)) => a
197                .sub(b)
198                .map(Precision::Inexact)
199                .unwrap_or(Precision::Absent),
200            (_, _) => Precision::Absent,
201        }
202    }
203
204    /// Calculates the multiplication of two (possibly inexact) [`ScalarValue`] values,
205    /// conservatively propagating exactness information. If one of the input
206    /// values is [`Precision::Absent`], the result is `Absent` too.
207    pub fn multiply(&self, other: &Precision<ScalarValue>) -> Precision<ScalarValue> {
208        match (self, other) {
209            (Precision::Exact(a), Precision::Exact(b)) => a
210                .mul_checked(b)
211                .map(Precision::Exact)
212                .unwrap_or(Precision::Absent),
213            (Precision::Inexact(a), Precision::Exact(b))
214            | (Precision::Exact(a), Precision::Inexact(b))
215            | (Precision::Inexact(a), Precision::Inexact(b)) => a
216                .mul_checked(b)
217                .map(Precision::Inexact)
218                .unwrap_or(Precision::Absent),
219            (_, _) => Precision::Absent,
220        }
221    }
222
223    /// Casts the value to the given data type, propagating exactness information.
224    pub fn cast_to(&self, data_type: &DataType) -> Result<Precision<ScalarValue>> {
225        match self {
226            Precision::Exact(value) => value.cast_to(data_type).map(Precision::Exact),
227            Precision::Inexact(value) => value.cast_to(data_type).map(Precision::Inexact),
228            Precision::Absent => Ok(Precision::Absent),
229        }
230    }
231}
232
233impl<T: Debug + Clone + PartialEq + Eq + PartialOrd> Debug for Precision<T> {
234    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
235        match self {
236            Precision::Exact(inner) => write!(f, "Exact({inner:?})"),
237            Precision::Inexact(inner) => write!(f, "Inexact({inner:?})"),
238            Precision::Absent => write!(f, "Absent"),
239        }
240    }
241}
242
243impl<T: Debug + Clone + PartialEq + Eq + PartialOrd> Display for Precision<T> {
244    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
245        match self {
246            Precision::Exact(inner) => write!(f, "Exact({inner:?})"),
247            Precision::Inexact(inner) => write!(f, "Inexact({inner:?})"),
248            Precision::Absent => write!(f, "Absent"),
249        }
250    }
251}
252
253impl From<Precision<usize>> for Precision<ScalarValue> {
254    fn from(value: Precision<usize>) -> Self {
255        match value {
256            Precision::Exact(v) => Precision::Exact(ScalarValue::UInt64(Some(v as u64))),
257            Precision::Inexact(v) => {
258                Precision::Inexact(ScalarValue::UInt64(Some(v as u64)))
259            }
260            Precision::Absent => Precision::Absent,
261        }
262    }
263}
264
/// Statistics for a relation.
///
/// Fields are optional and can be inexact because the sources
/// sometimes provide approximate estimates for performance reasons
/// and the output of transformations is not always predictable.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Statistics {
    /// The number of table rows.
    pub num_rows: Precision<usize>,
    /// Total bytes of the table rows.
    pub total_byte_size: Precision<usize>,
    /// Statistics on a column level.
    ///
    /// It must contain a [`ColumnStatistics`] for each field in the schema of
    /// the table to which the [`Statistics`] refer.
    pub column_statistics: Vec<ColumnStatistics>,
}
281
282impl Default for Statistics {
283    /// Returns a new [`Statistics`] instance with all fields set to unknown
284    /// and no columns.
285    fn default() -> Self {
286        Self {
287            num_rows: Precision::Absent,
288            total_byte_size: Precision::Absent,
289            column_statistics: vec![],
290        }
291    }
292}
293
294impl Statistics {
295    /// Returns a [`Statistics`] instance for the given schema by assigning
296    /// unknown statistics to each column in the schema.
297    pub fn new_unknown(schema: &Schema) -> Self {
298        Self {
299            num_rows: Precision::Absent,
300            total_byte_size: Precision::Absent,
301            column_statistics: Statistics::unknown_column(schema),
302        }
303    }
304
305    /// Returns an unbounded `ColumnStatistics` for each field in the schema.
306    pub fn unknown_column(schema: &Schema) -> Vec<ColumnStatistics> {
307        schema
308            .fields()
309            .iter()
310            .map(|_| ColumnStatistics::new_unknown())
311            .collect()
312    }
313
314    /// Set the number of rows
315    pub fn with_num_rows(mut self, num_rows: Precision<usize>) -> Self {
316        self.num_rows = num_rows;
317        self
318    }
319
320    /// Set the total size, in bytes
321    pub fn with_total_byte_size(mut self, total_byte_size: Precision<usize>) -> Self {
322        self.total_byte_size = total_byte_size;
323        self
324    }
325
326    /// Add a column to the column statistics
327    pub fn add_column_statistics(mut self, column_stats: ColumnStatistics) -> Self {
328        self.column_statistics.push(column_stats);
329        self
330    }
331
332    /// If the exactness of a [`Statistics`] instance is lost, this function relaxes
333    /// the exactness of all information by converting them [`Precision::Inexact`].
334    pub fn to_inexact(mut self) -> Self {
335        self.num_rows = self.num_rows.to_inexact();
336        self.total_byte_size = self.total_byte_size.to_inexact();
337        self.column_statistics = self
338            .column_statistics
339            .into_iter()
340            .map(|s| s.to_inexact())
341            .collect();
342        self
343    }
344
    /// Project the statistics to the given column indices.
    ///
    /// For example, if we had statistics for columns `{"a", "b", "c"}`,
    /// projecting to `vec![2, 1]` would return statistics for columns `{"c",
    /// "b"}`.
    ///
    /// When `projection` is `None` the statistics are returned unchanged.
    /// Only `column_statistics` is rewritten; `num_rows` and
    /// `total_byte_size` are left as they are. An index may appear more than
    /// once, in which case the repeated occurrences receive a clone of the
    /// column statistics that were already projected.
    ///
    /// # Panics
    ///
    /// Panics with "projection out of bounds" if an index in `projection`
    /// is not a valid position in the column statistics.
    pub fn project(mut self, projection: Option<&Vec<usize>>) -> Self {
        let Some(projection) = projection else {
            return self;
        };

        #[allow(clippy::large_enum_variant)]
        enum Slot {
            /// The column is taken and put into the specified statistics location
            Taken(usize),
            /// The original column is present
            Present(ColumnStatistics),
        }

        // Convert to Vec<Slot> so we can avoid copying the statistics.
        // `mem::take` leaves `self.column_statistics` empty, so it can be
        // refilled below in projection order.
        let mut columns: Vec<_> = std::mem::take(&mut self.column_statistics)
            .into_iter()
            .map(Slot::Present)
            .collect();

        for idx in projection {
            // Position in the output at which this column is about to be pushed.
            let next_idx = self.column_statistics.len();
            // Mark the source slot as taken, remembering where its statistics
            // now live in the output.
            let slot = std::mem::replace(
                columns.get_mut(*idx).expect("projection out of bounds"),
                Slot::Taken(next_idx),
            );
            match slot {
                // The column was there, so just move it
                Slot::Present(col) => self.column_statistics.push(col),
                // The column was taken, so copy from the previous location
                Slot::Taken(prev_idx) => self
                    .column_statistics
                    .push(self.column_statistics[prev_idx].clone()),
            }
        }

        self
    }
387
    /// Calculates the statistics after applying `fetch` and `skip` operations.
    ///
    /// Here, `self` denotes per-partition statistics. Use the `n_partitions`
    /// parameter to compute global statistics in a multi-partition setting.
    ///
    /// # Arguments
    ///
    /// * `schema` - used to rebuild unknown column statistics for the output.
    /// * `fetch` - maximum number of rows to return; `None` means no limit.
    /// * `skip` - number of leading rows to skip.
    /// * `n_partitions` - multiplier applied to the per-partition row estimate.
    ///
    /// # Returns
    ///
    /// If a row count is known, does not exceed `fetch`, and `skip` is zero,
    /// the input statistics are returned unchanged. Otherwise only `num_rows`
    /// is estimated; the column statistics are reset to unknown and
    /// `total_byte_size` becomes absent. A row estimate that overflows when
    /// multiplied by `n_partitions` is reported as absent.
    ///
    /// NOTE(review): every path visible in this function returns `Ok`.
    pub fn with_fetch(
        mut self,
        schema: SchemaRef,
        fetch: Option<usize>,
        skip: usize,
        n_partitions: usize,
    ) -> Result<Self> {
        // A missing `fetch` is treated as an unlimited fetch.
        let fetch_val = fetch.unwrap_or(usize::MAX);

        self.num_rows = match self {
            Statistics {
                num_rows: Precision::Exact(nr),
                ..
            }
            | Statistics {
                num_rows: Precision::Inexact(nr),
                ..
            } => {
                // Here, the inexact case gives us an upper bound on the number of rows.
                if nr <= skip {
                    // All input data will be skipped:
                    Precision::Exact(0)
                } else if nr <= fetch_val && skip == 0 {
                    // If the input does not reach the `fetch` globally, and `skip`
                    // is zero (meaning the input and output are identical), return
                    // input stats as is.
                    // TODO: Can input stats still be used, but adjusted, when `skip`
                    //       is non-zero?
                    return Ok(self);
                } else if nr - skip <= fetch_val {
                    // After `skip` input rows are skipped, the remaining rows are
                    // less than or equal to the `fetch` values, so `num_rows` must
                    // equal the remaining rows.
                    // (`nr - skip` cannot underflow: `nr <= skip` was handled above.)
                    check_num_rows(
                        (nr - skip).checked_mul(n_partitions),
                        // We know that we have an estimate for the number of rows:
                        self.num_rows.is_exact().unwrap(),
                    )
                } else {
                    // At this point we know that we were given a `fetch` value
                    // as the `None` case would go into the branch above. Since
                    // the input has more rows than `fetch + skip`, the number
                    // of rows will be the `fetch`, but we won't be able to
                    // predict the other statistics.
                    check_num_rows(
                        fetch_val.checked_mul(n_partitions),
                        // We know that we have an estimate for the number of rows:
                        self.num_rows.is_exact().unwrap(),
                    )
                }
            }
            // Without an input row count, the only available estimate is the
            // fetch limit itself (if any), and it is never exact.
            Statistics {
                num_rows: Precision::Absent,
                ..
            } => check_num_rows(fetch.and_then(|v| v.checked_mul(n_partitions)), false),
        };
        // The remaining statistics cannot be predicted after fetch/skip.
        self.column_statistics = Statistics::unknown_column(&schema);
        self.total_byte_size = Precision::Absent;
        Ok(self)
    }
452
453    /// Summarize zero or more statistics into a single `Statistics` instance.
454    ///
455    /// The method assumes that all statistics are for the same schema.
456    /// If not, maybe you can call `SchemaMapper::map_column_statistics` to make them consistent.
457    ///
458    /// Returns an error if the statistics do not match the specified schemas.
459    pub fn try_merge_iter<'a, I>(items: I, schema: &Schema) -> Result<Statistics>
460    where
461        I: IntoIterator<Item = &'a Statistics>,
462    {
463        let mut items = items.into_iter();
464
465        let Some(init) = items.next() else {
466            return Ok(Statistics::new_unknown(schema));
467        };
468        items.try_fold(init.clone(), |acc: Statistics, item_stats: &Statistics| {
469            acc.try_merge(item_stats)
470        })
471    }
472
473    /// Merge this Statistics value with another Statistics value.
474    ///
475    /// Returns an error if the statistics do not match (different schemas).
476    ///
477    /// # Example
478    /// ```
479    /// # use datafusion_common::{ColumnStatistics, ScalarValue, Statistics};
480    /// # use arrow::datatypes::{Field, Schema, DataType};
481    /// # use datafusion_common::stats::Precision;
482    /// let stats1 = Statistics::default()
483    ///   .with_num_rows(Precision::Exact(1))
484    ///   .with_total_byte_size(Precision::Exact(2))
485    ///   .add_column_statistics(ColumnStatistics::new_unknown()
486    ///      .with_null_count(Precision::Exact(3))
487    ///      .with_min_value(Precision::Exact(ScalarValue::from(4)))
488    ///      .with_max_value(Precision::Exact(ScalarValue::from(5)))
489    ///   );
490    ///
491    /// let stats2 = Statistics::default()
492    ///   .with_num_rows(Precision::Exact(10))
493    ///   .with_total_byte_size(Precision::Inexact(20))
494    ///   .add_column_statistics(ColumnStatistics::new_unknown()
495    ///       // absent null count
496    ///      .with_min_value(Precision::Exact(ScalarValue::from(40)))
497    ///      .with_max_value(Precision::Exact(ScalarValue::from(50)))
498    ///   );
499    ///
500    /// let merged_stats = stats1.try_merge(&stats2).unwrap();
501    /// let expected_stats = Statistics::default()
502    ///   .with_num_rows(Precision::Exact(11))
503    ///   .with_total_byte_size(Precision::Inexact(22)) // inexact in stats2 --> inexact
504    ///   .add_column_statistics(
505    ///     ColumnStatistics::new_unknown()
506    ///       .with_null_count(Precision::Absent) // missing from stats2 --> absent
507    ///       .with_min_value(Precision::Exact(ScalarValue::from(4)))
508    ///       .with_max_value(Precision::Exact(ScalarValue::from(50)))
509    ///   );
510    ///
511    /// assert_eq!(merged_stats, expected_stats)
512    /// ```
513    pub fn try_merge(self, other: &Statistics) -> Result<Self> {
514        let Self {
515            mut num_rows,
516            mut total_byte_size,
517            mut column_statistics,
518        } = self;
519
520        // Accumulate statistics for subsequent items
521        num_rows = num_rows.add(&other.num_rows);
522        total_byte_size = total_byte_size.add(&other.total_byte_size);
523
524        if column_statistics.len() != other.column_statistics.len() {
525            return _plan_err!(
526                "Cannot merge statistics with different number of columns: {} vs {}",
527                column_statistics.len(),
528                other.column_statistics.len()
529            );
530        }
531
532        for (item_col_stats, col_stats) in other
533            .column_statistics
534            .iter()
535            .zip(column_statistics.iter_mut())
536        {
537            col_stats.null_count = col_stats.null_count.add(&item_col_stats.null_count);
538            col_stats.max_value = col_stats.max_value.max(&item_col_stats.max_value);
539            col_stats.min_value = col_stats.min_value.min(&item_col_stats.min_value);
540            col_stats.sum_value = col_stats.sum_value.add(&item_col_stats.sum_value);
541        }
542
543        Ok(Statistics {
544            num_rows,
545            total_byte_size,
546            column_statistics,
547        })
548    }
549}
550
551/// Creates an estimate of the number of rows in the output using the given
552/// optional value and exactness flag.
553fn check_num_rows(value: Option<usize>, is_exact: bool) -> Precision<usize> {
554    if let Some(value) = value {
555        if is_exact {
556            Precision::Exact(value)
557        } else {
558            // If the input stats are inexact, so are the output stats.
559            Precision::Inexact(value)
560        }
561    } else {
562        // If the estimate is not available (e.g. due to an overflow), we can
563        // not produce a reliable estimate.
564        Precision::Absent
565    }
566}
567
568impl Display for Statistics {
569    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
570        // string of column statistics
571        let column_stats = self
572            .column_statistics
573            .iter()
574            .enumerate()
575            .map(|(i, cs)| {
576                let s = format!("(Col[{i}]:");
577                let s = if cs.min_value != Precision::Absent {
578                    format!("{} Min={}", s, cs.min_value)
579                } else {
580                    s
581                };
582                let s = if cs.max_value != Precision::Absent {
583                    format!("{} Max={}", s, cs.max_value)
584                } else {
585                    s
586                };
587                let s = if cs.sum_value != Precision::Absent {
588                    format!("{} Sum={}", s, cs.sum_value)
589                } else {
590                    s
591                };
592                let s = if cs.null_count != Precision::Absent {
593                    format!("{} Null={}", s, cs.null_count)
594                } else {
595                    s
596                };
597                let s = if cs.distinct_count != Precision::Absent {
598                    format!("{} Distinct={}", s, cs.distinct_count)
599                } else {
600                    s
601                };
602
603                s + ")"
604            })
605            .collect::<Vec<_>>()
606            .join(",");
607
608        write!(
609            f,
610            "Rows={}, Bytes={}, [{}]",
611            self.num_rows, self.total_byte_size, column_stats
612        )?;
613
614        Ok(())
615    }
616}
617
/// Statistics for a column within a relation.
///
/// Every field is a [`Precision`], so each statistic can independently be
/// exact, inexact, or absent. The derived default has every field absent.
#[derive(Clone, Debug, PartialEq, Eq, Default)]
pub struct ColumnStatistics {
    /// Number of null values on column
    pub null_count: Precision<usize>,
    /// Maximum value of column
    pub max_value: Precision<ScalarValue>,
    /// Minimum value of column
    pub min_value: Precision<ScalarValue>,
    /// Sum value of a column
    pub sum_value: Precision<ScalarValue>,
    /// Number of distinct values
    pub distinct_count: Precision<usize>,
}
632
633impl ColumnStatistics {
634    /// Column contains a single non null value (e.g constant).
635    pub fn is_singleton(&self) -> bool {
636        match (&self.min_value, &self.max_value) {
637            // Min and max values are the same and not infinity.
638            (Precision::Exact(min), Precision::Exact(max)) => {
639                !min.is_null() && !max.is_null() && (min == max)
640            }
641            (_, _) => false,
642        }
643    }
644
645    /// Returns a [`ColumnStatistics`] instance having all [`Precision::Absent`] parameters.
646    pub fn new_unknown() -> Self {
647        Self {
648            null_count: Precision::Absent,
649            max_value: Precision::Absent,
650            min_value: Precision::Absent,
651            sum_value: Precision::Absent,
652            distinct_count: Precision::Absent,
653        }
654    }
655
656    /// Set the null count
657    pub fn with_null_count(mut self, null_count: Precision<usize>) -> Self {
658        self.null_count = null_count;
659        self
660    }
661
662    /// Set the max value
663    pub fn with_max_value(mut self, max_value: Precision<ScalarValue>) -> Self {
664        self.max_value = max_value;
665        self
666    }
667
668    /// Set the min value
669    pub fn with_min_value(mut self, min_value: Precision<ScalarValue>) -> Self {
670        self.min_value = min_value;
671        self
672    }
673
674    /// Set the sum value
675    pub fn with_sum_value(mut self, sum_value: Precision<ScalarValue>) -> Self {
676        self.sum_value = sum_value;
677        self
678    }
679
680    /// Set the distinct count
681    pub fn with_distinct_count(mut self, distinct_count: Precision<usize>) -> Self {
682        self.distinct_count = distinct_count;
683        self
684    }
685
686    /// If the exactness of a [`ColumnStatistics`] instance is lost, this
687    /// function relaxes the exactness of all information by converting them
688    /// [`Precision::Inexact`].
689    pub fn to_inexact(mut self) -> Self {
690        self.null_count = self.null_count.to_inexact();
691        self.max_value = self.max_value.to_inexact();
692        self.min_value = self.min_value.to_inexact();
693        self.sum_value = self.sum_value.to_inexact();
694        self.distinct_count = self.distinct_count.to_inexact();
695        self
696    }
697}
698
699#[cfg(test)]
700mod tests {
701    use super::*;
702    use crate::assert_contains;
703    use arrow::datatypes::Field;
704    use std::sync::Arc;
705
    /// `get_value` exposes the inner value for `Exact` and `Inexact` and
    /// yields `None` for `Absent`.
    #[test]
    fn test_get_value() {
        let exact_precision = Precision::Exact(42);
        let inexact_precision = Precision::Inexact(23);
        let absent_precision = Precision::<i32>::Absent;

        assert_eq!(*exact_precision.get_value().unwrap(), 42);
        assert_eq!(*inexact_precision.get_value().unwrap(), 23);
        assert_eq!(absent_precision.get_value(), None);
    }
716
    /// `map` transforms the inner value while keeping the exactness state;
    /// `Absent` stays `Absent`.
    #[test]
    fn test_map() {
        let exact_precision = Precision::Exact(42);
        let inexact_precision = Precision::Inexact(23);
        let absent_precision = Precision::Absent;

        // The same closure is passed to all three `map` calls below.
        let squared = |x| x * x;

        assert_eq!(exact_precision.map(squared), Precision::Exact(1764));
        assert_eq!(inexact_precision.map(squared), Precision::Inexact(529));
        assert_eq!(absent_precision.map(squared), Precision::Absent);
    }
729
    /// `is_exact` distinguishes exact (`Some(true)`), inexact (`Some(false)`)
    /// and absent (`None`) values.
    #[test]
    fn test_is_exact() {
        let exact_precision = Precision::Exact(42);
        let inexact_precision = Precision::Inexact(23);
        let absent_precision = Precision::<i32>::Absent;

        assert_eq!(exact_precision.is_exact(), Some(true));
        assert_eq!(inexact_precision.is_exact(), Some(false));
        assert_eq!(absent_precision.is_exact(), None);
    }
740
    /// `max` picks the larger value; the result is exact only when both
    /// operands are exact, and absent when either operand is absent.
    #[test]
    fn test_max() {
        let precision1 = Precision::Exact(42);
        let precision2 = Precision::Inexact(23);
        let precision3 = Precision::Exact(30);
        let absent_precision = Precision::Absent;

        assert_eq!(precision1.max(&precision2), Precision::Inexact(42));
        assert_eq!(precision1.max(&precision3), Precision::Exact(42));
        assert_eq!(precision2.max(&precision3), Precision::Inexact(30));
        assert_eq!(precision1.max(&absent_precision), Precision::Absent);
    }
753
    /// `min` picks the smaller value; the result is exact only when both
    /// operands are exact, and absent when either operand is absent.
    #[test]
    fn test_min() {
        let precision1 = Precision::Exact(42);
        let precision2 = Precision::Inexact(23);
        let precision3 = Precision::Exact(30);
        let absent_precision = Precision::Absent;

        assert_eq!(precision1.min(&precision2), Precision::Inexact(23));
        assert_eq!(precision1.min(&precision3), Precision::Exact(30));
        assert_eq!(precision2.min(&precision3), Precision::Inexact(23));
        assert_eq!(precision1.min(&absent_precision), Precision::Absent);
    }
766
    /// `to_inexact` demotes `Exact` to `Inexact` and leaves `Inexact` and
    /// `Absent` values unchanged.
    #[test]
    fn test_to_inexact() {
        let exact_precision = Precision::Exact(42);
        let inexact_precision = Precision::Inexact(42);
        let absent_precision = Precision::<i32>::Absent;

        assert_eq!(exact_precision.to_inexact(), inexact_precision);
        assert_eq!(inexact_precision.to_inexact(), inexact_precision);
        assert_eq!(absent_precision.to_inexact(), absent_precision);
    }
777
778    #[test]
779    fn test_add() {
780        let precision1 = Precision::Exact(42);
781        let precision2 = Precision::Inexact(23);
782        let precision3 = Precision::Exact(30);
783        let absent_precision = Precision::Absent;
784
785        assert_eq!(precision1.add(&precision2), Precision::Inexact(65));
786        assert_eq!(precision1.add(&precision3), Precision::Exact(72));
787        assert_eq!(precision2.add(&precision3), Precision::Inexact(53));
788        assert_eq!(precision1.add(&absent_precision), Precision::Absent);
789    }
790
791    #[test]
792    fn test_add_scalar() {
793        let precision = Precision::Exact(ScalarValue::Int32(Some(42)));
794
795        assert_eq!(
796            precision.add(&Precision::Exact(ScalarValue::Int32(Some(23)))),
797            Precision::Exact(ScalarValue::Int32(Some(65))),
798        );
799        assert_eq!(
800            precision.add(&Precision::Inexact(ScalarValue::Int32(Some(23)))),
801            Precision::Inexact(ScalarValue::Int32(Some(65))),
802        );
803        assert_eq!(
804            precision.add(&Precision::Exact(ScalarValue::Int32(None))),
805            // As per behavior of ScalarValue::add
806            Precision::Exact(ScalarValue::Int32(None)),
807        );
808        assert_eq!(precision.add(&Precision::Absent), Precision::Absent);
809    }
810
811    #[test]
812    fn test_sub() {
813        let precision1 = Precision::Exact(42);
814        let precision2 = Precision::Inexact(23);
815        let precision3 = Precision::Exact(30);
816        let absent_precision = Precision::Absent;
817
818        assert_eq!(precision1.sub(&precision2), Precision::Inexact(19));
819        assert_eq!(precision1.sub(&precision3), Precision::Exact(12));
820        assert_eq!(precision1.sub(&absent_precision), Precision::Absent);
821    }
822
823    #[test]
824    fn test_sub_scalar() {
825        let precision = Precision::Exact(ScalarValue::Int32(Some(42)));
826
827        assert_eq!(
828            precision.sub(&Precision::Exact(ScalarValue::Int32(Some(23)))),
829            Precision::Exact(ScalarValue::Int32(Some(19))),
830        );
831        assert_eq!(
832            precision.sub(&Precision::Inexact(ScalarValue::Int32(Some(23)))),
833            Precision::Inexact(ScalarValue::Int32(Some(19))),
834        );
835        assert_eq!(
836            precision.sub(&Precision::Exact(ScalarValue::Int32(None))),
837            // As per behavior of ScalarValue::sub
838            Precision::Exact(ScalarValue::Int32(None)),
839        );
840        assert_eq!(precision.sub(&Precision::Absent), Precision::Absent);
841    }
842
843    #[test]
844    fn test_multiply() {
845        let precision1 = Precision::Exact(6);
846        let precision2 = Precision::Inexact(3);
847        let precision3 = Precision::Exact(5);
848        let absent_precision = Precision::Absent;
849
850        assert_eq!(precision1.multiply(&precision2), Precision::Inexact(18));
851        assert_eq!(precision1.multiply(&precision3), Precision::Exact(30));
852        assert_eq!(precision2.multiply(&precision3), Precision::Inexact(15));
853        assert_eq!(precision1.multiply(&absent_precision), Precision::Absent);
854    }
855
856    #[test]
857    fn test_multiply_scalar() {
858        let precision = Precision::Exact(ScalarValue::Int32(Some(6)));
859
860        assert_eq!(
861            precision.multiply(&Precision::Exact(ScalarValue::Int32(Some(5)))),
862            Precision::Exact(ScalarValue::Int32(Some(30))),
863        );
864        assert_eq!(
865            precision.multiply(&Precision::Inexact(ScalarValue::Int32(Some(5)))),
866            Precision::Inexact(ScalarValue::Int32(Some(30))),
867        );
868        assert_eq!(
869            precision.multiply(&Precision::Exact(ScalarValue::Int32(None))),
870            // As per behavior of ScalarValue::mul_checked
871            Precision::Exact(ScalarValue::Int32(None)),
872        );
873        assert_eq!(precision.multiply(&Precision::Absent), Precision::Absent);
874    }
875
876    #[test]
877    fn test_cast_to() {
878        // Valid
879        assert_eq!(
880            Precision::Exact(ScalarValue::Int32(Some(42)))
881                .cast_to(&DataType::Int64)
882                .unwrap(),
883            Precision::Exact(ScalarValue::Int64(Some(42))),
884        );
885        assert_eq!(
886            Precision::Inexact(ScalarValue::Int32(Some(42)))
887                .cast_to(&DataType::Int64)
888                .unwrap(),
889            Precision::Inexact(ScalarValue::Int64(Some(42))),
890        );
891        // Null
892        assert_eq!(
893            Precision::Exact(ScalarValue::Int32(None))
894                .cast_to(&DataType::Int64)
895                .unwrap(),
896            Precision::Exact(ScalarValue::Int64(None)),
897        );
898        // Overflow returns error
899        assert!(Precision::Exact(ScalarValue::Int32(Some(256)))
900            .cast_to(&DataType::Int8)
901            .is_err());
902    }
903
904    #[test]
905    fn test_precision_cloning() {
906        // Precision<usize> is copy
907        let precision: Precision<usize> = Precision::Exact(42);
908        let p2 = precision;
909        assert_eq!(precision, p2);
910
911        // Precision<ScalarValue> is not copy (requires .clone())
912        let precision: Precision<ScalarValue> =
913            Precision::Exact(ScalarValue::Int64(Some(42)));
914        // Clippy would complain about this if it were Copy
915        #[allow(clippy::redundant_clone)]
916        let p2 = precision.clone();
917        assert_eq!(precision, p2);
918    }
919
920    #[test]
921    fn test_project_none() {
922        let projection = None;
923        let stats = make_stats(vec![10, 20, 30]).project(projection.as_ref());
924        assert_eq!(stats, make_stats(vec![10, 20, 30]));
925    }
926
927    #[test]
928    fn test_project_empty() {
929        let projection = Some(vec![]);
930        let stats = make_stats(vec![10, 20, 30]).project(projection.as_ref());
931        assert_eq!(stats, make_stats(vec![]));
932    }
933
934    #[test]
935    fn test_project_swap() {
936        let projection = Some(vec![2, 1]);
937        let stats = make_stats(vec![10, 20, 30]).project(projection.as_ref());
938        assert_eq!(stats, make_stats(vec![30, 20]));
939    }
940
941    #[test]
942    fn test_project_repeated() {
943        let projection = Some(vec![1, 2, 1, 1, 0, 2]);
944        let stats = make_stats(vec![10, 20, 30]).project(projection.as_ref());
945        assert_eq!(stats, make_stats(vec![20, 30, 20, 20, 10, 30]));
946    }
947
948    // Make a Statistics structure with the specified null counts for each column
949    fn make_stats(counts: impl IntoIterator<Item = usize>) -> Statistics {
950        Statistics {
951            num_rows: Precision::Exact(42),
952            total_byte_size: Precision::Exact(500),
953            column_statistics: counts.into_iter().map(col_stats_i64).collect(),
954        }
955    }
956
957    fn col_stats_i64(null_count: usize) -> ColumnStatistics {
958        ColumnStatistics {
959            null_count: Precision::Exact(null_count),
960            max_value: Precision::Exact(ScalarValue::Int64(Some(42))),
961            min_value: Precision::Exact(ScalarValue::Int64(Some(64))),
962            sum_value: Precision::Exact(ScalarValue::Int64(Some(4600))),
963            distinct_count: Precision::Exact(100),
964        }
965    }
966
967    #[test]
968    fn test_try_merge_basic() {
969        // Create a schema with two columns
970        let schema = Arc::new(Schema::new(vec![
971            Field::new("col1", DataType::Int32, false),
972            Field::new("col2", DataType::Int32, false),
973        ]));
974
975        // Create items with statistics
976        let stats1 = Statistics {
977            num_rows: Precision::Exact(10),
978            total_byte_size: Precision::Exact(100),
979            column_statistics: vec![
980                ColumnStatistics {
981                    null_count: Precision::Exact(1),
982                    max_value: Precision::Exact(ScalarValue::Int32(Some(100))),
983                    min_value: Precision::Exact(ScalarValue::Int32(Some(1))),
984                    sum_value: Precision::Exact(ScalarValue::Int32(Some(500))),
985                    distinct_count: Precision::Absent,
986                },
987                ColumnStatistics {
988                    null_count: Precision::Exact(2),
989                    max_value: Precision::Exact(ScalarValue::Int32(Some(200))),
990                    min_value: Precision::Exact(ScalarValue::Int32(Some(10))),
991                    sum_value: Precision::Exact(ScalarValue::Int32(Some(1000))),
992                    distinct_count: Precision::Absent,
993                },
994            ],
995        };
996
997        let stats2 = Statistics {
998            num_rows: Precision::Exact(15),
999            total_byte_size: Precision::Exact(150),
1000            column_statistics: vec![
1001                ColumnStatistics {
1002                    null_count: Precision::Exact(2),
1003                    max_value: Precision::Exact(ScalarValue::Int32(Some(120))),
1004                    min_value: Precision::Exact(ScalarValue::Int32(Some(-10))),
1005                    sum_value: Precision::Exact(ScalarValue::Int32(Some(600))),
1006                    distinct_count: Precision::Absent,
1007                },
1008                ColumnStatistics {
1009                    null_count: Precision::Exact(3),
1010                    max_value: Precision::Exact(ScalarValue::Int32(Some(180))),
1011                    min_value: Precision::Exact(ScalarValue::Int32(Some(5))),
1012                    sum_value: Precision::Exact(ScalarValue::Int32(Some(1200))),
1013                    distinct_count: Precision::Absent,
1014                },
1015            ],
1016        };
1017
1018        let items = vec![stats1, stats2];
1019
1020        let summary_stats = Statistics::try_merge_iter(&items, &schema).unwrap();
1021
1022        // Verify the results
1023        assert_eq!(summary_stats.num_rows, Precision::Exact(25)); // 10 + 15
1024        assert_eq!(summary_stats.total_byte_size, Precision::Exact(250)); // 100 + 150
1025
1026        // Verify column statistics
1027        let col1_stats = &summary_stats.column_statistics[0];
1028        assert_eq!(col1_stats.null_count, Precision::Exact(3)); // 1 + 2
1029        assert_eq!(
1030            col1_stats.max_value,
1031            Precision::Exact(ScalarValue::Int32(Some(120)))
1032        );
1033        assert_eq!(
1034            col1_stats.min_value,
1035            Precision::Exact(ScalarValue::Int32(Some(-10)))
1036        );
1037        assert_eq!(
1038            col1_stats.sum_value,
1039            Precision::Exact(ScalarValue::Int32(Some(1100)))
1040        ); // 500 + 600
1041
1042        let col2_stats = &summary_stats.column_statistics[1];
1043        assert_eq!(col2_stats.null_count, Precision::Exact(5)); // 2 + 3
1044        assert_eq!(
1045            col2_stats.max_value,
1046            Precision::Exact(ScalarValue::Int32(Some(200)))
1047        );
1048        assert_eq!(
1049            col2_stats.min_value,
1050            Precision::Exact(ScalarValue::Int32(Some(5)))
1051        );
1052        assert_eq!(
1053            col2_stats.sum_value,
1054            Precision::Exact(ScalarValue::Int32(Some(2200)))
1055        ); // 1000 + 1200
1056    }
1057
1058    #[test]
1059    fn test_try_merge_mixed_precision() {
1060        // Create a schema with one column
1061        let schema = Arc::new(Schema::new(vec![Field::new(
1062            "col1",
1063            DataType::Int32,
1064            false,
1065        )]));
1066
1067        // Create items with different precision levels
1068        let stats1 = Statistics {
1069            num_rows: Precision::Exact(10),
1070            total_byte_size: Precision::Inexact(100),
1071            column_statistics: vec![ColumnStatistics {
1072                null_count: Precision::Exact(1),
1073                max_value: Precision::Exact(ScalarValue::Int32(Some(100))),
1074                min_value: Precision::Inexact(ScalarValue::Int32(Some(1))),
1075                sum_value: Precision::Exact(ScalarValue::Int32(Some(500))),
1076                distinct_count: Precision::Absent,
1077            }],
1078        };
1079
1080        let stats2 = Statistics {
1081            num_rows: Precision::Inexact(15),
1082            total_byte_size: Precision::Exact(150),
1083            column_statistics: vec![ColumnStatistics {
1084                null_count: Precision::Inexact(2),
1085                max_value: Precision::Inexact(ScalarValue::Int32(Some(120))),
1086                min_value: Precision::Exact(ScalarValue::Int32(Some(-10))),
1087                sum_value: Precision::Absent,
1088                distinct_count: Precision::Absent,
1089            }],
1090        };
1091
1092        let items = vec![stats1, stats2];
1093
1094        let summary_stats = Statistics::try_merge_iter(&items, &schema).unwrap();
1095
1096        assert_eq!(summary_stats.num_rows, Precision::Inexact(25));
1097        assert_eq!(summary_stats.total_byte_size, Precision::Inexact(250));
1098
1099        let col_stats = &summary_stats.column_statistics[0];
1100        assert_eq!(col_stats.null_count, Precision::Inexact(3));
1101        assert_eq!(
1102            col_stats.max_value,
1103            Precision::Inexact(ScalarValue::Int32(Some(120)))
1104        );
1105        assert_eq!(
1106            col_stats.min_value,
1107            Precision::Inexact(ScalarValue::Int32(Some(-10)))
1108        );
1109        assert!(matches!(col_stats.sum_value, Precision::Absent));
1110    }
1111
1112    #[test]
1113    fn test_try_merge_empty() {
1114        let schema = Arc::new(Schema::new(vec![Field::new(
1115            "col1",
1116            DataType::Int32,
1117            false,
1118        )]));
1119
1120        // Empty collection
1121        let items: Vec<Statistics> = vec![];
1122
1123        let summary_stats = Statistics::try_merge_iter(&items, &schema).unwrap();
1124
1125        // Verify default values for empty collection
1126        assert_eq!(summary_stats.num_rows, Precision::Absent);
1127        assert_eq!(summary_stats.total_byte_size, Precision::Absent);
1128        assert_eq!(summary_stats.column_statistics.len(), 1);
1129        assert_eq!(
1130            summary_stats.column_statistics[0].null_count,
1131            Precision::Absent
1132        );
1133    }
1134
1135    #[test]
1136    fn test_try_merge_mismatched_size() {
1137        // Create a schema with one column
1138        let schema = Arc::new(Schema::new(vec![Field::new(
1139            "col1",
1140            DataType::Int32,
1141            false,
1142        )]));
1143
1144        // No column statistics
1145        let stats1 = Statistics::default();
1146
1147        let stats2 =
1148            Statistics::default().add_column_statistics(ColumnStatistics::new_unknown());
1149
1150        let items = vec![stats1, stats2];
1151
1152        let e = Statistics::try_merge_iter(&items, &schema).unwrap_err();
1153        assert_contains!(e.to_string(), "Error during planning: Cannot merge statistics with different number of columns: 0 vs 1");
1154    }
1155}