datafusion_common/
stats.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! This module provides data structures to represent statistics
19
20use std::fmt::{self, Debug, Display};
21
22use crate::{Result, ScalarValue};
23
24use crate::error::_plan_err;
25use arrow::datatypes::{DataType, Schema};
26
/// A statistical value tagged with how much it can be trusted.
///
/// `Precision` is used to propagate information about the precision of
/// statistical values: a value is either known exactly, known only
/// approximately, or not known at all.
#[derive(Clone, Copy, Default, PartialEq, Eq)]
pub enum Precision<T: Debug + Clone + PartialEq + Eq + PartialOrd> {
    /// The value is known exactly.
    Exact(T),
    /// The exact value is not known, but it is likely close to this one.
    Inexact(T),
    /// No information about the value is available. This is the default.
    #[default]
    Absent,
}
39
40impl<T: Debug + Clone + PartialEq + Eq + PartialOrd> Precision<T> {
41    /// If we have some value (exact or inexact), it returns that value.
42    /// Otherwise, it returns `None`.
43    pub fn get_value(&self) -> Option<&T> {
44        match self {
45            Precision::Exact(value) | Precision::Inexact(value) => Some(value),
46            Precision::Absent => None,
47        }
48    }
49
50    /// Transform the value in this [`Precision`] object, if one exists, using
51    /// the given function. Preserves the exactness state.
52    pub fn map<U, F>(self, f: F) -> Precision<U>
53    where
54        F: Fn(T) -> U,
55        U: Debug + Clone + PartialEq + Eq + PartialOrd,
56    {
57        match self {
58            Precision::Exact(val) => Precision::Exact(f(val)),
59            Precision::Inexact(val) => Precision::Inexact(f(val)),
60            _ => Precision::<U>::Absent,
61        }
62    }
63
64    /// Returns `Some(true)` if we have an exact value, `Some(false)` if we
65    /// have an inexact value, and `None` if there is no value.
66    pub fn is_exact(&self) -> Option<bool> {
67        match self {
68            Precision::Exact(_) => Some(true),
69            Precision::Inexact(_) => Some(false),
70            _ => None,
71        }
72    }
73
74    /// Returns the maximum of two (possibly inexact) values, conservatively
75    /// propagating exactness information. If one of the input values is
76    /// [`Precision::Absent`], the result is `Absent` too.
77    pub fn max(&self, other: &Precision<T>) -> Precision<T> {
78        match (self, other) {
79            (Precision::Exact(a), Precision::Exact(b)) => {
80                Precision::Exact(if a >= b { a.clone() } else { b.clone() })
81            }
82            (Precision::Inexact(a), Precision::Exact(b))
83            | (Precision::Exact(a), Precision::Inexact(b))
84            | (Precision::Inexact(a), Precision::Inexact(b)) => {
85                Precision::Inexact(if a >= b { a.clone() } else { b.clone() })
86            }
87            (_, _) => Precision::Absent,
88        }
89    }
90
91    /// Returns the minimum of two (possibly inexact) values, conservatively
92    /// propagating exactness information. If one of the input values is
93    /// [`Precision::Absent`], the result is `Absent` too.
94    pub fn min(&self, other: &Precision<T>) -> Precision<T> {
95        match (self, other) {
96            (Precision::Exact(a), Precision::Exact(b)) => {
97                Precision::Exact(if a >= b { b.clone() } else { a.clone() })
98            }
99            (Precision::Inexact(a), Precision::Exact(b))
100            | (Precision::Exact(a), Precision::Inexact(b))
101            | (Precision::Inexact(a), Precision::Inexact(b)) => {
102                Precision::Inexact(if a >= b { b.clone() } else { a.clone() })
103            }
104            (_, _) => Precision::Absent,
105        }
106    }
107
108    /// Demotes the precision state from exact to inexact (if present).
109    pub fn to_inexact(self) -> Self {
110        match self {
111            Precision::Exact(value) => Precision::Inexact(value),
112            _ => self,
113        }
114    }
115}
116
117impl Precision<usize> {
118    /// Calculates the sum of two (possibly inexact) [`usize`] values,
119    /// conservatively propagating exactness information. If one of the input
120    /// values is [`Precision::Absent`], the result is `Absent` too.
121    pub fn add(&self, other: &Precision<usize>) -> Precision<usize> {
122        match (self, other) {
123            (Precision::Exact(a), Precision::Exact(b)) => a.checked_add(*b).map_or_else(
124                || Precision::Inexact(a.saturating_add(*b)),
125                Precision::Exact,
126            ),
127            (Precision::Inexact(a), Precision::Exact(b))
128            | (Precision::Exact(a), Precision::Inexact(b))
129            | (Precision::Inexact(a), Precision::Inexact(b)) => {
130                Precision::Inexact(a.saturating_add(*b))
131            }
132            (_, _) => Precision::Absent,
133        }
134    }
135
136    /// Calculates the difference of two (possibly inexact) [`usize`] values,
137    /// conservatively propagating exactness information. If one of the input
138    /// values is [`Precision::Absent`], the result is `Absent` too.
139    pub fn sub(&self, other: &Precision<usize>) -> Precision<usize> {
140        match (self, other) {
141            (Precision::Exact(a), Precision::Exact(b)) => a.checked_sub(*b).map_or_else(
142                || Precision::Inexact(a.saturating_sub(*b)),
143                Precision::Exact,
144            ),
145            (Precision::Inexact(a), Precision::Exact(b))
146            | (Precision::Exact(a), Precision::Inexact(b))
147            | (Precision::Inexact(a), Precision::Inexact(b)) => {
148                Precision::Inexact(a.saturating_sub(*b))
149            }
150            (_, _) => Precision::Absent,
151        }
152    }
153
154    /// Calculates the multiplication of two (possibly inexact) [`usize`] values,
155    /// conservatively propagating exactness information. If one of the input
156    /// values is [`Precision::Absent`], the result is `Absent` too.
157    pub fn multiply(&self, other: &Precision<usize>) -> Precision<usize> {
158        match (self, other) {
159            (Precision::Exact(a), Precision::Exact(b)) => a.checked_mul(*b).map_or_else(
160                || Precision::Inexact(a.saturating_mul(*b)),
161                Precision::Exact,
162            ),
163            (Precision::Inexact(a), Precision::Exact(b))
164            | (Precision::Exact(a), Precision::Inexact(b))
165            | (Precision::Inexact(a), Precision::Inexact(b)) => {
166                Precision::Inexact(a.saturating_mul(*b))
167            }
168            (_, _) => Precision::Absent,
169        }
170    }
171
172    /// Return the estimate of applying a filter with estimated selectivity
173    /// `selectivity` to this Precision. A selectivity of `1.0` means that all
174    /// rows are selected. A selectivity of `0.5` means half the rows are
175    /// selected. Will always return inexact statistics.
176    pub fn with_estimated_selectivity(self, selectivity: f64) -> Self {
177        self.map(|v| ((v as f64 * selectivity).ceil()) as usize)
178            .to_inexact()
179    }
180}
181
182impl Precision<ScalarValue> {
183    /// Calculates the sum of two (possibly inexact) [`ScalarValue`] values,
184    /// conservatively propagating exactness information. If one of the input
185    /// values is [`Precision::Absent`], the result is `Absent` too.
186    pub fn add(&self, other: &Precision<ScalarValue>) -> Precision<ScalarValue> {
187        match (self, other) {
188            (Precision::Exact(a), Precision::Exact(b)) => {
189                a.add(b).map(Precision::Exact).unwrap_or(Precision::Absent)
190            }
191            (Precision::Inexact(a), Precision::Exact(b))
192            | (Precision::Exact(a), Precision::Inexact(b))
193            | (Precision::Inexact(a), Precision::Inexact(b)) => a
194                .add(b)
195                .map(Precision::Inexact)
196                .unwrap_or(Precision::Absent),
197            (_, _) => Precision::Absent,
198        }
199    }
200
201    /// Calculates the difference of two (possibly inexact) [`ScalarValue`] values,
202    /// conservatively propagating exactness information. If one of the input
203    /// values is [`Precision::Absent`], the result is `Absent` too.
204    pub fn sub(&self, other: &Precision<ScalarValue>) -> Precision<ScalarValue> {
205        match (self, other) {
206            (Precision::Exact(a), Precision::Exact(b)) => {
207                a.sub(b).map(Precision::Exact).unwrap_or(Precision::Absent)
208            }
209            (Precision::Inexact(a), Precision::Exact(b))
210            | (Precision::Exact(a), Precision::Inexact(b))
211            | (Precision::Inexact(a), Precision::Inexact(b)) => a
212                .sub(b)
213                .map(Precision::Inexact)
214                .unwrap_or(Precision::Absent),
215            (_, _) => Precision::Absent,
216        }
217    }
218
219    /// Calculates the multiplication of two (possibly inexact) [`ScalarValue`] values,
220    /// conservatively propagating exactness information. If one of the input
221    /// values is [`Precision::Absent`], the result is `Absent` too.
222    pub fn multiply(&self, other: &Precision<ScalarValue>) -> Precision<ScalarValue> {
223        match (self, other) {
224            (Precision::Exact(a), Precision::Exact(b)) => a
225                .mul_checked(b)
226                .map(Precision::Exact)
227                .unwrap_or(Precision::Absent),
228            (Precision::Inexact(a), Precision::Exact(b))
229            | (Precision::Exact(a), Precision::Inexact(b))
230            | (Precision::Inexact(a), Precision::Inexact(b)) => a
231                .mul_checked(b)
232                .map(Precision::Inexact)
233                .unwrap_or(Precision::Absent),
234            (_, _) => Precision::Absent,
235        }
236    }
237
238    /// Casts the value to the given data type, propagating exactness information.
239    pub fn cast_to(&self, data_type: &DataType) -> Result<Precision<ScalarValue>> {
240        match self {
241            Precision::Exact(value) => value.cast_to(data_type).map(Precision::Exact),
242            Precision::Inexact(value) => value.cast_to(data_type).map(Precision::Inexact),
243            Precision::Absent => Ok(Precision::Absent),
244        }
245    }
246}
247
248impl<T: Debug + Clone + PartialEq + Eq + PartialOrd> Debug for Precision<T> {
249    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
250        match self {
251            Precision::Exact(inner) => write!(f, "Exact({inner:?})"),
252            Precision::Inexact(inner) => write!(f, "Inexact({inner:?})"),
253            Precision::Absent => write!(f, "Absent"),
254        }
255    }
256}
257
258impl<T: Debug + Clone + PartialEq + Eq + PartialOrd> Display for Precision<T> {
259    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
260        match self {
261            Precision::Exact(inner) => write!(f, "Exact({inner:?})"),
262            Precision::Inexact(inner) => write!(f, "Inexact({inner:?})"),
263            Precision::Absent => write!(f, "Absent"),
264        }
265    }
266}
267
268impl From<Precision<usize>> for Precision<ScalarValue> {
269    fn from(value: Precision<usize>) -> Self {
270        match value {
271            Precision::Exact(v) => Precision::Exact(ScalarValue::UInt64(Some(v as u64))),
272            Precision::Inexact(v) => {
273                Precision::Inexact(ScalarValue::UInt64(Some(v as u64)))
274            }
275            Precision::Absent => Precision::Absent,
276        }
277    }
278}
279
280/// Statistics for a relation
281/// Fields are optional and can be inexact because the sources
282/// sometimes provide approximate estimates for performance reasons
283/// and the transformations output are not always predictable.
284#[derive(Debug, Clone, PartialEq, Eq)]
285pub struct Statistics {
286    /// The number of table rows.
287    pub num_rows: Precision<usize>,
288    /// Total bytes of the table rows.
289    pub total_byte_size: Precision<usize>,
290    /// Statistics on a column level.
291    ///
292    /// It must contains a [`ColumnStatistics`] for each field in the schema of
293    /// the table to which the [`Statistics`] refer.
294    pub column_statistics: Vec<ColumnStatistics>,
295}
296
297impl Default for Statistics {
298    /// Returns a new [`Statistics`] instance with all fields set to unknown
299    /// and no columns.
300    fn default() -> Self {
301        Self {
302            num_rows: Precision::Absent,
303            total_byte_size: Precision::Absent,
304            column_statistics: vec![],
305        }
306    }
307}
308
309impl Statistics {
310    /// Returns a [`Statistics`] instance for the given schema by assigning
311    /// unknown statistics to each column in the schema.
312    pub fn new_unknown(schema: &Schema) -> Self {
313        Self {
314            num_rows: Precision::Absent,
315            total_byte_size: Precision::Absent,
316            column_statistics: Statistics::unknown_column(schema),
317        }
318    }
319
320    /// Returns an unbounded `ColumnStatistics` for each field in the schema.
321    pub fn unknown_column(schema: &Schema) -> Vec<ColumnStatistics> {
322        schema
323            .fields()
324            .iter()
325            .map(|_| ColumnStatistics::new_unknown())
326            .collect()
327    }
328
329    /// Set the number of rows
330    pub fn with_num_rows(mut self, num_rows: Precision<usize>) -> Self {
331        self.num_rows = num_rows;
332        self
333    }
334
335    /// Set the total size, in bytes
336    pub fn with_total_byte_size(mut self, total_byte_size: Precision<usize>) -> Self {
337        self.total_byte_size = total_byte_size;
338        self
339    }
340
341    /// Add a column to the column statistics
342    pub fn add_column_statistics(mut self, column_stats: ColumnStatistics) -> Self {
343        self.column_statistics.push(column_stats);
344        self
345    }
346
347    /// If the exactness of a [`Statistics`] instance is lost, this function relaxes
348    /// the exactness of all information by converting them [`Precision::Inexact`].
349    pub fn to_inexact(mut self) -> Self {
350        self.num_rows = self.num_rows.to_inexact();
351        self.total_byte_size = self.total_byte_size.to_inexact();
352        self.column_statistics = self
353            .column_statistics
354            .into_iter()
355            .map(|s| s.to_inexact())
356            .collect();
357        self
358    }
359
360    /// Project the statistics to the given column indices.
361    ///
362    /// For example, if we had statistics for columns `{"a", "b", "c"}`,
363    /// projecting to `vec![2, 1]` would return statistics for columns `{"c",
364    /// "b"}`.
365    pub fn project(mut self, projection: Option<&Vec<usize>>) -> Self {
366        let Some(projection) = projection else {
367            return self;
368        };
369
370        #[allow(clippy::large_enum_variant)]
371        enum Slot {
372            /// The column is taken and put into the specified statistics location
373            Taken(usize),
374            /// The original columns is present
375            Present(ColumnStatistics),
376        }
377
378        // Convert to Vec<Slot> so we can avoid copying the statistics
379        let mut columns: Vec<_> = std::mem::take(&mut self.column_statistics)
380            .into_iter()
381            .map(Slot::Present)
382            .collect();
383
384        for idx in projection {
385            let next_idx = self.column_statistics.len();
386            let slot = std::mem::replace(
387                columns.get_mut(*idx).expect("projection out of bounds"),
388                Slot::Taken(next_idx),
389            );
390            match slot {
391                // The column was there, so just move it
392                Slot::Present(col) => self.column_statistics.push(col),
393                // The column was taken, so copy from the previous location
394                Slot::Taken(prev_idx) => self
395                    .column_statistics
396                    .push(self.column_statistics[prev_idx].clone()),
397            }
398        }
399
400        self
401    }
402
403    /// Calculates the statistics after applying `fetch` and `skip` operations.
404    ///
405    /// Here, `self` denotes per-partition statistics. Use the `n_partitions`
406    /// parameter to compute global statistics in a multi-partition setting.
407    pub fn with_fetch(
408        mut self,
409        fetch: Option<usize>,
410        skip: usize,
411        n_partitions: usize,
412    ) -> Result<Self> {
413        let fetch_val = fetch.unwrap_or(usize::MAX);
414
415        // Get the ratio of rows after / rows before on a per-partition basis
416        let num_rows_before = self.num_rows;
417
418        self.num_rows = match self {
419            Statistics {
420                num_rows: Precision::Exact(nr),
421                ..
422            }
423            | Statistics {
424                num_rows: Precision::Inexact(nr),
425                ..
426            } => {
427                // Here, the inexact case gives us an upper bound on the number of rows.
428                if nr <= skip {
429                    // All input data will be skipped:
430                    Precision::Exact(0)
431                } else if nr <= fetch_val && skip == 0 {
432                    // If the input does not reach the `fetch` globally, and `skip`
433                    // is zero (meaning the input and output are identical), return
434                    // input stats as is.
435                    // TODO: Can input stats still be used, but adjusted, when `skip`
436                    //       is non-zero?
437                    return Ok(self);
438                } else if nr - skip <= fetch_val {
439                    // After `skip` input rows are skipped, the remaining rows are
440                    // less than or equal to the `fetch` values, so `num_rows` must
441                    // equal the remaining rows.
442                    check_num_rows(
443                        (nr - skip).checked_mul(n_partitions),
444                        // We know that we have an estimate for the number of rows:
445                        self.num_rows.is_exact().unwrap(),
446                    )
447                } else {
448                    // At this point we know that we were given a `fetch` value
449                    // as the `None` case would go into the branch above. Since
450                    // the input has more rows than `fetch + skip`, the number
451                    // of rows will be the `fetch`, other statistics will have to be downgraded to inexact.
452                    check_num_rows(
453                        fetch_val.checked_mul(n_partitions),
454                        // We know that we have an estimate for the number of rows:
455                        self.num_rows.is_exact().unwrap(),
456                    )
457                }
458            }
459            Statistics {
460                num_rows: Precision::Absent,
461                ..
462            } => check_num_rows(fetch.and_then(|v| v.checked_mul(n_partitions)), false),
463        };
464        let ratio: f64 = match (num_rows_before, self.num_rows) {
465            (
466                Precision::Exact(nr_before) | Precision::Inexact(nr_before),
467                Precision::Exact(nr_after) | Precision::Inexact(nr_after),
468            ) => {
469                if nr_before == 0 {
470                    0.0
471                } else {
472                    nr_after as f64 / nr_before as f64
473                }
474            }
475            _ => 0.0,
476        };
477        self.column_statistics = self
478            .column_statistics
479            .into_iter()
480            .map(ColumnStatistics::to_inexact)
481            .collect();
482        // Adjust the total_byte_size for the ratio of rows before and after, also marking it as inexact
483        self.total_byte_size = match &self.total_byte_size {
484            Precision::Exact(n) | Precision::Inexact(n) => {
485                let adjusted = (*n as f64 * ratio) as usize;
486                Precision::Inexact(adjusted)
487            }
488            Precision::Absent => Precision::Absent,
489        };
490        Ok(self)
491    }
492
493    /// Summarize zero or more statistics into a single `Statistics` instance.
494    ///
495    /// The method assumes that all statistics are for the same schema.
496    /// If not, maybe you can call `SchemaMapper::map_column_statistics` to make them consistent.
497    ///
498    /// Returns an error if the statistics do not match the specified schemas.
499    pub fn try_merge_iter<'a, I>(items: I, schema: &Schema) -> Result<Statistics>
500    where
501        I: IntoIterator<Item = &'a Statistics>,
502    {
503        let mut items = items.into_iter();
504
505        let Some(init) = items.next() else {
506            return Ok(Statistics::new_unknown(schema));
507        };
508        items.try_fold(init.clone(), |acc: Statistics, item_stats: &Statistics| {
509            acc.try_merge(item_stats)
510        })
511    }
512
513    /// Merge this Statistics value with another Statistics value.
514    ///
515    /// Returns an error if the statistics do not match (different schemas).
516    ///
517    /// # Example
518    /// ```
519    /// # use datafusion_common::{ColumnStatistics, ScalarValue, Statistics};
520    /// # use arrow::datatypes::{Field, Schema, DataType};
521    /// # use datafusion_common::stats::Precision;
522    /// let stats1 = Statistics::default()
523    ///     .with_num_rows(Precision::Exact(1))
524    ///     .with_total_byte_size(Precision::Exact(2))
525    ///     .add_column_statistics(
526    ///         ColumnStatistics::new_unknown()
527    ///             .with_null_count(Precision::Exact(3))
528    ///             .with_min_value(Precision::Exact(ScalarValue::from(4)))
529    ///             .with_max_value(Precision::Exact(ScalarValue::from(5))),
530    ///     );
531    ///
532    /// let stats2 = Statistics::default()
533    ///     .with_num_rows(Precision::Exact(10))
534    ///     .with_total_byte_size(Precision::Inexact(20))
535    ///     .add_column_statistics(
536    ///         ColumnStatistics::new_unknown()
537    ///             // absent null count
538    ///             .with_min_value(Precision::Exact(ScalarValue::from(40)))
539    ///             .with_max_value(Precision::Exact(ScalarValue::from(50))),
540    ///     );
541    ///
542    /// let merged_stats = stats1.try_merge(&stats2).unwrap();
543    /// let expected_stats = Statistics::default()
544    ///     .with_num_rows(Precision::Exact(11))
545    ///     .with_total_byte_size(Precision::Inexact(22)) // inexact in stats2 --> inexact
546    ///     .add_column_statistics(
547    ///         ColumnStatistics::new_unknown()
548    ///             .with_null_count(Precision::Absent) // missing from stats2 --> absent
549    ///             .with_min_value(Precision::Exact(ScalarValue::from(4)))
550    ///             .with_max_value(Precision::Exact(ScalarValue::from(50))),
551    ///     );
552    ///
553    /// assert_eq!(merged_stats, expected_stats)
554    /// ```
555    pub fn try_merge(self, other: &Statistics) -> Result<Self> {
556        let Self {
557            mut num_rows,
558            mut total_byte_size,
559            mut column_statistics,
560        } = self;
561
562        // Accumulate statistics for subsequent items
563        num_rows = num_rows.add(&other.num_rows);
564        total_byte_size = total_byte_size.add(&other.total_byte_size);
565
566        if column_statistics.len() != other.column_statistics.len() {
567            return _plan_err!(
568                "Cannot merge statistics with different number of columns: {} vs {}",
569                column_statistics.len(),
570                other.column_statistics.len()
571            );
572        }
573
574        for (item_col_stats, col_stats) in other
575            .column_statistics
576            .iter()
577            .zip(column_statistics.iter_mut())
578        {
579            col_stats.null_count = col_stats.null_count.add(&item_col_stats.null_count);
580            col_stats.max_value = col_stats.max_value.max(&item_col_stats.max_value);
581            col_stats.min_value = col_stats.min_value.min(&item_col_stats.min_value);
582            col_stats.sum_value = col_stats.sum_value.add(&item_col_stats.sum_value);
583            col_stats.distinct_count = Precision::Absent;
584        }
585
586        Ok(Statistics {
587            num_rows,
588            total_byte_size,
589            column_statistics,
590        })
591    }
592}
593
594/// Creates an estimate of the number of rows in the output using the given
595/// optional value and exactness flag.
596fn check_num_rows(value: Option<usize>, is_exact: bool) -> Precision<usize> {
597    if let Some(value) = value {
598        if is_exact {
599            Precision::Exact(value)
600        } else {
601            // If the input stats are inexact, so are the output stats.
602            Precision::Inexact(value)
603        }
604    } else {
605        // If the estimate is not available (e.g. due to an overflow), we can
606        // not produce a reliable estimate.
607        Precision::Absent
608    }
609}
610
611impl Display for Statistics {
612    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
613        // string of column statistics
614        let column_stats = self
615            .column_statistics
616            .iter()
617            .enumerate()
618            .map(|(i, cs)| {
619                let s = format!("(Col[{i}]:");
620                let s = if cs.min_value != Precision::Absent {
621                    format!("{} Min={}", s, cs.min_value)
622                } else {
623                    s
624                };
625                let s = if cs.max_value != Precision::Absent {
626                    format!("{} Max={}", s, cs.max_value)
627                } else {
628                    s
629                };
630                let s = if cs.sum_value != Precision::Absent {
631                    format!("{} Sum={}", s, cs.sum_value)
632                } else {
633                    s
634                };
635                let s = if cs.null_count != Precision::Absent {
636                    format!("{} Null={}", s, cs.null_count)
637                } else {
638                    s
639                };
640                let s = if cs.distinct_count != Precision::Absent {
641                    format!("{} Distinct={}", s, cs.distinct_count)
642                } else {
643                    s
644                };
645
646                s + ")"
647            })
648            .collect::<Vec<_>>()
649            .join(",");
650
651        write!(
652            f,
653            "Rows={}, Bytes={}, [{}]",
654            self.num_rows, self.total_byte_size, column_stats
655        )?;
656
657        Ok(())
658    }
659}
660
661/// Statistics for a column within a relation
662#[derive(Clone, Debug, PartialEq, Eq, Default)]
663pub struct ColumnStatistics {
664    /// Number of null values on column
665    pub null_count: Precision<usize>,
666    /// Maximum value of column
667    pub max_value: Precision<ScalarValue>,
668    /// Minimum value of column
669    pub min_value: Precision<ScalarValue>,
670    /// Sum value of a column
671    pub sum_value: Precision<ScalarValue>,
672    /// Number of distinct values
673    pub distinct_count: Precision<usize>,
674}
675
676impl ColumnStatistics {
677    /// Column contains a single non null value (e.g constant).
678    pub fn is_singleton(&self) -> bool {
679        match (&self.min_value, &self.max_value) {
680            // Min and max values are the same and not infinity.
681            (Precision::Exact(min), Precision::Exact(max)) => {
682                !min.is_null() && !max.is_null() && (min == max)
683            }
684            (_, _) => false,
685        }
686    }
687
688    /// Returns a [`ColumnStatistics`] instance having all [`Precision::Absent`] parameters.
689    pub fn new_unknown() -> Self {
690        Self {
691            null_count: Precision::Absent,
692            max_value: Precision::Absent,
693            min_value: Precision::Absent,
694            sum_value: Precision::Absent,
695            distinct_count: Precision::Absent,
696        }
697    }
698
699    /// Set the null count
700    pub fn with_null_count(mut self, null_count: Precision<usize>) -> Self {
701        self.null_count = null_count;
702        self
703    }
704
705    /// Set the max value
706    pub fn with_max_value(mut self, max_value: Precision<ScalarValue>) -> Self {
707        self.max_value = max_value;
708        self
709    }
710
711    /// Set the min value
712    pub fn with_min_value(mut self, min_value: Precision<ScalarValue>) -> Self {
713        self.min_value = min_value;
714        self
715    }
716
717    /// Set the sum value
718    pub fn with_sum_value(mut self, sum_value: Precision<ScalarValue>) -> Self {
719        self.sum_value = sum_value;
720        self
721    }
722
723    /// Set the distinct count
724    pub fn with_distinct_count(mut self, distinct_count: Precision<usize>) -> Self {
725        self.distinct_count = distinct_count;
726        self
727    }
728
729    /// If the exactness of a [`ColumnStatistics`] instance is lost, this
730    /// function relaxes the exactness of all information by converting them
731    /// [`Precision::Inexact`].
732    pub fn to_inexact(mut self) -> Self {
733        self.null_count = self.null_count.to_inexact();
734        self.max_value = self.max_value.to_inexact();
735        self.min_value = self.min_value.to_inexact();
736        self.sum_value = self.sum_value.to_inexact();
737        self.distinct_count = self.distinct_count.to_inexact();
738        self
739    }
740}
741
742#[cfg(test)]
743mod tests {
744    use super::*;
745    use crate::assert_contains;
746    use arrow::datatypes::Field;
747    use std::sync::Arc;
748
749    #[test]
750    fn test_get_value() {
751        let exact_precision = Precision::Exact(42);
752        let inexact_precision = Precision::Inexact(23);
753        let absent_precision = Precision::<i32>::Absent;
754
755        assert_eq!(*exact_precision.get_value().unwrap(), 42);
756        assert_eq!(*inexact_precision.get_value().unwrap(), 23);
757        assert_eq!(absent_precision.get_value(), None);
758    }
759
760    #[test]
761    fn test_map() {
762        let exact_precision = Precision::Exact(42);
763        let inexact_precision = Precision::Inexact(23);
764        let absent_precision = Precision::Absent;
765
766        let squared = |x| x * x;
767
768        assert_eq!(exact_precision.map(squared), Precision::Exact(1764));
769        assert_eq!(inexact_precision.map(squared), Precision::Inexact(529));
770        assert_eq!(absent_precision.map(squared), Precision::Absent);
771    }
772
773    #[test]
774    fn test_is_exact() {
775        let exact_precision = Precision::Exact(42);
776        let inexact_precision = Precision::Inexact(23);
777        let absent_precision = Precision::<i32>::Absent;
778
779        assert_eq!(exact_precision.is_exact(), Some(true));
780        assert_eq!(inexact_precision.is_exact(), Some(false));
781        assert_eq!(absent_precision.is_exact(), None);
782    }
783
784    #[test]
785    fn test_max() {
786        let precision1 = Precision::Exact(42);
787        let precision2 = Precision::Inexact(23);
788        let precision3 = Precision::Exact(30);
789        let absent_precision = Precision::Absent;
790
791        assert_eq!(precision1.max(&precision2), Precision::Inexact(42));
792        assert_eq!(precision1.max(&precision3), Precision::Exact(42));
793        assert_eq!(precision2.max(&precision3), Precision::Inexact(30));
794        assert_eq!(precision1.max(&absent_precision), Precision::Absent);
795    }
796
797    #[test]
798    fn test_min() {
799        let precision1 = Precision::Exact(42);
800        let precision2 = Precision::Inexact(23);
801        let precision3 = Precision::Exact(30);
802        let absent_precision = Precision::Absent;
803
804        assert_eq!(precision1.min(&precision2), Precision::Inexact(23));
805        assert_eq!(precision1.min(&precision3), Precision::Exact(30));
806        assert_eq!(precision2.min(&precision3), Precision::Inexact(23));
807        assert_eq!(precision1.min(&absent_precision), Precision::Absent);
808    }
809
810    #[test]
811    fn test_to_inexact() {
812        let exact_precision = Precision::Exact(42);
813        let inexact_precision = Precision::Inexact(42);
814        let absent_precision = Precision::<i32>::Absent;
815
816        assert_eq!(exact_precision.to_inexact(), inexact_precision);
817        assert_eq!(inexact_precision.to_inexact(), inexact_precision);
818        assert_eq!(absent_precision.to_inexact(), absent_precision);
819    }
820
821    #[test]
822    fn test_add() {
823        let precision1 = Precision::Exact(42);
824        let precision2 = Precision::Inexact(23);
825        let precision3 = Precision::Exact(30);
826        let absent_precision = Precision::Absent;
827        let precision_max_exact = Precision::Exact(usize::MAX);
828        let precision_max_inexact = Precision::Exact(usize::MAX);
829
830        assert_eq!(precision1.add(&precision2), Precision::Inexact(65));
831        assert_eq!(precision1.add(&precision3), Precision::Exact(72));
832        assert_eq!(precision2.add(&precision3), Precision::Inexact(53));
833        assert_eq!(precision1.add(&absent_precision), Precision::Absent);
834        assert_eq!(
835            precision_max_exact.add(&precision1),
836            Precision::Inexact(usize::MAX)
837        );
838        assert_eq!(
839            precision_max_inexact.add(&precision1),
840            Precision::Inexact(usize::MAX)
841        );
842    }
843
844    #[test]
845    fn test_add_scalar() {
846        let precision = Precision::Exact(ScalarValue::Int32(Some(42)));
847
848        assert_eq!(
849            precision.add(&Precision::Exact(ScalarValue::Int32(Some(23)))),
850            Precision::Exact(ScalarValue::Int32(Some(65))),
851        );
852        assert_eq!(
853            precision.add(&Precision::Inexact(ScalarValue::Int32(Some(23)))),
854            Precision::Inexact(ScalarValue::Int32(Some(65))),
855        );
856        assert_eq!(
857            precision.add(&Precision::Exact(ScalarValue::Int32(None))),
858            // As per behavior of ScalarValue::add
859            Precision::Exact(ScalarValue::Int32(None)),
860        );
861        assert_eq!(precision.add(&Precision::Absent), Precision::Absent);
862    }
863
864    #[test]
865    fn test_sub() {
866        let precision1 = Precision::Exact(42);
867        let precision2 = Precision::Inexact(23);
868        let precision3 = Precision::Exact(30);
869        let absent_precision = Precision::Absent;
870
871        assert_eq!(precision1.sub(&precision2), Precision::Inexact(19));
872        assert_eq!(precision1.sub(&precision3), Precision::Exact(12));
873        assert_eq!(precision2.sub(&precision1), Precision::Inexact(0));
874        assert_eq!(precision3.sub(&precision1), Precision::Inexact(0));
875        assert_eq!(precision1.sub(&absent_precision), Precision::Absent);
876    }
877
878    #[test]
879    fn test_sub_scalar() {
880        let precision = Precision::Exact(ScalarValue::Int32(Some(42)));
881
882        assert_eq!(
883            precision.sub(&Precision::Exact(ScalarValue::Int32(Some(23)))),
884            Precision::Exact(ScalarValue::Int32(Some(19))),
885        );
886        assert_eq!(
887            precision.sub(&Precision::Inexact(ScalarValue::Int32(Some(23)))),
888            Precision::Inexact(ScalarValue::Int32(Some(19))),
889        );
890        assert_eq!(
891            precision.sub(&Precision::Exact(ScalarValue::Int32(None))),
892            // As per behavior of ScalarValue::sub
893            Precision::Exact(ScalarValue::Int32(None)),
894        );
895        assert_eq!(precision.sub(&Precision::Absent), Precision::Absent);
896    }
897
898    #[test]
899    fn test_multiply() {
900        let precision1 = Precision::Exact(6);
901        let precision2 = Precision::Inexact(3);
902        let precision3 = Precision::Exact(5);
903        let precision_max_exact = Precision::Exact(usize::MAX);
904        let precision_max_inexact = Precision::Exact(usize::MAX);
905        let absent_precision = Precision::Absent;
906
907        assert_eq!(precision1.multiply(&precision2), Precision::Inexact(18));
908        assert_eq!(precision1.multiply(&precision3), Precision::Exact(30));
909        assert_eq!(precision2.multiply(&precision3), Precision::Inexact(15));
910        assert_eq!(precision1.multiply(&absent_precision), Precision::Absent);
911        assert_eq!(
912            precision_max_exact.multiply(&precision1),
913            Precision::Inexact(usize::MAX)
914        );
915        assert_eq!(
916            precision_max_inexact.multiply(&precision1),
917            Precision::Inexact(usize::MAX)
918        );
919    }
920
921    #[test]
922    fn test_multiply_scalar() {
923        let precision = Precision::Exact(ScalarValue::Int32(Some(6)));
924
925        assert_eq!(
926            precision.multiply(&Precision::Exact(ScalarValue::Int32(Some(5)))),
927            Precision::Exact(ScalarValue::Int32(Some(30))),
928        );
929        assert_eq!(
930            precision.multiply(&Precision::Inexact(ScalarValue::Int32(Some(5)))),
931            Precision::Inexact(ScalarValue::Int32(Some(30))),
932        );
933        assert_eq!(
934            precision.multiply(&Precision::Exact(ScalarValue::Int32(None))),
935            // As per behavior of ScalarValue::mul_checked
936            Precision::Exact(ScalarValue::Int32(None)),
937        );
938        assert_eq!(precision.multiply(&Precision::Absent), Precision::Absent);
939    }
940
941    #[test]
942    fn test_cast_to() {
943        // Valid
944        assert_eq!(
945            Precision::Exact(ScalarValue::Int32(Some(42)))
946                .cast_to(&DataType::Int64)
947                .unwrap(),
948            Precision::Exact(ScalarValue::Int64(Some(42))),
949        );
950        assert_eq!(
951            Precision::Inexact(ScalarValue::Int32(Some(42)))
952                .cast_to(&DataType::Int64)
953                .unwrap(),
954            Precision::Inexact(ScalarValue::Int64(Some(42))),
955        );
956        // Null
957        assert_eq!(
958            Precision::Exact(ScalarValue::Int32(None))
959                .cast_to(&DataType::Int64)
960                .unwrap(),
961            Precision::Exact(ScalarValue::Int64(None)),
962        );
963        // Overflow returns error
964        assert!(Precision::Exact(ScalarValue::Int32(Some(256)))
965            .cast_to(&DataType::Int8)
966            .is_err());
967    }
968
969    #[test]
970    fn test_precision_cloning() {
971        // Precision<usize> is copy
972        let precision: Precision<usize> = Precision::Exact(42);
973        let p2 = precision;
974        assert_eq!(precision, p2);
975
976        // Precision<ScalarValue> is not copy (requires .clone())
977        let precision: Precision<ScalarValue> =
978            Precision::Exact(ScalarValue::Int64(Some(42)));
979        // Clippy would complain about this if it were Copy
980        #[allow(clippy::redundant_clone)]
981        let p2 = precision.clone();
982        assert_eq!(precision, p2);
983    }
984
985    #[test]
986    fn test_project_none() {
987        let projection = None;
988        let stats = make_stats(vec![10, 20, 30]).project(projection.as_ref());
989        assert_eq!(stats, make_stats(vec![10, 20, 30]));
990    }
991
992    #[test]
993    fn test_project_empty() {
994        let projection = Some(vec![]);
995        let stats = make_stats(vec![10, 20, 30]).project(projection.as_ref());
996        assert_eq!(stats, make_stats(vec![]));
997    }
998
999    #[test]
1000    fn test_project_swap() {
1001        let projection = Some(vec![2, 1]);
1002        let stats = make_stats(vec![10, 20, 30]).project(projection.as_ref());
1003        assert_eq!(stats, make_stats(vec![30, 20]));
1004    }
1005
1006    #[test]
1007    fn test_project_repeated() {
1008        let projection = Some(vec![1, 2, 1, 1, 0, 2]);
1009        let stats = make_stats(vec![10, 20, 30]).project(projection.as_ref());
1010        assert_eq!(stats, make_stats(vec![20, 30, 20, 20, 10, 30]));
1011    }
1012
1013    // Make a Statistics structure with the specified null counts for each column
1014    fn make_stats(counts: impl IntoIterator<Item = usize>) -> Statistics {
1015        Statistics {
1016            num_rows: Precision::Exact(42),
1017            total_byte_size: Precision::Exact(500),
1018            column_statistics: counts.into_iter().map(col_stats_i64).collect(),
1019        }
1020    }
1021
1022    fn col_stats_i64(null_count: usize) -> ColumnStatistics {
1023        ColumnStatistics {
1024            null_count: Precision::Exact(null_count),
1025            max_value: Precision::Exact(ScalarValue::Int64(Some(42))),
1026            min_value: Precision::Exact(ScalarValue::Int64(Some(64))),
1027            sum_value: Precision::Exact(ScalarValue::Int64(Some(4600))),
1028            distinct_count: Precision::Exact(100),
1029        }
1030    }
1031
1032    #[test]
1033    fn test_try_merge_basic() {
1034        // Create a schema with two columns
1035        let schema = Arc::new(Schema::new(vec![
1036            Field::new("col1", DataType::Int32, false),
1037            Field::new("col2", DataType::Int32, false),
1038        ]));
1039
1040        // Create items with statistics
1041        let stats1 = Statistics {
1042            num_rows: Precision::Exact(10),
1043            total_byte_size: Precision::Exact(100),
1044            column_statistics: vec![
1045                ColumnStatistics {
1046                    null_count: Precision::Exact(1),
1047                    max_value: Precision::Exact(ScalarValue::Int32(Some(100))),
1048                    min_value: Precision::Exact(ScalarValue::Int32(Some(1))),
1049                    sum_value: Precision::Exact(ScalarValue::Int32(Some(500))),
1050                    distinct_count: Precision::Absent,
1051                },
1052                ColumnStatistics {
1053                    null_count: Precision::Exact(2),
1054                    max_value: Precision::Exact(ScalarValue::Int32(Some(200))),
1055                    min_value: Precision::Exact(ScalarValue::Int32(Some(10))),
1056                    sum_value: Precision::Exact(ScalarValue::Int32(Some(1000))),
1057                    distinct_count: Precision::Absent,
1058                },
1059            ],
1060        };
1061
1062        let stats2 = Statistics {
1063            num_rows: Precision::Exact(15),
1064            total_byte_size: Precision::Exact(150),
1065            column_statistics: vec![
1066                ColumnStatistics {
1067                    null_count: Precision::Exact(2),
1068                    max_value: Precision::Exact(ScalarValue::Int32(Some(120))),
1069                    min_value: Precision::Exact(ScalarValue::Int32(Some(-10))),
1070                    sum_value: Precision::Exact(ScalarValue::Int32(Some(600))),
1071                    distinct_count: Precision::Absent,
1072                },
1073                ColumnStatistics {
1074                    null_count: Precision::Exact(3),
1075                    max_value: Precision::Exact(ScalarValue::Int32(Some(180))),
1076                    min_value: Precision::Exact(ScalarValue::Int32(Some(5))),
1077                    sum_value: Precision::Exact(ScalarValue::Int32(Some(1200))),
1078                    distinct_count: Precision::Absent,
1079                },
1080            ],
1081        };
1082
1083        let items = vec![stats1, stats2];
1084
1085        let summary_stats = Statistics::try_merge_iter(&items, &schema).unwrap();
1086
1087        // Verify the results
1088        assert_eq!(summary_stats.num_rows, Precision::Exact(25)); // 10 + 15
1089        assert_eq!(summary_stats.total_byte_size, Precision::Exact(250)); // 100 + 150
1090
1091        // Verify column statistics
1092        let col1_stats = &summary_stats.column_statistics[0];
1093        assert_eq!(col1_stats.null_count, Precision::Exact(3)); // 1 + 2
1094        assert_eq!(
1095            col1_stats.max_value,
1096            Precision::Exact(ScalarValue::Int32(Some(120)))
1097        );
1098        assert_eq!(
1099            col1_stats.min_value,
1100            Precision::Exact(ScalarValue::Int32(Some(-10)))
1101        );
1102        assert_eq!(
1103            col1_stats.sum_value,
1104            Precision::Exact(ScalarValue::Int32(Some(1100)))
1105        ); // 500 + 600
1106
1107        let col2_stats = &summary_stats.column_statistics[1];
1108        assert_eq!(col2_stats.null_count, Precision::Exact(5)); // 2 + 3
1109        assert_eq!(
1110            col2_stats.max_value,
1111            Precision::Exact(ScalarValue::Int32(Some(200)))
1112        );
1113        assert_eq!(
1114            col2_stats.min_value,
1115            Precision::Exact(ScalarValue::Int32(Some(5)))
1116        );
1117        assert_eq!(
1118            col2_stats.sum_value,
1119            Precision::Exact(ScalarValue::Int32(Some(2200)))
1120        ); // 1000 + 1200
1121    }
1122
1123    #[test]
1124    fn test_try_merge_mixed_precision() {
1125        // Create a schema with one column
1126        let schema = Arc::new(Schema::new(vec![Field::new(
1127            "col1",
1128            DataType::Int32,
1129            false,
1130        )]));
1131
1132        // Create items with different precision levels
1133        let stats1 = Statistics {
1134            num_rows: Precision::Exact(10),
1135            total_byte_size: Precision::Inexact(100),
1136            column_statistics: vec![ColumnStatistics {
1137                null_count: Precision::Exact(1),
1138                max_value: Precision::Exact(ScalarValue::Int32(Some(100))),
1139                min_value: Precision::Inexact(ScalarValue::Int32(Some(1))),
1140                sum_value: Precision::Exact(ScalarValue::Int32(Some(500))),
1141                distinct_count: Precision::Absent,
1142            }],
1143        };
1144
1145        let stats2 = Statistics {
1146            num_rows: Precision::Inexact(15),
1147            total_byte_size: Precision::Exact(150),
1148            column_statistics: vec![ColumnStatistics {
1149                null_count: Precision::Inexact(2),
1150                max_value: Precision::Inexact(ScalarValue::Int32(Some(120))),
1151                min_value: Precision::Exact(ScalarValue::Int32(Some(-10))),
1152                sum_value: Precision::Absent,
1153                distinct_count: Precision::Absent,
1154            }],
1155        };
1156
1157        let items = vec![stats1, stats2];
1158
1159        let summary_stats = Statistics::try_merge_iter(&items, &schema).unwrap();
1160
1161        assert_eq!(summary_stats.num_rows, Precision::Inexact(25));
1162        assert_eq!(summary_stats.total_byte_size, Precision::Inexact(250));
1163
1164        let col_stats = &summary_stats.column_statistics[0];
1165        assert_eq!(col_stats.null_count, Precision::Inexact(3));
1166        assert_eq!(
1167            col_stats.max_value,
1168            Precision::Inexact(ScalarValue::Int32(Some(120)))
1169        );
1170        assert_eq!(
1171            col_stats.min_value,
1172            Precision::Inexact(ScalarValue::Int32(Some(-10)))
1173        );
1174        assert!(matches!(col_stats.sum_value, Precision::Absent));
1175    }
1176
1177    #[test]
1178    fn test_try_merge_empty() {
1179        let schema = Arc::new(Schema::new(vec![Field::new(
1180            "col1",
1181            DataType::Int32,
1182            false,
1183        )]));
1184
1185        // Empty collection
1186        let items: Vec<Statistics> = vec![];
1187
1188        let summary_stats = Statistics::try_merge_iter(&items, &schema).unwrap();
1189
1190        // Verify default values for empty collection
1191        assert_eq!(summary_stats.num_rows, Precision::Absent);
1192        assert_eq!(summary_stats.total_byte_size, Precision::Absent);
1193        assert_eq!(summary_stats.column_statistics.len(), 1);
1194        assert_eq!(
1195            summary_stats.column_statistics[0].null_count,
1196            Precision::Absent
1197        );
1198    }
1199
1200    #[test]
1201    fn test_try_merge_mismatched_size() {
1202        // Create a schema with one column
1203        let schema = Arc::new(Schema::new(vec![Field::new(
1204            "col1",
1205            DataType::Int32,
1206            false,
1207        )]));
1208
1209        // No column statistics
1210        let stats1 = Statistics::default();
1211
1212        let stats2 =
1213            Statistics::default().add_column_statistics(ColumnStatistics::new_unknown());
1214
1215        let items = vec![stats1, stats2];
1216
1217        let e = Statistics::try_merge_iter(&items, &schema).unwrap_err();
1218        assert_contains!(e.to_string(), "Error during planning: Cannot merge statistics with different number of columns: 0 vs 1");
1219    }
1220
1221    #[test]
1222    fn test_try_merge_distinct_count_absent() {
1223        // Create statistics with known distinct counts
1224        let stats1 = Statistics::default()
1225            .with_num_rows(Precision::Exact(10))
1226            .with_total_byte_size(Precision::Exact(100))
1227            .add_column_statistics(
1228                ColumnStatistics::new_unknown()
1229                    .with_null_count(Precision::Exact(0))
1230                    .with_min_value(Precision::Exact(ScalarValue::Int32(Some(1))))
1231                    .with_max_value(Precision::Exact(ScalarValue::Int32(Some(10))))
1232                    .with_distinct_count(Precision::Exact(5)),
1233            );
1234
1235        let stats2 = Statistics::default()
1236            .with_num_rows(Precision::Exact(15))
1237            .with_total_byte_size(Precision::Exact(150))
1238            .add_column_statistics(
1239                ColumnStatistics::new_unknown()
1240                    .with_null_count(Precision::Exact(0))
1241                    .with_min_value(Precision::Exact(ScalarValue::Int32(Some(5))))
1242                    .with_max_value(Precision::Exact(ScalarValue::Int32(Some(20))))
1243                    .with_distinct_count(Precision::Exact(7)),
1244            );
1245
1246        // Merge statistics
1247        let merged_stats = stats1.try_merge(&stats2).unwrap();
1248
1249        // Verify the results
1250        assert_eq!(merged_stats.num_rows, Precision::Exact(25));
1251        assert_eq!(merged_stats.total_byte_size, Precision::Exact(250));
1252
1253        let col_stats = &merged_stats.column_statistics[0];
1254        assert_eq!(col_stats.null_count, Precision::Exact(0));
1255        assert_eq!(
1256            col_stats.min_value,
1257            Precision::Exact(ScalarValue::Int32(Some(1)))
1258        );
1259        assert_eq!(
1260            col_stats.max_value,
1261            Precision::Exact(ScalarValue::Int32(Some(20)))
1262        );
1263        // Distinct count should be Absent after merge
1264        assert_eq!(col_stats.distinct_count, Precision::Absent);
1265    }
1266
1267    #[test]
1268    fn test_with_fetch_basic_preservation() {
1269        // Test that column statistics and byte size are preserved (as inexact) when applying fetch
1270        let original_stats = Statistics {
1271            num_rows: Precision::Exact(1000),
1272            total_byte_size: Precision::Exact(8000),
1273            column_statistics: vec![
1274                ColumnStatistics {
1275                    null_count: Precision::Exact(10),
1276                    max_value: Precision::Exact(ScalarValue::Int32(Some(100))),
1277                    min_value: Precision::Exact(ScalarValue::Int32(Some(0))),
1278                    sum_value: Precision::Exact(ScalarValue::Int32(Some(5050))),
1279                    distinct_count: Precision::Exact(50),
1280                },
1281                ColumnStatistics {
1282                    null_count: Precision::Exact(20),
1283                    max_value: Precision::Exact(ScalarValue::Int64(Some(200))),
1284                    min_value: Precision::Exact(ScalarValue::Int64(Some(10))),
1285                    sum_value: Precision::Exact(ScalarValue::Int64(Some(10100))),
1286                    distinct_count: Precision::Exact(75),
1287                },
1288            ],
1289        };
1290
1291        // Apply fetch of 100 rows (10% of original)
1292        let result = original_stats.clone().with_fetch(Some(100), 0, 1).unwrap();
1293
1294        // Check num_rows
1295        assert_eq!(result.num_rows, Precision::Exact(100));
1296
1297        // Check total_byte_size is scaled proportionally and marked as inexact
1298        // 100/1000 = 0.1, so 8000 * 0.1 = 800
1299        assert_eq!(result.total_byte_size, Precision::Inexact(800));
1300
1301        // Check column statistics are preserved but marked as inexact
1302        assert_eq!(result.column_statistics.len(), 2);
1303
1304        // First column
1305        assert_eq!(
1306            result.column_statistics[0].null_count,
1307            Precision::Inexact(10)
1308        );
1309        assert_eq!(
1310            result.column_statistics[0].max_value,
1311            Precision::Inexact(ScalarValue::Int32(Some(100)))
1312        );
1313        assert_eq!(
1314            result.column_statistics[0].min_value,
1315            Precision::Inexact(ScalarValue::Int32(Some(0)))
1316        );
1317        assert_eq!(
1318            result.column_statistics[0].sum_value,
1319            Precision::Inexact(ScalarValue::Int32(Some(5050)))
1320        );
1321        assert_eq!(
1322            result.column_statistics[0].distinct_count,
1323            Precision::Inexact(50)
1324        );
1325
1326        // Second column
1327        assert_eq!(
1328            result.column_statistics[1].null_count,
1329            Precision::Inexact(20)
1330        );
1331        assert_eq!(
1332            result.column_statistics[1].max_value,
1333            Precision::Inexact(ScalarValue::Int64(Some(200)))
1334        );
1335        assert_eq!(
1336            result.column_statistics[1].min_value,
1337            Precision::Inexact(ScalarValue::Int64(Some(10)))
1338        );
1339        assert_eq!(
1340            result.column_statistics[1].sum_value,
1341            Precision::Inexact(ScalarValue::Int64(Some(10100)))
1342        );
1343        assert_eq!(
1344            result.column_statistics[1].distinct_count,
1345            Precision::Inexact(75)
1346        );
1347    }
1348
1349    #[test]
1350    fn test_with_fetch_inexact_input() {
1351        // Test that inexact input statistics remain inexact
1352        let original_stats = Statistics {
1353            num_rows: Precision::Inexact(1000),
1354            total_byte_size: Precision::Inexact(8000),
1355            column_statistics: vec![ColumnStatistics {
1356                null_count: Precision::Inexact(10),
1357                max_value: Precision::Inexact(ScalarValue::Int32(Some(100))),
1358                min_value: Precision::Inexact(ScalarValue::Int32(Some(0))),
1359                sum_value: Precision::Inexact(ScalarValue::Int32(Some(5050))),
1360                distinct_count: Precision::Inexact(50),
1361            }],
1362        };
1363
1364        let result = original_stats.clone().with_fetch(Some(500), 0, 1).unwrap();
1365
1366        // Check num_rows is inexact
1367        assert_eq!(result.num_rows, Precision::Inexact(500));
1368
1369        // Check total_byte_size is scaled and inexact
1370        // 500/1000 = 0.5, so 8000 * 0.5 = 4000
1371        assert_eq!(result.total_byte_size, Precision::Inexact(4000));
1372
1373        // Column stats remain inexact
1374        assert_eq!(
1375            result.column_statistics[0].null_count,
1376            Precision::Inexact(10)
1377        );
1378    }
1379
1380    #[test]
1381    fn test_with_fetch_skip_all_rows() {
1382        // Test when skip >= num_rows (all rows are skipped)
1383        let original_stats = Statistics {
1384            num_rows: Precision::Exact(100),
1385            total_byte_size: Precision::Exact(800),
1386            column_statistics: vec![col_stats_i64(10)],
1387        };
1388
1389        let result = original_stats.clone().with_fetch(Some(50), 100, 1).unwrap();
1390
1391        assert_eq!(result.num_rows, Precision::Exact(0));
1392        // When ratio is 0/100 = 0, byte size should be 0
1393        assert_eq!(result.total_byte_size, Precision::Inexact(0));
1394    }
1395
1396    #[test]
1397    fn test_with_fetch_no_limit() {
1398        // Test when fetch is None and skip is 0 (no limit applied)
1399        let original_stats = Statistics {
1400            num_rows: Precision::Exact(100),
1401            total_byte_size: Precision::Exact(800),
1402            column_statistics: vec![col_stats_i64(10)],
1403        };
1404
1405        let result = original_stats.clone().with_fetch(None, 0, 1).unwrap();
1406
1407        // Stats should be unchanged when no fetch and no skip
1408        assert_eq!(result.num_rows, Precision::Exact(100));
1409        assert_eq!(result.total_byte_size, Precision::Exact(800));
1410    }
1411
1412    #[test]
1413    fn test_with_fetch_with_skip() {
1414        // Test with both skip and fetch
1415        let original_stats = Statistics {
1416            num_rows: Precision::Exact(1000),
1417            total_byte_size: Precision::Exact(8000),
1418            column_statistics: vec![col_stats_i64(10)],
1419        };
1420
1421        // Skip 200, fetch 300, so we get rows 200-500
1422        let result = original_stats
1423            .clone()
1424            .with_fetch(Some(300), 200, 1)
1425            .unwrap();
1426
1427        assert_eq!(result.num_rows, Precision::Exact(300));
1428        // 300/1000 = 0.3, so 8000 * 0.3 = 2400
1429        assert_eq!(result.total_byte_size, Precision::Inexact(2400));
1430    }
1431
1432    #[test]
1433    fn test_with_fetch_multi_partition() {
1434        // Test with multiple partitions
1435        let original_stats = Statistics {
1436            num_rows: Precision::Exact(1000), // per partition
1437            total_byte_size: Precision::Exact(8000),
1438            column_statistics: vec![col_stats_i64(10)],
1439        };
1440
1441        // Fetch 100 per partition, 4 partitions = 400 total
1442        let result = original_stats.clone().with_fetch(Some(100), 0, 4).unwrap();
1443
1444        assert_eq!(result.num_rows, Precision::Exact(400));
1445        // 400/1000 = 0.4, so 8000 * 0.4 = 3200
1446        assert_eq!(result.total_byte_size, Precision::Inexact(3200));
1447    }
1448
1449    #[test]
1450    fn test_with_fetch_absent_stats() {
1451        // Test with absent statistics
1452        let original_stats = Statistics {
1453            num_rows: Precision::Absent,
1454            total_byte_size: Precision::Absent,
1455            column_statistics: vec![ColumnStatistics {
1456                null_count: Precision::Absent,
1457                max_value: Precision::Absent,
1458                min_value: Precision::Absent,
1459                sum_value: Precision::Absent,
1460                distinct_count: Precision::Absent,
1461            }],
1462        };
1463
1464        let result = original_stats.clone().with_fetch(Some(100), 0, 1).unwrap();
1465
1466        // With absent input stats, output should be inexact estimate
1467        assert_eq!(result.num_rows, Precision::Inexact(100));
1468        assert_eq!(result.total_byte_size, Precision::Absent);
1469        // Column stats should remain absent
1470        assert_eq!(result.column_statistics[0].null_count, Precision::Absent);
1471    }
1472
1473    #[test]
1474    fn test_with_fetch_fetch_exceeds_rows() {
1475        // Test when fetch is larger than available rows after skip
1476        let original_stats = Statistics {
1477            num_rows: Precision::Exact(100),
1478            total_byte_size: Precision::Exact(800),
1479            column_statistics: vec![col_stats_i64(10)],
1480        };
1481
1482        // Skip 50, fetch 100, but only 50 rows remain
1483        let result = original_stats.clone().with_fetch(Some(100), 50, 1).unwrap();
1484
1485        assert_eq!(result.num_rows, Precision::Exact(50));
1486        // 50/100 = 0.5, so 800 * 0.5 = 400
1487        assert_eq!(result.total_byte_size, Precision::Inexact(400));
1488    }
1489
1490    #[test]
1491    fn test_with_fetch_preserves_all_column_stats() {
1492        // Comprehensive test that all column statistic fields are preserved
1493        let original_col_stats = ColumnStatistics {
1494            null_count: Precision::Exact(42),
1495            max_value: Precision::Exact(ScalarValue::Int32(Some(999))),
1496            min_value: Precision::Exact(ScalarValue::Int32(Some(-100))),
1497            sum_value: Precision::Exact(ScalarValue::Int32(Some(123456))),
1498            distinct_count: Precision::Exact(789),
1499        };
1500
1501        let original_stats = Statistics {
1502            num_rows: Precision::Exact(1000),
1503            total_byte_size: Precision::Exact(8000),
1504            column_statistics: vec![original_col_stats.clone()],
1505        };
1506
1507        let result = original_stats.with_fetch(Some(250), 0, 1).unwrap();
1508
1509        let result_col_stats = &result.column_statistics[0];
1510
1511        // All values should be preserved but marked as inexact
1512        assert_eq!(result_col_stats.null_count, Precision::Inexact(42));
1513        assert_eq!(
1514            result_col_stats.max_value,
1515            Precision::Inexact(ScalarValue::Int32(Some(999)))
1516        );
1517        assert_eq!(
1518            result_col_stats.min_value,
1519            Precision::Inexact(ScalarValue::Int32(Some(-100)))
1520        );
1521        assert_eq!(
1522            result_col_stats.sum_value,
1523            Precision::Inexact(ScalarValue::Int32(Some(123456)))
1524        );
1525        assert_eq!(result_col_stats.distinct_count, Precision::Inexact(789));
1526    }
1527}