Skip to main content

datafusion_common/
stats.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! This module provides data structures to represent statistics
19
20use std::fmt::{self, Debug, Display};
21
22use crate::{Result, ScalarValue};
23
24use crate::error::_plan_err;
25use crate::utils::aggregate::precision_add;
26use arrow::datatypes::{DataType, Schema};
27
/// A statistical value paired with how much it can be trusted. `Precision`
/// is used to propagate information about the precision of statistical
/// values as they are combined and transformed.
#[derive(Clone, PartialEq, Eq, Default, Copy)]
pub enum Precision<T: Debug + Clone + PartialEq + Eq + PartialOrd> {
    /// The value is known exactly and may be relied on for correctness.
    ///
    /// Typical definitive sources:
    /// - Parquet file metadata (row counts, byte sizes)
    /// - In-memory RecordBatch data (actual row counts, byte sizes, null counts)
    /// - and more...
    Exact(T),
    /// The true value is unknown but is believed to be close to this one.
    /// Intended for cost-based optimizations.
    ///
    /// Operations that typically produce `Inexact(T)`:
    /// - Applying a filter (selectivity is unknown)
    /// - Mixing exact and inexact values in arithmetic
    /// - and more...
    Inexact(T),
    /// No information is available. This is the default state.
    ///
    /// `Absent` absorbs everything in arithmetic: any operation with an
    /// `Absent` operand yields `Absent`. [`Precision::to_inexact`] leaves
    /// `Absent` untouched rather than turning it into `Inexact`, because
    /// "unknown" is a fundamentally different state from "approximate".
    ///
    /// Common sources include:
    /// - Data sources without statistics
    /// - Parquet columns missing from file metadata
    /// - Statistics that cannot be derived for an operation (e.g.,
    ///   `distinct_count` after a union, `total_byte_size` for joins)
    #[default]
    Absent,
}

impl<T: Debug + Clone + PartialEq + Eq + PartialOrd> Precision<T> {
    /// Borrows the wrapped value when there is one (exact or inexact);
    /// returns `None` for [`Precision::Absent`].
    pub fn get_value(&self) -> Option<&T> {
        match self {
            Precision::Absent => None,
            Precision::Exact(inner) | Precision::Inexact(inner) => Some(inner),
        }
    }

    /// Applies `f` to the wrapped value, if any, keeping the exactness state
    /// (`Exact` stays `Exact`, `Inexact` stays `Inexact`, `Absent` stays
    /// `Absent`).
    pub fn map<U, F>(self, f: F) -> Precision<U>
    where
        F: Fn(T) -> U,
        U: Debug + Clone + PartialEq + Eq + PartialOrd,
    {
        match self {
            Precision::Absent => Precision::Absent,
            Precision::Exact(inner) => Precision::Exact(f(inner)),
            Precision::Inexact(inner) => Precision::Inexact(f(inner)),
        }
    }

    /// Reports the exactness of the value: `Some(true)` for an exact value,
    /// `Some(false)` for an inexact one, and `None` when no value is present.
    pub fn is_exact(&self) -> Option<bool> {
        self.get_value()
            .map(|_| matches!(self, Precision::Exact(_)))
    }

    /// Returns the larger of two (possibly inexact) values, conservatively
    /// propagating exactness: the result is `Exact` only when both inputs
    /// are. If either input is [`Precision::Absent`], so is the result.
    pub fn max(&self, other: &Precision<T>) -> Precision<T> {
        let (Some(lhs), Some(rhs)) = (self.get_value(), other.get_value()) else {
            return Precision::Absent;
        };
        let larger = if lhs >= rhs { lhs.clone() } else { rhs.clone() };
        if matches!((self, other), (Precision::Exact(_), Precision::Exact(_))) {
            Precision::Exact(larger)
        } else {
            Precision::Inexact(larger)
        }
    }

    /// Returns the smaller of two (possibly inexact) values, conservatively
    /// propagating exactness: the result is `Exact` only when both inputs
    /// are. If either input is [`Precision::Absent`], so is the result.
    pub fn min(&self, other: &Precision<T>) -> Precision<T> {
        let (Some(lhs), Some(rhs)) = (self.get_value(), other.get_value()) else {
            return Precision::Absent;
        };
        let smaller = if lhs >= rhs { rhs.clone() } else { lhs.clone() };
        if matches!((self, other), (Precision::Exact(_), Precision::Exact(_))) {
            Precision::Exact(smaller)
        } else {
            Precision::Inexact(smaller)
        }
    }

    /// Downgrades an exact value to an inexact one; inexact and absent
    /// values are returned unchanged.
    pub fn to_inexact(self) -> Self {
        match self {
            Precision::Exact(inner) => Precision::Inexact(inner),
            unchanged => unchanged,
        }
    }
}
139
140impl Precision<usize> {
141    /// Calculates the sum of two (possibly inexact) [`usize`] values,
142    /// conservatively propagating exactness information. If one of the input
143    /// values is [`Precision::Absent`], the result is `Absent` too.
144    pub fn add(&self, other: &Precision<usize>) -> Precision<usize> {
145        match (self, other) {
146            (Precision::Exact(a), Precision::Exact(b)) => a.checked_add(*b).map_or_else(
147                || Precision::Inexact(a.saturating_add(*b)),
148                Precision::Exact,
149            ),
150            (Precision::Inexact(a), Precision::Exact(b))
151            | (Precision::Exact(a), Precision::Inexact(b))
152            | (Precision::Inexact(a), Precision::Inexact(b)) => {
153                Precision::Inexact(a.saturating_add(*b))
154            }
155            (_, _) => Precision::Absent,
156        }
157    }
158
159    /// Calculates the difference of two (possibly inexact) [`usize`] values,
160    /// conservatively propagating exactness information. If one of the input
161    /// values is [`Precision::Absent`], the result is `Absent` too.
162    pub fn sub(&self, other: &Precision<usize>) -> Precision<usize> {
163        match (self, other) {
164            (Precision::Exact(a), Precision::Exact(b)) => a.checked_sub(*b).map_or_else(
165                || Precision::Inexact(a.saturating_sub(*b)),
166                Precision::Exact,
167            ),
168            (Precision::Inexact(a), Precision::Exact(b))
169            | (Precision::Exact(a), Precision::Inexact(b))
170            | (Precision::Inexact(a), Precision::Inexact(b)) => {
171                Precision::Inexact(a.saturating_sub(*b))
172            }
173            (_, _) => Precision::Absent,
174        }
175    }
176
177    /// Calculates the multiplication of two (possibly inexact) [`usize`] values,
178    /// conservatively propagating exactness information. If one of the input
179    /// values is [`Precision::Absent`], the result is `Absent` too.
180    pub fn multiply(&self, other: &Precision<usize>) -> Precision<usize> {
181        match (self, other) {
182            (Precision::Exact(a), Precision::Exact(b)) => a.checked_mul(*b).map_or_else(
183                || Precision::Inexact(a.saturating_mul(*b)),
184                Precision::Exact,
185            ),
186            (Precision::Inexact(a), Precision::Exact(b))
187            | (Precision::Exact(a), Precision::Inexact(b))
188            | (Precision::Inexact(a), Precision::Inexact(b)) => {
189                Precision::Inexact(a.saturating_mul(*b))
190            }
191            (_, _) => Precision::Absent,
192        }
193    }
194
195    /// Return the estimate of applying a filter with estimated selectivity
196    /// `selectivity` to this Precision. A selectivity of `1.0` means that all
197    /// rows are selected. A selectivity of `0.5` means half the rows are
198    /// selected. Will always return inexact statistics.
199    pub fn with_estimated_selectivity(self, selectivity: f64) -> Self {
200        self.map(|v| ((v as f64 * selectivity).ceil()) as usize)
201            .to_inexact()
202    }
203}
204
205impl Precision<ScalarValue> {
206    fn sum_data_type(data_type: &DataType) -> DataType {
207        match data_type {
208            DataType::Int8 | DataType::Int16 | DataType::Int32 => DataType::Int64,
209            DataType::UInt8 | DataType::UInt16 | DataType::UInt32 => DataType::UInt64,
210            _ => data_type.clone(),
211        }
212    }
213
214    fn cast_scalar_to_sum_type(value: &ScalarValue) -> Result<ScalarValue> {
215        let source_type = value.data_type();
216        let target_type = Self::sum_data_type(&source_type);
217        if source_type == target_type {
218            Ok(value.clone())
219        } else {
220            value.cast_to(&target_type)
221        }
222    }
223
224    /// Calculates the sum of two (possibly inexact) [`ScalarValue`] values,
225    /// conservatively propagating exactness information. If one of the input
226    /// values is [`Precision::Absent`], the result is `Absent` too.
227    ///
228    /// Uses [`ScalarValue::add_checked`] so that integer overflow returns
229    /// an error (mapped to `Absent`) instead of silently wrapping.
230    ///
231    /// For performance-sensitive paths prefer `precision_add` which
232    /// avoids the Arrow array round-trip.
233    pub fn add(&self, other: &Precision<ScalarValue>) -> Precision<ScalarValue> {
234        match (self, other) {
235            (Precision::Exact(a), Precision::Exact(b)) => a
236                .add_checked(b)
237                .map(Precision::Exact)
238                .unwrap_or(Precision::Absent),
239            (Precision::Inexact(a), Precision::Exact(b))
240            | (Precision::Exact(a), Precision::Inexact(b))
241            | (Precision::Inexact(a), Precision::Inexact(b)) => a
242                .add_checked(b)
243                .map(Precision::Inexact)
244                .unwrap_or(Precision::Absent),
245            (_, _) => Precision::Absent,
246        }
247    }
248
249    /// Casts integer values to the wider SQL `SUM` return type.
250    ///
251    /// This narrows overflow risk when `sum_value` statistics are merged:
252    /// `Int8/Int16/Int32 -> Int64` and `UInt8/UInt16/UInt32 -> UInt64`.
253    pub fn cast_to_sum_type(&self) -> Precision<ScalarValue> {
254        match (self.is_exact(), self.get_value()) {
255            (Some(true), Some(value)) => Self::cast_scalar_to_sum_type(value)
256                .map(Precision::Exact)
257                .unwrap_or(Precision::Absent),
258            (Some(false), Some(value)) => Self::cast_scalar_to_sum_type(value)
259                .map(Precision::Inexact)
260                .unwrap_or(Precision::Absent),
261            (_, _) => Precision::Absent,
262        }
263    }
264
265    /// SUM-style addition with integer widening to match SQL `SUM` return
266    /// types for smaller integral inputs.
267    pub fn add_for_sum(&self, other: &Precision<ScalarValue>) -> Precision<ScalarValue> {
268        let mut lhs = self.cast_to_sum_type();
269        let rhs = other.cast_to_sum_type();
270        precision_add(&mut lhs, &rhs);
271        lhs
272    }
273
274    /// Calculates the difference of two (possibly inexact) [`ScalarValue`] values,
275    /// conservatively propagating exactness information. If one of the input
276    /// values is [`Precision::Absent`], the result is `Absent` too.
277    pub fn sub(&self, other: &Precision<ScalarValue>) -> Precision<ScalarValue> {
278        match (self, other) {
279            (Precision::Exact(a), Precision::Exact(b)) => {
280                a.sub(b).map(Precision::Exact).unwrap_or(Precision::Absent)
281            }
282            (Precision::Inexact(a), Precision::Exact(b))
283            | (Precision::Exact(a), Precision::Inexact(b))
284            | (Precision::Inexact(a), Precision::Inexact(b)) => a
285                .sub(b)
286                .map(Precision::Inexact)
287                .unwrap_or(Precision::Absent),
288            (_, _) => Precision::Absent,
289        }
290    }
291
292    /// Calculates the multiplication of two (possibly inexact) [`ScalarValue`] values,
293    /// conservatively propagating exactness information. If one of the input
294    /// values is [`Precision::Absent`], the result is `Absent` too.
295    pub fn multiply(&self, other: &Precision<ScalarValue>) -> Precision<ScalarValue> {
296        match (self, other) {
297            (Precision::Exact(a), Precision::Exact(b)) => a
298                .mul_checked(b)
299                .map(Precision::Exact)
300                .unwrap_or(Precision::Absent),
301            (Precision::Inexact(a), Precision::Exact(b))
302            | (Precision::Exact(a), Precision::Inexact(b))
303            | (Precision::Inexact(a), Precision::Inexact(b)) => a
304                .mul_checked(b)
305                .map(Precision::Inexact)
306                .unwrap_or(Precision::Absent),
307            (_, _) => Precision::Absent,
308        }
309    }
310
311    /// Casts the value to the given data type, propagating exactness information.
312    pub fn cast_to(&self, data_type: &DataType) -> Result<Precision<ScalarValue>> {
313        match self {
314            Precision::Exact(value) => value.cast_to(data_type).map(Precision::Exact),
315            Precision::Inexact(value) => value.cast_to(data_type).map(Precision::Inexact),
316            Precision::Absent => Ok(Precision::Absent),
317        }
318    }
319}
320
321impl<T: Debug + Clone + PartialEq + Eq + PartialOrd> Debug for Precision<T> {
322    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
323        match self {
324            Precision::Exact(inner) => write!(f, "Exact({inner:?})"),
325            Precision::Inexact(inner) => write!(f, "Inexact({inner:?})"),
326            Precision::Absent => write!(f, "Absent"),
327        }
328    }
329}
330
331impl<T: Debug + Clone + PartialEq + Eq + PartialOrd> Display for Precision<T> {
332    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
333        match self {
334            Precision::Exact(inner) => write!(f, "Exact({inner:?})"),
335            Precision::Inexact(inner) => write!(f, "Inexact({inner:?})"),
336            Precision::Absent => write!(f, "Absent"),
337        }
338    }
339}
340
341impl From<Precision<usize>> for Precision<ScalarValue> {
342    fn from(value: Precision<usize>) -> Self {
343        match value {
344            Precision::Exact(v) => Precision::Exact(ScalarValue::UInt64(Some(v as u64))),
345            Precision::Inexact(v) => {
346                Precision::Inexact(ScalarValue::UInt64(Some(v as u64)))
347            }
348            Precision::Absent => Precision::Absent,
349        }
350    }
351}
352
/// Statistics for a relation.
///
/// Fields are optional and can be inexact because the sources
/// sometimes provide approximate estimates for performance reasons
/// and the output of transformations is not always predictable.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Statistics {
    /// The number of rows estimated to be scanned.
    pub num_rows: Precision<usize>,
    /// The total bytes of the output data.
    ///
    /// Note that this is not the same as the total bytes that may be scanned,
    /// processed, etc.
    /// E.g. we may read 1GB of data from a Parquet file but the Arrow data
    /// the node produces may be 2GB; it's this 2GB that is tracked here.
    pub total_byte_size: Precision<usize>,
    /// Statistics on a column level.
    ///
    /// It must contain a [`ColumnStatistics`] for each field in the schema of
    /// the table to which the [`Statistics`] refer.
    pub column_statistics: Vec<ColumnStatistics>,
}
374
/// Strategy for combining two distinct-value counts (NDVs) when their
/// overlap cannot be estimated from column bounds.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum NdvFallback {
    /// Keep the larger of the two input NDVs. This is the conservative
    /// default for related fragments such as files from the same table.
    #[default]
    Max,
    /// Add the input NDVs together. This is a conservative upper bound for
    /// independent inputs such as `UNION ALL`.
    Sum,
}

impl NdvFallback {
    /// Combines two NDVs according to this strategy. The `Sum` strategy
    /// saturates at `usize::MAX` instead of overflowing.
    fn merge(self, left: usize, right: usize) -> usize {
        match self {
            Self::Sum => left.saturating_add(right),
            Self::Max => left.max(right),
        }
    }
}
395
396impl Default for Statistics {
397    /// Returns a new [`Statistics`] instance with all fields set to unknown
398    /// and no columns.
399    fn default() -> Self {
400        Self {
401            num_rows: Precision::Absent,
402            total_byte_size: Precision::Absent,
403            column_statistics: vec![],
404        }
405    }
406}
407
408impl Statistics {
409    /// Returns a [`Statistics`] instance for the given schema by assigning
410    /// unknown statistics to each column in the schema.
411    pub fn new_unknown(schema: &Schema) -> Self {
412        Self {
413            num_rows: Precision::Absent,
414            total_byte_size: Precision::Absent,
415            column_statistics: Statistics::unknown_column(schema),
416        }
417    }
418
419    /// Calculates `total_byte_size` based on the schema and `num_rows`.
420    /// If any of the columns has non-primitive width, `total_byte_size` is set to inexact.
421    pub fn calculate_total_byte_size(&mut self, schema: &Schema) {
422        let mut row_size = Some(0);
423        for field in schema.fields() {
424            match field.data_type().primitive_width() {
425                Some(width) => {
426                    row_size = row_size.map(|s| s + width);
427                }
428                None => {
429                    row_size = None;
430                    break;
431                }
432            }
433        }
434        match row_size {
435            None => {
436                self.total_byte_size = self.total_byte_size.to_inexact();
437            }
438            Some(size) => {
439                self.total_byte_size = self.num_rows.multiply(&Precision::Exact(size));
440            }
441        }
442    }
443
444    /// Returns an unbounded `ColumnStatistics` for each field in the schema.
445    pub fn unknown_column(schema: &Schema) -> Vec<ColumnStatistics> {
446        schema
447            .fields()
448            .iter()
449            .map(|_| ColumnStatistics::new_unknown())
450            .collect()
451    }
452
453    /// Set the number of rows
454    pub fn with_num_rows(mut self, num_rows: Precision<usize>) -> Self {
455        self.num_rows = num_rows;
456        self
457    }
458
459    /// Set the total size, in bytes
460    pub fn with_total_byte_size(mut self, total_byte_size: Precision<usize>) -> Self {
461        self.total_byte_size = total_byte_size;
462        self
463    }
464
465    /// Add a column to the column statistics
466    pub fn add_column_statistics(mut self, column_stats: ColumnStatistics) -> Self {
467        self.column_statistics.push(column_stats);
468        self
469    }
470
471    /// If the exactness of a [`Statistics`] instance is lost, this function relaxes
472    /// the exactness of all information by converting them [`Precision::Inexact`].
473    pub fn to_inexact(mut self) -> Self {
474        self.num_rows = self.num_rows.to_inexact();
475        self.total_byte_size = self.total_byte_size.to_inexact();
476        self.column_statistics = self
477            .column_statistics
478            .into_iter()
479            .map(|s| s.to_inexact())
480            .collect();
481        self
482    }
483
484    /// Project the statistics to the given column indices.
485    ///
486    /// For example, if we had statistics for columns `{"a", "b", "c"}`,
487    /// projecting to `vec![2, 1]` would return statistics for columns `{"c",
488    /// "b"}`.
489    pub fn project(self, projection: Option<&impl AsRef<[usize]>>) -> Self {
490        let projection = projection.map(AsRef::as_ref);
491        self.project_impl(projection)
492    }
493
494    fn project_impl(mut self, projection: Option<&[usize]>) -> Self {
495        let Some(projection) = projection.map(AsRef::as_ref) else {
496            return self;
497        };
498
499        #[expect(clippy::large_enum_variant)]
500        enum Slot {
501            /// The column is taken and put into the specified statistics location
502            Taken(usize),
503            /// The original columns is present
504            Present(ColumnStatistics),
505        }
506
507        // Convert to Vec<Slot> so we can avoid copying the statistics
508        let mut columns: Vec<_> = std::mem::take(&mut self.column_statistics)
509            .into_iter()
510            .map(Slot::Present)
511            .collect();
512
513        for idx in projection.iter() {
514            let next_idx = self.column_statistics.len();
515            let slot = std::mem::replace(
516                columns.get_mut(*idx).expect("projection out of bounds"),
517                Slot::Taken(next_idx),
518            );
519            match slot {
520                // The column was there, so just move it
521                Slot::Present(col) => self.column_statistics.push(col),
522                // The column was taken, so copy from the previous location
523                Slot::Taken(prev_idx) => self
524                    .column_statistics
525                    .push(self.column_statistics[prev_idx].clone()),
526            }
527        }
528
529        self
530    }
531
532    /// Calculates the statistics after applying `fetch` and `skip` operations.
533    ///
534    /// Here, `self` denotes per-partition statistics. Use the `n_partitions`
535    /// parameter to compute global statistics in a multi-partition setting.
536    pub fn with_fetch(
537        mut self,
538        fetch: Option<usize>,
539        skip: usize,
540        n_partitions: usize,
541    ) -> Result<Self> {
542        let fetch_val = fetch.unwrap_or(usize::MAX);
543
544        // Get the ratio of rows after / rows before on a per-partition basis
545        let num_rows_before = self.num_rows;
546
547        self.num_rows = match self {
548            Statistics {
549                num_rows: Precision::Exact(nr),
550                ..
551            }
552            | Statistics {
553                num_rows: Precision::Inexact(nr),
554                ..
555            } => {
556                // Here, the inexact case gives us an estimate of the number of rows.
557                if nr <= skip {
558                    // All input data will be skipped. Preserve the exactness of
559                    // the input estimate: if the input was inexact, the
560                    // resulting zero is also inexact.
561                    check_num_rows(Some(0), self.num_rows.is_exact().unwrap())
562                } else if nr <= fetch_val && skip == 0 {
563                    // If the input does not reach the `fetch` globally, and `skip`
564                    // is zero (meaning the input and output are identical), return
565                    // input stats as is.
566                    // TODO: Can input stats still be used, but adjusted, when `skip`
567                    //       is non-zero?
568                    return Ok(self);
569                } else if nr - skip <= fetch_val {
570                    // After `skip` input rows are skipped, the remaining rows are
571                    // less than or equal to the `fetch` values, so `num_rows` must
572                    // equal the remaining rows.
573                    check_num_rows(
574                        (nr - skip).checked_mul(n_partitions),
575                        // We know that we have an estimate for the number of rows:
576                        self.num_rows.is_exact().unwrap(),
577                    )
578                } else {
579                    // At this point we know that we were given a `fetch` value
580                    // as the `None` case would go into the branch above. Since
581                    // the input has more rows than `fetch + skip`, the number
582                    // of rows will be the `fetch`, other statistics will have to be downgraded to inexact.
583                    check_num_rows(
584                        fetch_val.checked_mul(n_partitions),
585                        // We know that we have an estimate for the number of rows:
586                        self.num_rows.is_exact().unwrap(),
587                    )
588                }
589            }
590            Statistics {
591                num_rows: Precision::Absent,
592                ..
593            } => check_num_rows(fetch.and_then(|v| v.checked_mul(n_partitions)), false),
594        };
595        let ratio: f64 = match (num_rows_before, self.num_rows) {
596            (
597                Precision::Exact(nr_before) | Precision::Inexact(nr_before),
598                Precision::Exact(nr_after) | Precision::Inexact(nr_after),
599            ) => {
600                if nr_before == 0 {
601                    0.0
602                } else {
603                    nr_after as f64 / nr_before as f64
604                }
605            }
606            _ => 0.0,
607        };
608        self.column_statistics = self
609            .column_statistics
610            .into_iter()
611            .map(|cs| {
612                let mut cs = cs.to_inexact();
613                // Scale byte_size by the row ratio
614                cs.byte_size = match cs.byte_size {
615                    Precision::Exact(n) | Precision::Inexact(n) => {
616                        Precision::Inexact((n as f64 * ratio) as usize)
617                    }
618                    Precision::Absent => Precision::Absent,
619                };
620                // NDV can never exceed the number of rows
621                if let Some(&rows) = self.num_rows.get_value() {
622                    cs.distinct_count = cs.distinct_count.min(&Precision::Inexact(rows));
623                }
624                cs
625            })
626            .collect();
627
628        // Compute total_byte_size as sum of column byte_size values if all are present,
629        // otherwise fall back to scaling the original total_byte_size
630        let sum_scan_bytes: Option<usize> = self
631            .column_statistics
632            .iter()
633            .map(|cs| cs.byte_size.get_value().copied())
634            .try_fold(0usize, |acc, val| val.map(|v| acc + v));
635
636        self.total_byte_size = match sum_scan_bytes {
637            Some(sum) => Precision::Inexact(sum),
638            None => {
639                // Fall back to scaling original total_byte_size if not all columns have byte_size
640                match &self.total_byte_size {
641                    Precision::Exact(n) | Precision::Inexact(n) => {
642                        Precision::Inexact((*n as f64 * ratio) as usize)
643                    }
644                    Precision::Absent => Precision::Absent,
645                }
646            }
647        };
648        Ok(self)
649    }
650
651    /// Summarize zero or more statistics into a single `Statistics` instance.
652    ///
653    /// The method assumes that all statistics are for the same schema.
654    /// If not, maybe you can call `SchemaMapper::map_column_statistics` to make them consistent.
655    ///
656    /// This method uses [`NdvFallback::Max`] when `distinct_count` overlap
657    /// can not be estimated from column bounds.
658    ///
659    /// Returns an error if the statistics do not match the specified schemas.
660    ///
661    /// # Example
662    /// ```
663    /// # use datafusion_common::{ColumnStatistics, ScalarValue, Statistics};
664    /// # use arrow::datatypes::{Field, Schema, DataType};
665    /// # use datafusion_common::stats::Precision;
666    /// let stats1 = Statistics::default()
667    ///     .with_num_rows(Precision::Exact(10))
668    ///     .add_column_statistics(
669    ///         ColumnStatistics::new_unknown()
670    ///             .with_min_value(Precision::Exact(ScalarValue::from(1)))
671    ///             .with_max_value(Precision::Exact(ScalarValue::from(100)))
672    ///             .with_sum_value(Precision::Exact(ScalarValue::from(500))),
673    ///     );
674    ///
675    /// let stats2 = Statistics::default()
676    ///     .with_num_rows(Precision::Exact(20))
677    ///     .add_column_statistics(
678    ///         ColumnStatistics::new_unknown()
679    ///             .with_min_value(Precision::Exact(ScalarValue::from(5)))
680    ///             .with_max_value(Precision::Exact(ScalarValue::from(200)))
681    ///             .with_sum_value(Precision::Exact(ScalarValue::from(1000))),
682    ///     );
683    ///
684    /// let schema = Schema::new(vec![Field::new("a", DataType::Int32, true)]);
685    /// let merged = Statistics::try_merge_iter(
686    ///     &[stats1, stats2],
687    ///     &schema,
688    /// ).unwrap();
689    ///
690    /// assert_eq!(merged.num_rows, Precision::Exact(30));
691    /// assert_eq!(merged.column_statistics[0].min_value,
692    ///     Precision::Exact(ScalarValue::from(1)));
693    /// assert_eq!(merged.column_statistics[0].max_value,
694    ///     Precision::Exact(ScalarValue::from(200)));
695    /// assert_eq!(merged.column_statistics[0].sum_value,
696    ///     Precision::Exact(ScalarValue::Int64(Some(1500))));
697    /// ```
698    pub fn try_merge_iter<'a, I>(items: I, schema: &Schema) -> Result<Statistics>
699    where
700        I: IntoIterator<Item = &'a Statistics>,
701    {
702        Self::try_merge_iter_with_ndv_fallback(items, schema, NdvFallback::Max)
703    }
704
705    /// Same as [`Statistics::try_merge_iter`], but lets callers choose the
706    /// fallback used when `distinct_count` overlap can not be estimated.
707    pub fn try_merge_iter_with_ndv_fallback<'a, I>(
708        items: I,
709        schema: &Schema,
710        ndv_fallback: NdvFallback,
711    ) -> Result<Statistics>
712    where
713        I: IntoIterator<Item = &'a Statistics>,
714    {
715        let mut items = items.into_iter();
716        let Some(first) = items.next() else {
717            return Ok(Statistics::new_unknown(schema));
718        };
719        let Some(second) = items.next() else {
720            return Ok(first.clone());
721        };
722
723        let num_cols = first.column_statistics.len();
724        let mut num_rows = first.num_rows;
725        let mut total_byte_size = first.total_byte_size;
726        let mut column_statistics = first.column_statistics.clone();
727        for col_stats in &mut column_statistics {
728            cast_sum_value_to_sum_type_in_place(&mut col_stats.sum_value);
729        }
730
731        // Merge the remaining items in a single pass.
732        for (i, stat) in std::iter::once(second).chain(items).enumerate() {
733            if stat.column_statistics.len() != num_cols {
734                return _plan_err!(
735                    "Cannot merge statistics with different number of columns: {} vs {} (item {})",
736                    num_cols,
737                    stat.column_statistics.len(),
738                    i + 1
739                );
740            }
741            num_rows = num_rows.add(&stat.num_rows);
742            total_byte_size = total_byte_size.add(&stat.total_byte_size);
743
744            // Uses precision_add for sum (reuses the lhs accumulator for
745            // direct numeric addition), while preserving the NDV update
746            // ordering required by estimate_ndv_with_overlap.
747            for (col_stats, item_cs) in
748                column_statistics.iter_mut().zip(&stat.column_statistics)
749            {
750                col_stats.null_count = col_stats.null_count.add(&item_cs.null_count);
751
752                // NDV must be computed before min/max update (needs pre-merge ranges)
753                col_stats.distinct_count = match (
754                    col_stats.distinct_count.get_value(),
755                    item_cs.distinct_count.get_value(),
756                ) {
757                    (Some(&l), Some(&r)) => Precision::Inexact(
758                        estimate_ndv_with_overlap(col_stats, item_cs, l, r)
759                            .unwrap_or_else(|| ndv_fallback.merge(l, r)),
760                    ),
761                    _ => Precision::Absent,
762                };
763                precision_min(&mut col_stats.min_value, &item_cs.min_value);
764                precision_max(&mut col_stats.max_value, &item_cs.max_value);
765                precision_add_for_sum_in_place(
766                    &mut col_stats.sum_value,
767                    &item_cs.sum_value,
768                );
769                col_stats.byte_size = col_stats.byte_size.add(&item_cs.byte_size);
770            }
771        }
772
773        Ok(Statistics {
774            num_rows,
775            total_byte_size,
776            column_statistics,
777        })
778    }
779}
780
781/// Estimates the combined number of distinct values (NDV) when merging two
782/// column statistics, using range overlap to avoid double-counting shared values.
783///
784/// Assumes values are distributed uniformly within each input's
785/// `[min, max]` range (the standard assumption when only summary
786/// statistics are available). Under uniformity the fraction of an input's
787/// distinct values that land in a sub-range equals the fraction of
788/// the range that sub-range covers.
789///
790/// The combined value space is split into three disjoint regions:
791///
792/// ```text
793///   |-- only A --|-- overlap --|-- only B --|
794/// ```
795///
796/// * **Only in A/B** - values outside the other input's range
797///   contribute `(1 - overlap_a) * NDV_a` and `(1 - overlap_b) * NDV_b`.
798/// * **Overlap** - both inputs may produce values here. We take
799///   `max(overlap_a * NDV_a, overlap_b * NDV_b)` rather than the
800///   sum because values in the same sub-range are likely shared
801///   (the smaller set is assumed to be a subset of the larger).
802///
803/// The formula ranges between `[max(NDV_a, NDV_b), NDV_a + NDV_b]`,
804/// from full overlap to no overlap.
805///
806/// ```text
807/// NDV = max(overlap_a * NDV_a, overlap_b * NDV_b)   [intersection]
808///     + (1 - overlap_a) * NDV_a                      [only in A]
809///     + (1 - overlap_b) * NDV_b                      [only in B]
810/// ```
811///
812/// Returns `None` when min/max are absent or distance is unsupported
813/// (e.g. strings), in which case the caller should fall back to a simpler
814/// estimate.
815pub fn estimate_ndv_with_overlap(
816    left: &ColumnStatistics,
817    right: &ColumnStatistics,
818    ndv_left: usize,
819    ndv_right: usize,
820) -> Option<usize> {
821    let left_min = left.min_value.get_value()?;
822    let left_max = left.max_value.get_value()?;
823    let right_min = right.min_value.get_value()?;
824    let right_max = right.max_value.get_value()?;
825
826    let range_left = left_max.distance(left_min)?;
827    let range_right = right_max.distance(right_min)?;
828
829    // Constant columns (range == 0) can't use the proportional overlap
830    // formula below, so check interval overlap directly instead.
831    if range_left == 0 || range_right == 0 {
832        let overlaps = left_min <= right_max && right_min <= left_max;
833        return Some(if overlaps {
834            usize::max(ndv_left, ndv_right)
835        } else {
836            ndv_left + ndv_right
837        });
838    }
839
840    let overlap_min = if left_min >= right_min {
841        left_min
842    } else {
843        right_min
844    };
845    let overlap_max = if left_max <= right_max {
846        left_max
847    } else {
848        right_max
849    };
850
851    // Disjoint ranges: no overlap, NDVs are additive
852    if overlap_min > overlap_max {
853        return Some(ndv_left + ndv_right);
854    }
855
856    let overlap_range = overlap_max.distance(overlap_min)? as f64;
857
858    let overlap_left = overlap_range / range_left as f64;
859    let overlap_right = overlap_range / range_right as f64;
860
861    let intersection = f64::max(
862        overlap_left * ndv_left as f64,
863        overlap_right * ndv_right as f64,
864    );
865    let only_left = (1.0 - overlap_left) * ndv_left as f64;
866    let only_right = (1.0 - overlap_right) * ndv_right as f64;
867
868    Some((intersection + only_left + only_right).round() as usize)
869}
870
871/// Returns the minimum precision while not allocating a new value,
872/// mirrors the semantics of `PartialOrd`.
873#[inline]
874fn precision_min<T>(lhs: &mut Precision<T>, rhs: &Precision<T>)
875where
876    T: Debug + Clone + PartialEq + Eq + PartialOrd,
877{
878    *lhs = match (std::mem::take(lhs), rhs) {
879        (Precision::Exact(left), Precision::Exact(right)) => {
880            if left <= *right {
881                Precision::Exact(left)
882            } else {
883                Precision::Exact(right.clone())
884            }
885        }
886        (Precision::Exact(left), Precision::Inexact(right))
887        | (Precision::Inexact(left), Precision::Exact(right))
888        | (Precision::Inexact(left), Precision::Inexact(right)) => {
889            if left <= *right {
890                Precision::Inexact(left)
891            } else {
892                Precision::Inexact(right.clone())
893            }
894        }
895        (_, _) => Precision::Absent,
896    };
897}
898
899/// Returns the maximum precision while not allocating a new value,
900/// mirrors the semantics of `PartialOrd`.
901#[inline]
902fn precision_max<T>(lhs: &mut Precision<T>, rhs: &Precision<T>)
903where
904    T: Debug + Clone + PartialEq + Eq + PartialOrd,
905{
906    *lhs = match (std::mem::take(lhs), rhs) {
907        (Precision::Exact(left), Precision::Exact(right)) => {
908            if left >= *right {
909                Precision::Exact(left)
910            } else {
911                Precision::Exact(right.clone())
912            }
913        }
914        (Precision::Exact(left), Precision::Inexact(right))
915        | (Precision::Inexact(left), Precision::Exact(right))
916        | (Precision::Inexact(left), Precision::Inexact(right)) => {
917            if left >= *right {
918                Precision::Inexact(left)
919            } else {
920                Precision::Inexact(right.clone())
921            }
922        }
923        (_, _) => Precision::Absent,
924    };
925}
926
927#[inline]
928fn cast_sum_value_to_sum_type_in_place(value: &mut Precision<ScalarValue>) {
929    let (is_exact, inner) = match std::mem::take(value) {
930        Precision::Exact(v) => (true, v),
931        Precision::Inexact(v) => (false, v),
932        Precision::Absent => return,
933    };
934    let source_type = inner.data_type();
935    let target_type = Precision::<ScalarValue>::sum_data_type(&source_type);
936
937    let wrap_precision_fn: fn(ScalarValue) -> Precision<ScalarValue> = if is_exact {
938        Precision::Exact
939    } else {
940        Precision::Inexact
941    };
942
943    *value = if source_type == target_type {
944        wrap_precision_fn(inner)
945    } else {
946        inner
947            .cast_to(&target_type)
948            .map(wrap_precision_fn)
949            .unwrap_or(Precision::Absent)
950    };
951}
952
953#[inline]
954fn precision_add_for_sum_in_place(
955    lhs: &mut Precision<ScalarValue>,
956    rhs: &Precision<ScalarValue>,
957) {
958    let (value, wrap_fn): (&ScalarValue, fn(ScalarValue) -> Precision<ScalarValue>) =
959        match rhs {
960            Precision::Exact(v) => (v, Precision::Exact),
961            Precision::Inexact(v) => (v, Precision::Inexact),
962            Precision::Absent => {
963                *lhs = Precision::Absent;
964                return;
965            }
966        };
967    let source_type = value.data_type();
968    let target_type = Precision::<ScalarValue>::sum_data_type(&source_type);
969    if source_type == target_type {
970        precision_add(lhs, rhs);
971    } else {
972        let rhs = value
973            .cast_to(&target_type)
974            .map(wrap_fn)
975            .unwrap_or(Precision::Absent);
976        precision_add(lhs, &rhs);
977    }
978}
979
980/// Creates an estimate of the number of rows in the output using the given
981/// optional value and exactness flag.
982fn check_num_rows(value: Option<usize>, is_exact: bool) -> Precision<usize> {
983    if let Some(value) = value {
984        if is_exact {
985            Precision::Exact(value)
986        } else {
987            // If the input stats are inexact, so are the output stats.
988            Precision::Inexact(value)
989        }
990    } else {
991        // If the estimate is not available (e.g. due to an overflow), we can
992        // not produce a reliable estimate.
993        Precision::Absent
994    }
995}
996
997impl Display for Statistics {
998    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
999        // string of column statistics
1000        let column_stats = self
1001            .column_statistics
1002            .iter()
1003            .enumerate()
1004            .map(|(i, cs)| {
1005                let s = format!("(Col[{i}]:");
1006                let s = if cs.min_value != Precision::Absent {
1007                    format!("{} Min={}", s, cs.min_value)
1008                } else {
1009                    s
1010                };
1011                let s = if cs.max_value != Precision::Absent {
1012                    format!("{} Max={}", s, cs.max_value)
1013                } else {
1014                    s
1015                };
1016                let s = if cs.sum_value != Precision::Absent {
1017                    format!("{} Sum={}", s, cs.sum_value)
1018                } else {
1019                    s
1020                };
1021                let s = if cs.null_count != Precision::Absent {
1022                    format!("{} Null={}", s, cs.null_count)
1023                } else {
1024                    s
1025                };
1026                let s = if cs.distinct_count != Precision::Absent {
1027                    format!("{} Distinct={}", s, cs.distinct_count)
1028                } else {
1029                    s
1030                };
1031                let s = if cs.byte_size != Precision::Absent {
1032                    format!("{} ScanBytes={}", s, cs.byte_size)
1033                } else {
1034                    s
1035                };
1036
1037                s + ")"
1038            })
1039            .collect::<Vec<_>>()
1040            .join(",");
1041
1042        write!(
1043            f,
1044            "Rows={}, Bytes={}, [{}]",
1045            self.num_rows, self.total_byte_size, column_stats
1046        )?;
1047
1048        Ok(())
1049    }
1050}
1051
1052/// Statistics for a column within a relation
1053#[derive(Clone, Debug, PartialEq, Eq, Default)]
1054pub struct ColumnStatistics {
1055    /// Number of null values on column
1056    pub null_count: Precision<usize>,
1057    /// Maximum value of column
1058    pub max_value: Precision<ScalarValue>,
1059    /// Minimum value of column
1060    pub min_value: Precision<ScalarValue>,
1061    /// Sum value of a column.
1062    ///
1063    /// For integral columns, values should be kept in SUM-compatible widened
1064    /// types (`Int8/Int16/Int32 -> Int64`, `UInt8/UInt16/UInt32 -> UInt64`) to
1065    /// reduce overflow risk during statistics propagation.
1066    ///
1067    /// Callers should prefer [`ColumnStatistics::with_sum_value`] for setting
1068    /// this field and [`Precision<ScalarValue>::add_for_sum`] /
1069    /// [`Precision<ScalarValue>::cast_to_sum_type`] for sum arithmetic.
1070    pub sum_value: Precision<ScalarValue>,
1071    /// Number of distinct values
1072    pub distinct_count: Precision<usize>,
1073    /// Estimated size of this column's data in bytes for the output.
1074    ///
1075    /// Note that this is not the same as the total bytes that may be scanned,
1076    /// processed, etc.
1077    ///
1078    /// E.g. we may read 1GB of data from a Parquet file but the Arrow data
1079    /// the node produces may be 2GB; it's this 2GB that is tracked here.
1080    ///
1081    /// Currently this is accurately calculated for primitive types only.
1082    /// For complex types (like Utf8, List, Struct, etc), this value may be
1083    /// absent or inexact (e.g. estimated from the size of the data in the source Parquet files).
1084    ///
1085    /// This value is automatically scaled when operations like limits or
1086    /// filters reduce the number of rows (see [`Statistics::with_fetch`]).
1087    pub byte_size: Precision<usize>,
1088}
1089
1090impl ColumnStatistics {
1091    /// Column contains a single non null value (e.g constant).
1092    pub fn is_singleton(&self) -> bool {
1093        match (&self.min_value, &self.max_value) {
1094            // Min and max values are the same and not infinity.
1095            (Precision::Exact(min), Precision::Exact(max)) => {
1096                !min.is_null() && !max.is_null() && (min == max)
1097            }
1098            (_, _) => false,
1099        }
1100    }
1101
1102    /// Returns a [`ColumnStatistics`] instance having all [`Precision::Absent`] parameters.
1103    pub fn new_unknown() -> Self {
1104        Self {
1105            null_count: Precision::Absent,
1106            max_value: Precision::Absent,
1107            min_value: Precision::Absent,
1108            sum_value: Precision::Absent,
1109            distinct_count: Precision::Absent,
1110            byte_size: Precision::Absent,
1111        }
1112    }
1113
1114    /// Set the null count
1115    pub fn with_null_count(mut self, null_count: Precision<usize>) -> Self {
1116        self.null_count = null_count;
1117        self
1118    }
1119
1120    /// Set the max value
1121    pub fn with_max_value(mut self, max_value: Precision<ScalarValue>) -> Self {
1122        self.max_value = max_value;
1123        self
1124    }
1125
1126    /// Set the min value
1127    pub fn with_min_value(mut self, min_value: Precision<ScalarValue>) -> Self {
1128        self.min_value = min_value;
1129        self
1130    }
1131
1132    /// Set the sum value
1133    pub fn with_sum_value(mut self, sum_value: Precision<ScalarValue>) -> Self {
1134        self.sum_value = match sum_value {
1135            Precision::Exact(value) => {
1136                Precision::<ScalarValue>::cast_scalar_to_sum_type(&value)
1137                    .map(Precision::Exact)
1138                    .unwrap_or(Precision::Absent)
1139            }
1140            Precision::Inexact(value) => {
1141                Precision::<ScalarValue>::cast_scalar_to_sum_type(&value)
1142                    .map(Precision::Inexact)
1143                    .unwrap_or(Precision::Absent)
1144            }
1145            Precision::Absent => Precision::Absent,
1146        };
1147        self
1148    }
1149
1150    /// Set the distinct count
1151    pub fn with_distinct_count(mut self, distinct_count: Precision<usize>) -> Self {
1152        self.distinct_count = distinct_count;
1153        self
1154    }
1155
1156    /// Set the scan byte size
1157    /// This should initially be set to the total size of the column.
1158    pub fn with_byte_size(mut self, byte_size: Precision<usize>) -> Self {
1159        self.byte_size = byte_size;
1160        self
1161    }
1162
1163    /// If the exactness of a [`ColumnStatistics`] instance is lost, this
1164    /// function relaxes the exactness of all information by converting them
1165    /// [`Precision::Inexact`].
1166    pub fn to_inexact(mut self) -> Self {
1167        self.null_count = self.null_count.to_inexact();
1168        self.max_value = self.max_value.to_inexact();
1169        self.min_value = self.min_value.to_inexact();
1170        self.sum_value = self.sum_value.to_inexact();
1171        self.distinct_count = self.distinct_count.to_inexact();
1172        self.byte_size = self.byte_size.to_inexact();
1173        self
1174    }
1175}
1176
1177#[cfg(test)]
1178mod tests {
1179    use super::*;
1180    use crate::assert_contains;
1181    use arrow::datatypes::Field;
1182    use std::sync::Arc;
1183
1184    #[test]
1185    fn test_get_value() {
1186        let exact_precision = Precision::Exact(42);
1187        let inexact_precision = Precision::Inexact(23);
1188        let absent_precision = Precision::<i32>::Absent;
1189
1190        assert_eq!(*exact_precision.get_value().unwrap(), 42);
1191        assert_eq!(*inexact_precision.get_value().unwrap(), 23);
1192        assert_eq!(absent_precision.get_value(), None);
1193    }
1194
1195    #[test]
1196    fn test_map() {
1197        let exact_precision = Precision::Exact(42);
1198        let inexact_precision = Precision::Inexact(23);
1199        let absent_precision = Precision::Absent;
1200
1201        let squared = |x| x * x;
1202
1203        assert_eq!(exact_precision.map(squared), Precision::Exact(1764));
1204        assert_eq!(inexact_precision.map(squared), Precision::Inexact(529));
1205        assert_eq!(absent_precision.map(squared), Precision::Absent);
1206    }
1207
1208    #[test]
1209    fn test_is_exact() {
1210        let exact_precision = Precision::Exact(42);
1211        let inexact_precision = Precision::Inexact(23);
1212        let absent_precision = Precision::<i32>::Absent;
1213
1214        assert_eq!(exact_precision.is_exact(), Some(true));
1215        assert_eq!(inexact_precision.is_exact(), Some(false));
1216        assert_eq!(absent_precision.is_exact(), None);
1217    }
1218
1219    #[test]
1220    fn test_max() {
1221        let precision1 = Precision::Exact(42);
1222        let precision2 = Precision::Inexact(23);
1223        let precision3 = Precision::Exact(30);
1224        let absent_precision = Precision::Absent;
1225
1226        assert_eq!(precision1.max(&precision2), Precision::Inexact(42));
1227        assert_eq!(precision1.max(&precision3), Precision::Exact(42));
1228        assert_eq!(precision2.max(&precision3), Precision::Inexact(30));
1229        assert_eq!(precision1.max(&absent_precision), Precision::Absent);
1230    }
1231
1232    #[test]
1233    fn test_min() {
1234        let precision1 = Precision::Exact(42);
1235        let precision2 = Precision::Inexact(23);
1236        let precision3 = Precision::Exact(30);
1237        let absent_precision = Precision::Absent;
1238
1239        assert_eq!(precision1.min(&precision2), Precision::Inexact(23));
1240        assert_eq!(precision1.min(&precision3), Precision::Exact(30));
1241        assert_eq!(precision2.min(&precision3), Precision::Inexact(23));
1242        assert_eq!(precision1.min(&absent_precision), Precision::Absent);
1243    }
1244
1245    #[test]
1246    fn test_to_inexact() {
1247        let exact_precision = Precision::Exact(42);
1248        let inexact_precision = Precision::Inexact(42);
1249        let absent_precision = Precision::<i32>::Absent;
1250
1251        assert_eq!(exact_precision.to_inexact(), inexact_precision);
1252        assert_eq!(inexact_precision.to_inexact(), inexact_precision);
1253        assert_eq!(absent_precision.to_inexact(), absent_precision);
1254    }
1255
1256    #[test]
1257    fn test_add() {
1258        let precision1 = Precision::Exact(42);
1259        let precision2 = Precision::Inexact(23);
1260        let precision3 = Precision::Exact(30);
1261        let absent_precision = Precision::Absent;
1262        let precision_max_exact = Precision::Exact(usize::MAX);
1263        let precision_max_inexact = Precision::Exact(usize::MAX);
1264
1265        assert_eq!(precision1.add(&precision2), Precision::Inexact(65));
1266        assert_eq!(precision1.add(&precision3), Precision::Exact(72));
1267        assert_eq!(precision2.add(&precision3), Precision::Inexact(53));
1268        assert_eq!(precision1.add(&absent_precision), Precision::Absent);
1269        assert_eq!(
1270            precision_max_exact.add(&precision1),
1271            Precision::Inexact(usize::MAX)
1272        );
1273        assert_eq!(
1274            precision_max_inexact.add(&precision1),
1275            Precision::Inexact(usize::MAX)
1276        );
1277    }
1278
1279    #[test]
1280    fn test_add_scalar() {
1281        let precision = Precision::Exact(ScalarValue::Int32(Some(42)));
1282
1283        assert_eq!(
1284            precision.add(&Precision::Exact(ScalarValue::Int32(Some(23)))),
1285            Precision::Exact(ScalarValue::Int32(Some(65))),
1286        );
1287        assert_eq!(
1288            precision.add(&Precision::Inexact(ScalarValue::Int32(Some(23)))),
1289            Precision::Inexact(ScalarValue::Int32(Some(65))),
1290        );
1291        assert_eq!(
1292            precision.add(&Precision::Exact(ScalarValue::Int32(None))),
1293            // As per behavior of ScalarValue::add
1294            Precision::Exact(ScalarValue::Int32(None)),
1295        );
1296        assert_eq!(precision.add(&Precision::Absent), Precision::Absent);
1297    }
1298
1299    #[test]
1300    fn test_add_for_sum_scalar_integer_widening() {
1301        let precision = Precision::Exact(ScalarValue::Int32(Some(42)));
1302
1303        assert_eq!(
1304            precision.add_for_sum(&Precision::Exact(ScalarValue::Int32(Some(23)))),
1305            Precision::Exact(ScalarValue::Int64(Some(65))),
1306        );
1307        assert_eq!(
1308            precision.add_for_sum(&Precision::Inexact(ScalarValue::Int32(Some(23)))),
1309            Precision::Inexact(ScalarValue::Int64(Some(65))),
1310        );
1311    }
1312
1313    #[test]
1314    fn test_add_for_sum_prevents_int32_overflow() {
1315        let lhs = Precision::Exact(ScalarValue::Int32(Some(i32::MAX)));
1316        let rhs = Precision::Exact(ScalarValue::Int32(Some(1)));
1317
1318        assert_eq!(
1319            lhs.add_for_sum(&rhs),
1320            Precision::Exact(ScalarValue::Int64(Some(i64::from(i32::MAX) + 1))),
1321        );
1322    }
1323
1324    #[test]
1325    fn test_add_for_sum_scalar_unsigned_integer_widening() {
1326        let precision = Precision::Exact(ScalarValue::UInt32(Some(42)));
1327
1328        assert_eq!(
1329            precision.add_for_sum(&Precision::Exact(ScalarValue::UInt32(Some(23)))),
1330            Precision::Exact(ScalarValue::UInt64(Some(65))),
1331        );
1332        assert_eq!(
1333            precision.add_for_sum(&Precision::Inexact(ScalarValue::UInt32(Some(23)))),
1334            Precision::Inexact(ScalarValue::UInt64(Some(65))),
1335        );
1336    }
1337
1338    #[test]
1339    fn test_sub() {
1340        let precision1 = Precision::Exact(42);
1341        let precision2 = Precision::Inexact(23);
1342        let precision3 = Precision::Exact(30);
1343        let absent_precision = Precision::Absent;
1344
1345        assert_eq!(precision1.sub(&precision2), Precision::Inexact(19));
1346        assert_eq!(precision1.sub(&precision3), Precision::Exact(12));
1347        assert_eq!(precision2.sub(&precision1), Precision::Inexact(0));
1348        assert_eq!(precision3.sub(&precision1), Precision::Inexact(0));
1349        assert_eq!(precision1.sub(&absent_precision), Precision::Absent);
1350    }
1351
1352    #[test]
1353    fn test_sub_scalar() {
1354        let precision = Precision::Exact(ScalarValue::Int32(Some(42)));
1355
1356        assert_eq!(
1357            precision.sub(&Precision::Exact(ScalarValue::Int32(Some(23)))),
1358            Precision::Exact(ScalarValue::Int32(Some(19))),
1359        );
1360        assert_eq!(
1361            precision.sub(&Precision::Inexact(ScalarValue::Int32(Some(23)))),
1362            Precision::Inexact(ScalarValue::Int32(Some(19))),
1363        );
1364        assert_eq!(
1365            precision.sub(&Precision::Exact(ScalarValue::Int32(None))),
1366            // As per behavior of ScalarValue::sub
1367            Precision::Exact(ScalarValue::Int32(None)),
1368        );
1369        assert_eq!(precision.sub(&Precision::Absent), Precision::Absent);
1370    }
1371
1372    #[test]
1373    fn test_multiply() {
1374        let precision1 = Precision::Exact(6);
1375        let precision2 = Precision::Inexact(3);
1376        let precision3 = Precision::Exact(5);
1377        let precision_max_exact = Precision::Exact(usize::MAX);
1378        let precision_max_inexact = Precision::Exact(usize::MAX);
1379        let absent_precision = Precision::Absent;
1380
1381        assert_eq!(precision1.multiply(&precision2), Precision::Inexact(18));
1382        assert_eq!(precision1.multiply(&precision3), Precision::Exact(30));
1383        assert_eq!(precision2.multiply(&precision3), Precision::Inexact(15));
1384        assert_eq!(precision1.multiply(&absent_precision), Precision::Absent);
1385        assert_eq!(
1386            precision_max_exact.multiply(&precision1),
1387            Precision::Inexact(usize::MAX)
1388        );
1389        assert_eq!(
1390            precision_max_inexact.multiply(&precision1),
1391            Precision::Inexact(usize::MAX)
1392        );
1393    }
1394
1395    #[test]
1396    fn test_multiply_scalar() {
1397        let precision = Precision::Exact(ScalarValue::Int32(Some(6)));
1398
1399        assert_eq!(
1400            precision.multiply(&Precision::Exact(ScalarValue::Int32(Some(5)))),
1401            Precision::Exact(ScalarValue::Int32(Some(30))),
1402        );
1403        assert_eq!(
1404            precision.multiply(&Precision::Inexact(ScalarValue::Int32(Some(5)))),
1405            Precision::Inexact(ScalarValue::Int32(Some(30))),
1406        );
1407        assert_eq!(
1408            precision.multiply(&Precision::Exact(ScalarValue::Int32(None))),
1409            // As per behavior of ScalarValue::mul_checked
1410            Precision::Exact(ScalarValue::Int32(None)),
1411        );
1412        assert_eq!(precision.multiply(&Precision::Absent), Precision::Absent);
1413    }
1414
1415    #[test]
1416    fn test_cast_to() {
1417        // Valid
1418        assert_eq!(
1419            Precision::Exact(ScalarValue::Int32(Some(42)))
1420                .cast_to(&DataType::Int64)
1421                .unwrap(),
1422            Precision::Exact(ScalarValue::Int64(Some(42))),
1423        );
1424        assert_eq!(
1425            Precision::Inexact(ScalarValue::Int32(Some(42)))
1426                .cast_to(&DataType::Int64)
1427                .unwrap(),
1428            Precision::Inexact(ScalarValue::Int64(Some(42))),
1429        );
1430        // Null
1431        assert_eq!(
1432            Precision::Exact(ScalarValue::Int32(None))
1433                .cast_to(&DataType::Int64)
1434                .unwrap(),
1435            Precision::Exact(ScalarValue::Int64(None)),
1436        );
1437        // Overflow returns error
1438        assert!(
1439            Precision::Exact(ScalarValue::Int32(Some(256)))
1440                .cast_to(&DataType::Int8)
1441                .is_err()
1442        );
1443    }
1444
1445    #[test]
1446    fn test_precision_cloning() {
1447        // Precision<usize> is copy
1448        let precision: Precision<usize> = Precision::Exact(42);
1449        let p2 = precision;
1450        assert_eq!(precision, p2);
1451
1452        // Precision<ScalarValue> is not copy (requires .clone())
1453        let precision: Precision<ScalarValue> =
1454            Precision::Exact(ScalarValue::Int64(Some(42)));
1455        let p2 = precision.clone();
1456        assert_eq!(precision, p2);
1457    }
1458
1459    #[test]
1460    fn test_project_none() {
1461        let projection: Option<Vec<usize>> = None;
1462        let stats = make_stats(vec![10, 20, 30]).project(projection.as_ref());
1463        assert_eq!(stats, make_stats(vec![10, 20, 30]));
1464    }
1465
1466    #[test]
1467    fn test_project_empty() {
1468        let projection = Some(vec![]);
1469        let stats = make_stats(vec![10, 20, 30]).project(projection.as_ref());
1470        assert_eq!(stats, make_stats(vec![]));
1471    }
1472
1473    #[test]
1474    fn test_project_swap() {
1475        let projection = Some(vec![2, 1]);
1476        let stats = make_stats(vec![10, 20, 30]).project(projection.as_ref());
1477        assert_eq!(stats, make_stats(vec![30, 20]));
1478    }
1479
1480    #[test]
1481    fn test_project_repeated() {
1482        let projection = Some(vec![1, 2, 1, 1, 0, 2]);
1483        let stats = make_stats(vec![10, 20, 30]).project(projection.as_ref());
1484        assert_eq!(stats, make_stats(vec![20, 30, 20, 20, 10, 30]));
1485    }
1486
1487    // Make a Statistics structure with the specified null counts for each column
1488    fn make_stats(counts: impl IntoIterator<Item = usize>) -> Statistics {
1489        Statistics {
1490            num_rows: Precision::Exact(42),
1491            total_byte_size: Precision::Exact(500),
1492            column_statistics: counts.into_iter().map(col_stats_i64).collect(),
1493        }
1494    }
1495
1496    fn col_stats_i64(null_count: usize) -> ColumnStatistics {
1497        ColumnStatistics {
1498            null_count: Precision::Exact(null_count),
1499            max_value: Precision::Exact(ScalarValue::Int64(Some(42))),
1500            min_value: Precision::Exact(ScalarValue::Int64(Some(64))),
1501            sum_value: Precision::Exact(ScalarValue::Int64(Some(4600))),
1502            distinct_count: Precision::Exact(100),
1503            byte_size: Precision::Exact(800),
1504        }
1505    }
1506
1507    fn make_single_i64_ndv_stats(
1508        distinct_count: Precision<usize>,
1509        min_value: Option<i64>,
1510        max_value: Option<i64>,
1511    ) -> Statistics {
1512        let to_precision = |value| Precision::Exact(ScalarValue::Int64(Some(value)));
1513
1514        Statistics::default()
1515            .with_num_rows(Precision::Exact(10))
1516            .add_column_statistics(
1517                ColumnStatistics::new_unknown()
1518                    .with_distinct_count(distinct_count)
1519                    .with_min_value(
1520                        min_value.map(to_precision).unwrap_or(Precision::Absent),
1521                    )
1522                    .with_max_value(
1523                        max_value.map(to_precision).unwrap_or(Precision::Absent),
1524                    ),
1525            )
1526    }
1527
1528    fn merge_single_i64_ndv_distinct_count(
1529        left: Statistics,
1530        right: Statistics,
1531        ndv_fallback: NdvFallback,
1532    ) -> Precision<usize> {
1533        let schema = Schema::new(vec![Field::new("a", DataType::Int64, true)]);
1534
1535        Statistics::try_merge_iter_with_ndv_fallback(
1536            [&left, &right],
1537            &schema,
1538            ndv_fallback,
1539        )
1540        .unwrap()
1541        .column_statistics[0]
1542            .distinct_count
1543    }
1544
1545    #[test]
1546    fn test_try_merge() {
1547        // Create a schema with two columns
1548        let schema = Arc::new(Schema::new(vec![
1549            Field::new("col1", DataType::Int32, false),
1550            Field::new("col2", DataType::Int32, false),
1551        ]));
1552
1553        // Create items with statistics
1554        let stats1 = Statistics {
1555            num_rows: Precision::Exact(10),
1556            total_byte_size: Precision::Exact(100),
1557            column_statistics: vec![
1558                ColumnStatistics {
1559                    null_count: Precision::Exact(1),
1560                    max_value: Precision::Exact(ScalarValue::Int32(Some(100))),
1561                    min_value: Precision::Exact(ScalarValue::Int32(Some(1))),
1562                    sum_value: Precision::Exact(ScalarValue::Int32(Some(500))),
1563                    distinct_count: Precision::Absent,
1564                    byte_size: Precision::Exact(40),
1565                },
1566                ColumnStatistics {
1567                    null_count: Precision::Exact(2),
1568                    max_value: Precision::Exact(ScalarValue::Int32(Some(200))),
1569                    min_value: Precision::Exact(ScalarValue::Int32(Some(10))),
1570                    sum_value: Precision::Exact(ScalarValue::Int32(Some(1000))),
1571                    distinct_count: Precision::Absent,
1572                    byte_size: Precision::Exact(40),
1573                },
1574            ],
1575        };
1576
1577        let stats2 = Statistics {
1578            num_rows: Precision::Exact(15),
1579            total_byte_size: Precision::Exact(150),
1580            column_statistics: vec![
1581                ColumnStatistics {
1582                    null_count: Precision::Exact(2),
1583                    max_value: Precision::Exact(ScalarValue::Int32(Some(120))),
1584                    min_value: Precision::Exact(ScalarValue::Int32(Some(-10))),
1585                    sum_value: Precision::Exact(ScalarValue::Int32(Some(600))),
1586                    distinct_count: Precision::Absent,
1587                    byte_size: Precision::Exact(60),
1588                },
1589                ColumnStatistics {
1590                    null_count: Precision::Exact(3),
1591                    max_value: Precision::Exact(ScalarValue::Int32(Some(180))),
1592                    min_value: Precision::Exact(ScalarValue::Int32(Some(5))),
1593                    sum_value: Precision::Exact(ScalarValue::Int32(Some(1200))),
1594                    distinct_count: Precision::Absent,
1595                    byte_size: Precision::Exact(60),
1596                },
1597            ],
1598        };
1599
1600        let items = vec![stats1, stats2];
1601
1602        let summary_stats = Statistics::try_merge_iter(&items, &schema).unwrap();
1603
1604        // Verify the results
1605        assert_eq!(summary_stats.num_rows, Precision::Exact(25)); // 10 + 15
1606        assert_eq!(summary_stats.total_byte_size, Precision::Exact(250)); // 100 + 150
1607
1608        // Verify column statistics
1609        let col1_stats = &summary_stats.column_statistics[0];
1610        assert_eq!(col1_stats.null_count, Precision::Exact(3)); // 1 + 2
1611        assert_eq!(
1612            col1_stats.max_value,
1613            Precision::Exact(ScalarValue::Int32(Some(120)))
1614        );
1615        assert_eq!(
1616            col1_stats.min_value,
1617            Precision::Exact(ScalarValue::Int32(Some(-10)))
1618        );
1619        assert_eq!(
1620            col1_stats.sum_value,
1621            Precision::Exact(ScalarValue::Int64(Some(1100)))
1622        ); // 500 + 600
1623
1624        let col2_stats = &summary_stats.column_statistics[1];
1625        assert_eq!(col2_stats.null_count, Precision::Exact(5)); // 2 + 3
1626        assert_eq!(
1627            col2_stats.max_value,
1628            Precision::Exact(ScalarValue::Int32(Some(200)))
1629        );
1630        assert_eq!(
1631            col2_stats.min_value,
1632            Precision::Exact(ScalarValue::Int32(Some(5)))
1633        );
1634        assert_eq!(
1635            col2_stats.sum_value,
1636            Precision::Exact(ScalarValue::Int64(Some(2200)))
1637        ); // 1000 + 1200
1638    }
1639
1640    #[test]
1641    fn test_try_merge_mixed_precision() {
1642        // Create a schema with one column
1643        let schema = Arc::new(Schema::new(vec![Field::new(
1644            "col1",
1645            DataType::Int32,
1646            false,
1647        )]));
1648
1649        // Create items with different precision levels
1650        let stats1 = Statistics {
1651            num_rows: Precision::Exact(10),
1652            total_byte_size: Precision::Inexact(100),
1653            column_statistics: vec![ColumnStatistics {
1654                null_count: Precision::Exact(1),
1655                max_value: Precision::Exact(ScalarValue::Int32(Some(100))),
1656                min_value: Precision::Inexact(ScalarValue::Int32(Some(1))),
1657                sum_value: Precision::Exact(ScalarValue::Int32(Some(500))),
1658                distinct_count: Precision::Absent,
1659                byte_size: Precision::Exact(40),
1660            }],
1661        };
1662
1663        let stats2 = Statistics {
1664            num_rows: Precision::Inexact(15),
1665            total_byte_size: Precision::Exact(150),
1666            column_statistics: vec![ColumnStatistics {
1667                null_count: Precision::Inexact(2),
1668                max_value: Precision::Inexact(ScalarValue::Int32(Some(120))),
1669                min_value: Precision::Exact(ScalarValue::Int32(Some(-10))),
1670                sum_value: Precision::Absent,
1671                distinct_count: Precision::Absent,
1672                byte_size: Precision::Inexact(60),
1673            }],
1674        };
1675
1676        let items = vec![stats1, stats2];
1677
1678        let summary_stats = Statistics::try_merge_iter(&items, &schema).unwrap();
1679
1680        assert_eq!(summary_stats.num_rows, Precision::Inexact(25));
1681        assert_eq!(summary_stats.total_byte_size, Precision::Inexact(250));
1682
1683        let col_stats = &summary_stats.column_statistics[0];
1684        assert_eq!(col_stats.null_count, Precision::Inexact(3));
1685        assert_eq!(
1686            col_stats.max_value,
1687            Precision::Inexact(ScalarValue::Int32(Some(120)))
1688        );
1689        assert_eq!(
1690            col_stats.min_value,
1691            Precision::Inexact(ScalarValue::Int32(Some(-10)))
1692        );
1693        assert_eq!(col_stats.sum_value, Precision::Absent);
1694    }
1695
1696    #[test]
1697    fn test_try_merge_empty() {
1698        let schema = Arc::new(Schema::new(vec![Field::new(
1699            "col1",
1700            DataType::Int32,
1701            false,
1702        )]));
1703
1704        // Empty collection
1705        let items: Vec<Statistics> = vec![];
1706
1707        let summary_stats = Statistics::try_merge_iter(&items, &schema).unwrap();
1708
1709        // Verify default values for empty collection
1710        assert_eq!(summary_stats.num_rows, Precision::Absent);
1711        assert_eq!(summary_stats.total_byte_size, Precision::Absent);
1712        assert_eq!(summary_stats.column_statistics.len(), 1);
1713        assert_eq!(
1714            summary_stats.column_statistics[0].null_count,
1715            Precision::Absent
1716        );
1717    }
1718
1719    #[test]
1720    fn test_try_merge_mismatched_size() {
1721        // Create a schema with one column
1722        let schema = Arc::new(Schema::new(vec![Field::new(
1723            "col1",
1724            DataType::Int32,
1725            false,
1726        )]));
1727
1728        // No column statistics
1729        let stats1 = Statistics::default();
1730
1731        let stats2 =
1732            Statistics::default().add_column_statistics(ColumnStatistics::new_unknown());
1733
1734        let items = vec![stats1, stats2];
1735
1736        let e = Statistics::try_merge_iter(&items, &schema).unwrap_err();
1737        assert_contains!(
1738            e.to_string(),
1739            "Error during planning: Cannot merge statistics with different number of columns: 0 vs 1"
1740        );
1741    }
1742
1743    #[test]
1744    fn test_try_merge_distinct_count_absent() {
1745        // Create statistics with known distinct counts
1746        let stats1 = Statistics::default()
1747            .with_num_rows(Precision::Exact(10))
1748            .with_total_byte_size(Precision::Exact(100))
1749            .add_column_statistics(
1750                ColumnStatistics::new_unknown()
1751                    .with_null_count(Precision::Exact(0))
1752                    .with_min_value(Precision::Exact(ScalarValue::Int32(Some(1))))
1753                    .with_max_value(Precision::Exact(ScalarValue::Int32(Some(10))))
1754                    .with_distinct_count(Precision::Exact(5)),
1755            );
1756
1757        let stats2 = Statistics::default()
1758            .with_num_rows(Precision::Exact(15))
1759            .with_total_byte_size(Precision::Exact(150))
1760            .add_column_statistics(
1761                ColumnStatistics::new_unknown()
1762                    .with_null_count(Precision::Exact(0))
1763                    .with_min_value(Precision::Exact(ScalarValue::Int32(Some(5))))
1764                    .with_max_value(Precision::Exact(ScalarValue::Int32(Some(20))))
1765                    .with_distinct_count(Precision::Exact(7)),
1766            );
1767
1768        // Merge statistics
1769        let schema = Schema::new(vec![Field::new("a", DataType::Int32, true)]);
1770        let merged_stats =
1771            Statistics::try_merge_iter([&stats1, &stats2], &schema).unwrap();
1772
1773        // Verify the results
1774        assert_eq!(merged_stats.num_rows, Precision::Exact(25));
1775        assert_eq!(merged_stats.total_byte_size, Precision::Exact(250));
1776
1777        let col_stats = &merged_stats.column_statistics[0];
1778        assert_eq!(col_stats.null_count, Precision::Exact(0));
1779        assert_eq!(
1780            col_stats.min_value,
1781            Precision::Exact(ScalarValue::Int32(Some(1)))
1782        );
1783        assert_eq!(
1784            col_stats.max_value,
1785            Precision::Exact(ScalarValue::Int32(Some(20)))
1786        );
1787        // Overlap-based NDV: ranges [1,10] and [5,20], overlap [5,10]
1788        // range_left=9, range_right=15, overlap=5
1789        // overlap_left=5*(5/9)=2.78, overlap_right=7*(5/15)=2.33
1790        // result = max(2.78, 2.33) + (5-2.78) + (7-2.33) = 9.67 -> 10
1791        assert_eq!(col_stats.distinct_count, Precision::Inexact(10));
1792    }
1793
1794    #[test]
1795    fn test_try_merge_ndv_disjoint_ranges() {
1796        let stats1 = Statistics::default()
1797            .with_num_rows(Precision::Exact(10))
1798            .add_column_statistics(
1799                ColumnStatistics::new_unknown()
1800                    .with_min_value(Precision::Exact(ScalarValue::Int32(Some(0))))
1801                    .with_max_value(Precision::Exact(ScalarValue::Int32(Some(10))))
1802                    .with_distinct_count(Precision::Exact(5)),
1803            );
1804        let stats2 = Statistics::default()
1805            .with_num_rows(Precision::Exact(10))
1806            .add_column_statistics(
1807                ColumnStatistics::new_unknown()
1808                    .with_min_value(Precision::Exact(ScalarValue::Int32(Some(20))))
1809                    .with_max_value(Precision::Exact(ScalarValue::Int32(Some(30))))
1810                    .with_distinct_count(Precision::Exact(8)),
1811            );
1812
1813        let schema = Schema::new(vec![Field::new("a", DataType::Int32, true)]);
1814        let merged = Statistics::try_merge_iter([&stats1, &stats2], &schema).unwrap();
1815        // No overlap -> sum of NDVs
1816        assert_eq!(
1817            merged.column_statistics[0].distinct_count,
1818            Precision::Inexact(13)
1819        );
1820    }
1821
1822    #[test]
1823    fn test_try_merge_ndv_identical_ranges() {
1824        let stats1 = Statistics::default()
1825            .with_num_rows(Precision::Exact(100))
1826            .add_column_statistics(
1827                ColumnStatistics::new_unknown()
1828                    .with_min_value(Precision::Exact(ScalarValue::Int32(Some(0))))
1829                    .with_max_value(Precision::Exact(ScalarValue::Int32(Some(100))))
1830                    .with_distinct_count(Precision::Exact(50)),
1831            );
1832        let stats2 = Statistics::default()
1833            .with_num_rows(Precision::Exact(100))
1834            .add_column_statistics(
1835                ColumnStatistics::new_unknown()
1836                    .with_min_value(Precision::Exact(ScalarValue::Int32(Some(0))))
1837                    .with_max_value(Precision::Exact(ScalarValue::Int32(Some(100))))
1838                    .with_distinct_count(Precision::Exact(30)),
1839            );
1840
1841        let schema = Schema::new(vec![Field::new("a", DataType::Int32, true)]);
1842        let merged = Statistics::try_merge_iter([&stats1, &stats2], &schema).unwrap();
1843        // Full overlap -> max(50, 30) = 50
1844        assert_eq!(
1845            merged.column_statistics[0].distinct_count,
1846            Precision::Inexact(50)
1847        );
1848    }
1849
1850    #[test]
1851    fn test_try_merge_ndv_partial_overlap() {
1852        let stats1 = Statistics::default()
1853            .with_num_rows(Precision::Exact(100))
1854            .add_column_statistics(
1855                ColumnStatistics::new_unknown()
1856                    .with_min_value(Precision::Exact(ScalarValue::Int32(Some(0))))
1857                    .with_max_value(Precision::Exact(ScalarValue::Int32(Some(100))))
1858                    .with_distinct_count(Precision::Exact(80)),
1859            );
1860        let stats2 = Statistics::default()
1861            .with_num_rows(Precision::Exact(100))
1862            .add_column_statistics(
1863                ColumnStatistics::new_unknown()
1864                    .with_min_value(Precision::Exact(ScalarValue::Int32(Some(50))))
1865                    .with_max_value(Precision::Exact(ScalarValue::Int32(Some(150))))
1866                    .with_distinct_count(Precision::Exact(60)),
1867            );
1868
1869        let schema = Schema::new(vec![Field::new("a", DataType::Int32, true)]);
1870        let merged = Statistics::try_merge_iter([&stats1, &stats2], &schema).unwrap();
1871        // overlap=[50,100], range_left=100, range_right=100, overlap_range=50
1872        // overlap_left=80*(50/100)=40, overlap_right=60*(50/100)=30
1873        // result = max(40,30) + (80-40) + (60-30) = 40 + 40 + 30 = 110
1874        assert_eq!(
1875            merged.column_statistics[0].distinct_count,
1876            Precision::Inexact(110)
1877        );
1878    }
1879
1880    #[test]
1881    fn test_try_merge_ndv_missing_min_max() {
1882        let stats1 = Statistics::default()
1883            .with_num_rows(Precision::Exact(10))
1884            .add_column_statistics(
1885                ColumnStatistics::new_unknown().with_distinct_count(Precision::Exact(5)),
1886            );
1887        let stats2 = Statistics::default()
1888            .with_num_rows(Precision::Exact(10))
1889            .add_column_statistics(
1890                ColumnStatistics::new_unknown().with_distinct_count(Precision::Exact(8)),
1891            );
1892
1893        let schema = Schema::new(vec![Field::new("a", DataType::Int32, true)]);
1894        let merged = Statistics::try_merge_iter([&stats1, &stats2], &schema).unwrap();
1895        // No min/max -> default fallback is max
1896        assert_eq!(
1897            merged.column_statistics[0].distinct_count,
1898            Precision::Inexact(8)
1899        );
1900    }
1901
1902    #[test]
1903    fn test_try_merge_ndv_non_numeric_types() {
1904        let stats1 = Statistics::default()
1905            .with_num_rows(Precision::Exact(10))
1906            .add_column_statistics(
1907                ColumnStatistics::new_unknown()
1908                    .with_min_value(Precision::Exact(ScalarValue::Utf8(Some(
1909                        "aaa".to_string(),
1910                    ))))
1911                    .with_max_value(Precision::Exact(ScalarValue::Utf8(Some(
1912                        "zzz".to_string(),
1913                    ))))
1914                    .with_distinct_count(Precision::Exact(5)),
1915            );
1916        let stats2 = Statistics::default()
1917            .with_num_rows(Precision::Exact(10))
1918            .add_column_statistics(
1919                ColumnStatistics::new_unknown()
1920                    .with_min_value(Precision::Exact(ScalarValue::Utf8(Some(
1921                        "bbb".to_string(),
1922                    ))))
1923                    .with_max_value(Precision::Exact(ScalarValue::Utf8(Some(
1924                        "yyy".to_string(),
1925                    ))))
1926                    .with_distinct_count(Precision::Exact(8)),
1927            );
1928
1929        let schema = Schema::new(vec![Field::new("a", DataType::Utf8, true)]);
1930        let merged = Statistics::try_merge_iter([&stats1, &stats2], &schema).unwrap();
1931        // distance() unsupported for strings -> default fallback is max
1932        assert_eq!(
1933            merged.column_statistics[0].distinct_count,
1934            Precision::Inexact(8)
1935        );
1936    }
1937
1938    #[test]
1939    fn test_try_merge_ndv_non_numeric_types_sum_fallback() {
1940        let stats1 = Statistics::default()
1941            .with_num_rows(Precision::Exact(10))
1942            .add_column_statistics(
1943                ColumnStatistics::new_unknown()
1944                    .with_min_value(Precision::Exact(ScalarValue::Utf8(Some(
1945                        "aaa".to_string(),
1946                    ))))
1947                    .with_max_value(Precision::Exact(ScalarValue::Utf8(Some(
1948                        "zzz".to_string(),
1949                    ))))
1950                    .with_distinct_count(Precision::Exact(5)),
1951            );
1952        let stats2 = Statistics::default()
1953            .with_num_rows(Precision::Exact(10))
1954            .add_column_statistics(
1955                ColumnStatistics::new_unknown()
1956                    .with_min_value(Precision::Exact(ScalarValue::Utf8(Some(
1957                        "bbb".to_string(),
1958                    ))))
1959                    .with_max_value(Precision::Exact(ScalarValue::Utf8(Some(
1960                        "yyy".to_string(),
1961                    ))))
1962                    .with_distinct_count(Precision::Exact(8)),
1963            );
1964
1965        let schema = Schema::new(vec![Field::new("a", DataType::Utf8, true)]);
1966        let merged = Statistics::try_merge_iter_with_ndv_fallback(
1967            [&stats1, &stats2],
1968            &schema,
1969            NdvFallback::Sum,
1970        )
1971        .unwrap();
1972
1973        // distance() unsupported for strings -> sum fallback is caller-selected
1974        assert_eq!(
1975            merged.column_statistics[0].distinct_count,
1976            Precision::Inexact(13)
1977        );
1978    }
1979
1980    #[test]
1981    fn test_try_merge_ndv_constant_columns() {
1982        // Same constant: [5,5]+[5,5] -> max
1983        let stats1 = Statistics::default()
1984            .with_num_rows(Precision::Exact(10))
1985            .add_column_statistics(
1986                ColumnStatistics::new_unknown()
1987                    .with_min_value(Precision::Exact(ScalarValue::Int32(Some(5))))
1988                    .with_max_value(Precision::Exact(ScalarValue::Int32(Some(5))))
1989                    .with_distinct_count(Precision::Exact(1)),
1990            );
1991        let stats2 = Statistics::default()
1992            .with_num_rows(Precision::Exact(10))
1993            .add_column_statistics(
1994                ColumnStatistics::new_unknown()
1995                    .with_min_value(Precision::Exact(ScalarValue::Int32(Some(5))))
1996                    .with_max_value(Precision::Exact(ScalarValue::Int32(Some(5))))
1997                    .with_distinct_count(Precision::Exact(1)),
1998            );
1999
2000        let schema = Schema::new(vec![Field::new("a", DataType::Int32, true)]);
2001        let merged = Statistics::try_merge_iter([&stats1, &stats2], &schema).unwrap();
2002        assert_eq!(
2003            merged.column_statistics[0].distinct_count,
2004            Precision::Inexact(1)
2005        );
2006
2007        // Different constants: [5,5]+[10,10] -> sum
2008        let stats3 = Statistics::default()
2009            .with_num_rows(Precision::Exact(10))
2010            .add_column_statistics(
2011                ColumnStatistics::new_unknown()
2012                    .with_min_value(Precision::Exact(ScalarValue::Int32(Some(5))))
2013                    .with_max_value(Precision::Exact(ScalarValue::Int32(Some(5))))
2014                    .with_distinct_count(Precision::Exact(1)),
2015            );
2016        let stats4 = Statistics::default()
2017            .with_num_rows(Precision::Exact(10))
2018            .add_column_statistics(
2019                ColumnStatistics::new_unknown()
2020                    .with_min_value(Precision::Exact(ScalarValue::Int32(Some(10))))
2021                    .with_max_value(Precision::Exact(ScalarValue::Int32(Some(10))))
2022                    .with_distinct_count(Precision::Exact(1)),
2023            );
2024
2025        let merged = Statistics::try_merge_iter([&stats3, &stats4], &schema).unwrap();
2026        assert_eq!(
2027            merged.column_statistics[0].distinct_count,
2028            Precision::Inexact(2)
2029        );
2030    }
2031
2032    #[test]
2033    fn test_try_merge_ndv_original_union_edge_cases() {
2034        struct NdvTestCase {
2035            name: &'static str,
2036            left_ndv: Precision<usize>,
2037            left_min: Option<i64>,
2038            left_max: Option<i64>,
2039            right_ndv: Precision<usize>,
2040            right_min: Option<i64>,
2041            right_max: Option<i64>,
2042            expected: Precision<usize>,
2043        }
2044
2045        let cases = vec![
2046            NdvTestCase {
2047                name: "disjoint ranges",
2048                left_ndv: Precision::Exact(5),
2049                left_min: Some(0),
2050                left_max: Some(10),
2051                right_ndv: Precision::Exact(3),
2052                right_min: Some(20),
2053                right_max: Some(30),
2054                expected: Precision::Inexact(8),
2055            },
2056            NdvTestCase {
2057                name: "identical ranges",
2058                left_ndv: Precision::Exact(10),
2059                left_min: Some(0),
2060                left_max: Some(100),
2061                right_ndv: Precision::Exact(8),
2062                right_min: Some(0),
2063                right_max: Some(100),
2064                expected: Precision::Inexact(10),
2065            },
2066            NdvTestCase {
2067                name: "partial overlap",
2068                left_ndv: Precision::Exact(100),
2069                left_min: Some(0),
2070                left_max: Some(100),
2071                right_ndv: Precision::Exact(50),
2072                right_min: Some(50),
2073                right_max: Some(150),
2074                expected: Precision::Inexact(125),
2075            },
2076            NdvTestCase {
2077                name: "right contained in left",
2078                left_ndv: Precision::Exact(100),
2079                left_min: Some(0),
2080                left_max: Some(100),
2081                right_ndv: Precision::Exact(50),
2082                right_min: Some(25),
2083                right_max: Some(75),
2084                expected: Precision::Inexact(100),
2085            },
2086            NdvTestCase {
2087                name: "same constant value",
2088                left_ndv: Precision::Exact(1),
2089                left_min: Some(5),
2090                left_max: Some(5),
2091                right_ndv: Precision::Exact(1),
2092                right_min: Some(5),
2093                right_max: Some(5),
2094                expected: Precision::Inexact(1),
2095            },
2096            NdvTestCase {
2097                name: "different constant values",
2098                left_ndv: Precision::Exact(1),
2099                left_min: Some(5),
2100                left_max: Some(5),
2101                right_ndv: Precision::Exact(1),
2102                right_min: Some(10),
2103                right_max: Some(10),
2104                expected: Precision::Inexact(2),
2105            },
2106            NdvTestCase {
2107                name: "left constant within right range",
2108                left_ndv: Precision::Exact(1),
2109                left_min: Some(5),
2110                left_max: Some(5),
2111                right_ndv: Precision::Exact(10),
2112                right_min: Some(0),
2113                right_max: Some(10),
2114                expected: Precision::Inexact(10),
2115            },
2116            NdvTestCase {
2117                name: "left constant outside right range",
2118                left_ndv: Precision::Exact(1),
2119                left_min: Some(20),
2120                left_max: Some(20),
2121                right_ndv: Precision::Exact(10),
2122                right_min: Some(0),
2123                right_max: Some(10),
2124                expected: Precision::Inexact(11),
2125            },
2126            NdvTestCase {
2127                name: "right constant within left range",
2128                left_ndv: Precision::Exact(10),
2129                left_min: Some(0),
2130                left_max: Some(10),
2131                right_ndv: Precision::Exact(1),
2132                right_min: Some(5),
2133                right_max: Some(5),
2134                expected: Precision::Inexact(10),
2135            },
2136            NdvTestCase {
2137                name: "right constant outside left range",
2138                left_ndv: Precision::Exact(10),
2139                left_min: Some(0),
2140                left_max: Some(10),
2141                right_ndv: Precision::Exact(1),
2142                right_min: Some(20),
2143                right_max: Some(20),
2144                expected: Precision::Inexact(11),
2145            },
2146            NdvTestCase {
2147                name: "missing bounds exact plus exact",
2148                left_ndv: Precision::Exact(10),
2149                left_min: None,
2150                left_max: None,
2151                right_ndv: Precision::Exact(5),
2152                right_min: None,
2153                right_max: None,
2154                expected: Precision::Inexact(15),
2155            },
2156            NdvTestCase {
2157                name: "missing bounds exact plus inexact",
2158                left_ndv: Precision::Exact(10),
2159                left_min: None,
2160                left_max: None,
2161                right_ndv: Precision::Inexact(5),
2162                right_min: None,
2163                right_max: None,
2164                expected: Precision::Inexact(15),
2165            },
2166            NdvTestCase {
2167                name: "missing bounds inexact plus inexact",
2168                left_ndv: Precision::Inexact(7),
2169                left_min: None,
2170                left_max: None,
2171                right_ndv: Precision::Inexact(3),
2172                right_min: None,
2173                right_max: None,
2174                expected: Precision::Inexact(10),
2175            },
2176            NdvTestCase {
2177                name: "exact plus absent",
2178                left_ndv: Precision::Exact(10),
2179                left_min: None,
2180                left_max: None,
2181                right_ndv: Precision::Absent,
2182                right_min: None,
2183                right_max: None,
2184                expected: Precision::Absent,
2185            },
2186            NdvTestCase {
2187                name: "inexact plus absent",
2188                left_ndv: Precision::Inexact(4),
2189                left_min: None,
2190                left_max: None,
2191                right_ndv: Precision::Absent,
2192                right_min: None,
2193                right_max: None,
2194                expected: Precision::Absent,
2195            },
2196        ];
2197
2198        for case in cases {
2199            let actual = merge_single_i64_ndv_distinct_count(
2200                make_single_i64_ndv_stats(case.left_ndv, case.left_min, case.left_max),
2201                make_single_i64_ndv_stats(case.right_ndv, case.right_min, case.right_max),
2202                NdvFallback::Sum,
2203            );
2204
2205            assert_eq!(actual, case.expected, "case {} failed", case.name);
2206        }
2207    }
2208
2209    #[test]
2210    fn test_with_fetch_basic_preservation() {
2211        // Test that column statistics and byte size are preserved (as inexact) when applying fetch
2212        let original_stats = Statistics {
2213            num_rows: Precision::Exact(1000),
2214            total_byte_size: Precision::Exact(8000),
2215            column_statistics: vec![
2216                ColumnStatistics {
2217                    null_count: Precision::Exact(10),
2218                    max_value: Precision::Exact(ScalarValue::Int32(Some(100))),
2219                    min_value: Precision::Exact(ScalarValue::Int32(Some(0))),
2220                    sum_value: Precision::Exact(ScalarValue::Int32(Some(5050))),
2221                    distinct_count: Precision::Exact(50),
2222                    byte_size: Precision::Exact(4000),
2223                },
2224                ColumnStatistics {
2225                    null_count: Precision::Exact(20),
2226                    max_value: Precision::Exact(ScalarValue::Int64(Some(200))),
2227                    min_value: Precision::Exact(ScalarValue::Int64(Some(10))),
2228                    sum_value: Precision::Exact(ScalarValue::Int64(Some(10100))),
2229                    distinct_count: Precision::Exact(75),
2230                    byte_size: Precision::Exact(8000),
2231                },
2232            ],
2233        };
2234
2235        // Apply fetch of 100 rows (10% of original)
2236        let result = original_stats.clone().with_fetch(Some(100), 0, 1).unwrap();
2237
2238        // Check num_rows
2239        assert_eq!(result.num_rows, Precision::Exact(100));
2240
2241        // Check total_byte_size is computed as sum of scaled column byte_size values
2242        // Column 1: 4000 * 0.1 = 400, Column 2: 8000 * 0.1 = 800, Sum = 1200
2243        assert_eq!(result.total_byte_size, Precision::Inexact(1200));
2244
2245        // Check column statistics are preserved but marked as inexact
2246        assert_eq!(result.column_statistics.len(), 2);
2247
2248        // First column
2249        assert_eq!(
2250            result.column_statistics[0].null_count,
2251            Precision::Inexact(10)
2252        );
2253        assert_eq!(
2254            result.column_statistics[0].max_value,
2255            Precision::Inexact(ScalarValue::Int32(Some(100)))
2256        );
2257        assert_eq!(
2258            result.column_statistics[0].min_value,
2259            Precision::Inexact(ScalarValue::Int32(Some(0)))
2260        );
2261        assert_eq!(
2262            result.column_statistics[0].sum_value,
2263            Precision::Inexact(ScalarValue::Int32(Some(5050)))
2264        );
2265        assert_eq!(
2266            result.column_statistics[0].distinct_count,
2267            Precision::Inexact(50)
2268        );
2269
2270        // Second column
2271        assert_eq!(
2272            result.column_statistics[1].null_count,
2273            Precision::Inexact(20)
2274        );
2275        assert_eq!(
2276            result.column_statistics[1].max_value,
2277            Precision::Inexact(ScalarValue::Int64(Some(200)))
2278        );
2279        assert_eq!(
2280            result.column_statistics[1].min_value,
2281            Precision::Inexact(ScalarValue::Int64(Some(10)))
2282        );
2283        assert_eq!(
2284            result.column_statistics[1].sum_value,
2285            Precision::Inexact(ScalarValue::Int64(Some(10100)))
2286        );
2287        assert_eq!(
2288            result.column_statistics[1].distinct_count,
2289            Precision::Inexact(75)
2290        );
2291    }
2292
2293    #[test]
2294    fn test_with_fetch_inexact_input() {
2295        // Test that inexact input statistics remain inexact
2296        let original_stats = Statistics {
2297            num_rows: Precision::Inexact(1000),
2298            total_byte_size: Precision::Inexact(8000),
2299            column_statistics: vec![ColumnStatistics {
2300                null_count: Precision::Inexact(10),
2301                max_value: Precision::Inexact(ScalarValue::Int32(Some(100))),
2302                min_value: Precision::Inexact(ScalarValue::Int32(Some(0))),
2303                sum_value: Precision::Inexact(ScalarValue::Int32(Some(5050))),
2304                distinct_count: Precision::Inexact(50),
2305                byte_size: Precision::Inexact(4000),
2306            }],
2307        };
2308
2309        let result = original_stats.clone().with_fetch(Some(500), 0, 1).unwrap();
2310
2311        // Check num_rows is inexact
2312        assert_eq!(result.num_rows, Precision::Inexact(500));
2313
2314        // Check total_byte_size is computed as sum of scaled column byte_size values
2315        // Column 1: 4000 * 0.5 = 2000, Sum = 2000
2316        assert_eq!(result.total_byte_size, Precision::Inexact(2000));
2317
2318        // Column stats remain inexact
2319        assert_eq!(
2320            result.column_statistics[0].null_count,
2321            Precision::Inexact(10)
2322        );
2323    }
2324
2325    #[test]
2326    fn test_with_fetch_skip_all_rows() {
2327        // Test when skip >= num_rows (all rows are skipped)
2328        let original_stats = Statistics {
2329            num_rows: Precision::Exact(100),
2330            total_byte_size: Precision::Exact(800),
2331            column_statistics: vec![col_stats_i64(10)],
2332        };
2333
2334        let result = original_stats.clone().with_fetch(Some(50), 100, 1).unwrap();
2335
2336        assert_eq!(result.num_rows, Precision::Exact(0));
2337        // When ratio is 0/100 = 0, byte size should be 0
2338        assert_eq!(result.total_byte_size, Precision::Inexact(0));
2339    }
2340
2341    #[test]
2342    fn test_with_fetch_skip_all_rows_inexact() {
2343        // When the input num_rows is Inexact (an upper-bound estimate), an
2344        // `nr <= skip` outcome must remain Inexact: the estimate could be
2345        // wrong, so we cannot promote 0 to Exact.
2346        let original_stats = Statistics {
2347            num_rows: Precision::Inexact(0),
2348            total_byte_size: Precision::Inexact(0),
2349            column_statistics: vec![col_stats_i64(10)],
2350        };
2351
2352        let result = original_stats.clone().with_fetch(None, 0, 1).unwrap();
2353
2354        assert_eq!(result.num_rows, Precision::Inexact(0));
2355    }
2356
2357    #[test]
2358    fn test_with_fetch_no_limit() {
2359        // Test when fetch is None and skip is 0 (no limit applied)
2360        let original_stats = Statistics {
2361            num_rows: Precision::Exact(100),
2362            total_byte_size: Precision::Exact(800),
2363            column_statistics: vec![col_stats_i64(10)],
2364        };
2365
2366        let result = original_stats.clone().with_fetch(None, 0, 1).unwrap();
2367
2368        // Stats should be unchanged when no fetch and no skip
2369        assert_eq!(result.num_rows, Precision::Exact(100));
2370        assert_eq!(result.total_byte_size, Precision::Exact(800));
2371    }
2372
2373    #[test]
2374    fn test_with_fetch_with_skip() {
2375        // Test with both skip and fetch
2376        let original_stats = Statistics {
2377            num_rows: Precision::Exact(1000),
2378            total_byte_size: Precision::Exact(8000),
2379            column_statistics: vec![col_stats_i64(10)],
2380        };
2381
2382        // Skip 200, fetch 300, so we get rows 200-500
2383        let result = original_stats
2384            .clone()
2385            .with_fetch(Some(300), 200, 1)
2386            .unwrap();
2387
2388        assert_eq!(result.num_rows, Precision::Exact(300));
2389        // Column 1: byte_size 800 * (300/500) = 240, Sum = 240
2390        assert_eq!(result.total_byte_size, Precision::Inexact(240));
2391    }
2392
2393    #[test]
2394    fn test_with_fetch_multi_partition() {
2395        // Test with multiple partitions
2396        let original_stats = Statistics {
2397            num_rows: Precision::Exact(1000), // per partition
2398            total_byte_size: Precision::Exact(8000),
2399            column_statistics: vec![col_stats_i64(10)],
2400        };
2401
2402        // Fetch 100 per partition, 4 partitions = 400 total
2403        let result = original_stats.clone().with_fetch(Some(100), 0, 4).unwrap();
2404
2405        assert_eq!(result.num_rows, Precision::Exact(400));
2406        // Column 1: byte_size 800 * 0.4 = 320, Sum = 320
2407        assert_eq!(result.total_byte_size, Precision::Inexact(320));
2408    }
2409
2410    #[test]
2411    fn test_with_fetch_absent_stats() {
2412        // Test with absent statistics
2413        let original_stats = Statistics {
2414            num_rows: Precision::Absent,
2415            total_byte_size: Precision::Absent,
2416            column_statistics: vec![ColumnStatistics {
2417                null_count: Precision::Absent,
2418                max_value: Precision::Absent,
2419                min_value: Precision::Absent,
2420                sum_value: Precision::Absent,
2421                distinct_count: Precision::Absent,
2422                byte_size: Precision::Absent,
2423            }],
2424        };
2425
2426        let result = original_stats.clone().with_fetch(Some(100), 0, 1).unwrap();
2427
2428        // With absent input stats, output should be inexact estimate
2429        assert_eq!(result.num_rows, Precision::Inexact(100));
2430        assert_eq!(result.total_byte_size, Precision::Absent);
2431        // Column stats should remain absent
2432        assert_eq!(result.column_statistics[0].null_count, Precision::Absent);
2433    }
2434
2435    #[test]
2436    fn test_with_fetch_fetch_exceeds_rows() {
2437        // Test when fetch is larger than available rows after skip
2438        let original_stats = Statistics {
2439            num_rows: Precision::Exact(100),
2440            total_byte_size: Precision::Exact(800),
2441            column_statistics: vec![col_stats_i64(10)],
2442        };
2443
2444        // Skip 50, fetch 100, but only 50 rows remain
2445        let result = original_stats.clone().with_fetch(Some(100), 50, 1).unwrap();
2446
2447        assert_eq!(result.num_rows, Precision::Exact(50));
2448        // 50/100 = 0.5, so 800 * 0.5 = 400
2449        assert_eq!(result.total_byte_size, Precision::Inexact(400));
2450    }
2451
2452    #[test]
2453    fn test_with_fetch_preserves_all_column_stats() {
2454        // Comprehensive test that all column statistic fields are preserved
2455        let original_col_stats = ColumnStatistics {
2456            null_count: Precision::Exact(42),
2457            max_value: Precision::Exact(ScalarValue::Int32(Some(999))),
2458            min_value: Precision::Exact(ScalarValue::Int32(Some(-100))),
2459            sum_value: Precision::Exact(ScalarValue::Int32(Some(123456))),
2460            distinct_count: Precision::Exact(789),
2461            byte_size: Precision::Exact(4000),
2462        };
2463
2464        let original_stats = Statistics {
2465            num_rows: Precision::Exact(1000),
2466            total_byte_size: Precision::Exact(8000),
2467            column_statistics: vec![original_col_stats.clone()],
2468        };
2469
2470        let result = original_stats.with_fetch(Some(250), 0, 1).unwrap();
2471
2472        let result_col_stats = &result.column_statistics[0];
2473
2474        // All values should be preserved but marked as inexact
2475        assert_eq!(result_col_stats.null_count, Precision::Inexact(42));
2476        assert_eq!(
2477            result_col_stats.max_value,
2478            Precision::Inexact(ScalarValue::Int32(Some(999)))
2479        );
2480        assert_eq!(
2481            result_col_stats.min_value,
2482            Precision::Inexact(ScalarValue::Int32(Some(-100)))
2483        );
2484        assert_eq!(
2485            result_col_stats.sum_value,
2486            Precision::Inexact(ScalarValue::Int32(Some(123456)))
2487        );
2488        // NDV is capped at the new row count (250) since 789 > 250
2489        assert_eq!(result_col_stats.distinct_count, Precision::Inexact(250));
2490    }
2491
2492    #[test]
2493    fn test_byte_size_to_inexact() {
2494        let col_stats = ColumnStatistics {
2495            null_count: Precision::Exact(10),
2496            max_value: Precision::Absent,
2497            min_value: Precision::Absent,
2498            sum_value: Precision::Absent,
2499            distinct_count: Precision::Absent,
2500            byte_size: Precision::Exact(5000),
2501        };
2502
2503        let inexact = col_stats.to_inexact();
2504        assert_eq!(inexact.byte_size, Precision::Inexact(5000));
2505    }
2506
2507    #[test]
2508    fn test_with_byte_size_builder() {
2509        let col_stats =
2510            ColumnStatistics::new_unknown().with_byte_size(Precision::Exact(8192));
2511        assert_eq!(col_stats.byte_size, Precision::Exact(8192));
2512    }
2513
2514    #[test]
2515    fn test_with_sum_value_builder_widens_small_integers() {
2516        let col_stats = ColumnStatistics::new_unknown()
2517            .with_sum_value(Precision::Exact(ScalarValue::UInt32(Some(123))));
2518        assert_eq!(
2519            col_stats.sum_value,
2520            Precision::Exact(ScalarValue::UInt64(Some(123)))
2521        );
2522    }
2523
2524    #[test]
2525    fn test_with_fetch_scales_byte_size() {
2526        // Test that byte_size is scaled by the row ratio in with_fetch
2527        let original_stats = Statistics {
2528            num_rows: Precision::Exact(1000),
2529            total_byte_size: Precision::Exact(8000),
2530            column_statistics: vec![
2531                ColumnStatistics {
2532                    null_count: Precision::Exact(10),
2533                    max_value: Precision::Absent,
2534                    min_value: Precision::Absent,
2535                    sum_value: Precision::Absent,
2536                    distinct_count: Precision::Absent,
2537                    byte_size: Precision::Exact(4000),
2538                },
2539                ColumnStatistics {
2540                    null_count: Precision::Exact(20),
2541                    max_value: Precision::Absent,
2542                    min_value: Precision::Absent,
2543                    sum_value: Precision::Absent,
2544                    distinct_count: Precision::Absent,
2545                    byte_size: Precision::Exact(8000),
2546                },
2547            ],
2548        };
2549
2550        // Apply fetch of 100 rows (10% of original)
2551        let result = original_stats.with_fetch(Some(100), 0, 1).unwrap();
2552
2553        // byte_size should be scaled: 4000 * 0.1 = 400, 8000 * 0.1 = 800
2554        assert_eq!(
2555            result.column_statistics[0].byte_size,
2556            Precision::Inexact(400)
2557        );
2558        assert_eq!(
2559            result.column_statistics[1].byte_size,
2560            Precision::Inexact(800)
2561        );
2562
2563        // total_byte_size should be computed as sum of byte_size values: 400 + 800 = 1200
2564        assert_eq!(result.total_byte_size, Precision::Inexact(1200));
2565    }
2566
2567    #[test]
2568    fn test_with_fetch_total_byte_size_fallback() {
2569        // Test that total_byte_size falls back to scaling when not all columns have byte_size
2570        let original_stats = Statistics {
2571            num_rows: Precision::Exact(1000),
2572            total_byte_size: Precision::Exact(8000),
2573            column_statistics: vec![
2574                ColumnStatistics {
2575                    null_count: Precision::Exact(10),
2576                    max_value: Precision::Absent,
2577                    min_value: Precision::Absent,
2578                    sum_value: Precision::Absent,
2579                    distinct_count: Precision::Absent,
2580                    byte_size: Precision::Exact(4000),
2581                },
2582                ColumnStatistics {
2583                    null_count: Precision::Exact(20),
2584                    max_value: Precision::Absent,
2585                    min_value: Precision::Absent,
2586                    sum_value: Precision::Absent,
2587                    distinct_count: Precision::Absent,
2588                    byte_size: Precision::Absent, // One column has no byte_size
2589                },
2590            ],
2591        };
2592
2593        // Apply fetch of 100 rows (10% of original)
2594        let result = original_stats.with_fetch(Some(100), 0, 1).unwrap();
2595
2596        // total_byte_size should fall back to scaling: 8000 * 0.1 = 800
2597        assert_eq!(result.total_byte_size, Precision::Inexact(800));
2598    }
2599
2600    #[test]
2601    fn test_with_fetch_caps_ndv_at_row_count() {
2602        // NDV=500 but after LIMIT 10, NDV should be capped at 10
2603        let stats = Statistics {
2604            num_rows: Precision::Exact(1000),
2605            total_byte_size: Precision::Exact(8000),
2606            column_statistics: vec![ColumnStatistics {
2607                distinct_count: Precision::Inexact(500),
2608                ..Default::default()
2609            }],
2610        };
2611
2612        let result = stats.with_fetch(Some(10), 0, 1).unwrap();
2613        assert_eq!(result.num_rows, Precision::Exact(10));
2614        assert_eq!(
2615            result.column_statistics[0].distinct_count,
2616            Precision::Inexact(10)
2617        );
2618    }
2619
2620    #[test]
2621    fn test_with_fetch_caps_ndv_with_skip() {
2622        // 1000 rows, NDV=500, OFFSET 5 LIMIT 10
2623        // with_fetch computes num_rows = min(1000 - 5, 10) = 10
2624        // NDV should be capped at 10
2625        let stats = Statistics {
2626            num_rows: Precision::Exact(1000),
2627            total_byte_size: Precision::Exact(8000),
2628            column_statistics: vec![ColumnStatistics {
2629                distinct_count: Precision::Inexact(500),
2630                ..Default::default()
2631            }],
2632        };
2633
2634        let result = stats.with_fetch(Some(10), 5, 1).unwrap();
2635        assert_eq!(result.num_rows, Precision::Exact(10));
2636        assert_eq!(
2637            result.column_statistics[0].distinct_count,
2638            Precision::Inexact(10)
2639        );
2640    }
2641
2642    #[test]
2643    fn test_with_fetch_caps_ndv_with_large_skip() {
2644        // 1000 rows, NDV=500, OFFSET 995 LIMIT 100
2645        // with_fetch computes num_rows = min(1000 - 995, 100) = 5
2646        // NDV should be capped at 5
2647        let stats = Statistics {
2648            num_rows: Precision::Exact(1000),
2649            total_byte_size: Precision::Exact(8000),
2650            column_statistics: vec![ColumnStatistics {
2651                distinct_count: Precision::Inexact(500),
2652                ..Default::default()
2653            }],
2654        };
2655
2656        let result = stats.with_fetch(Some(100), 995, 1).unwrap();
2657        assert_eq!(result.num_rows, Precision::Exact(5));
2658        assert_eq!(
2659            result.column_statistics[0].distinct_count,
2660            Precision::Inexact(5)
2661        );
2662    }
2663
2664    #[test]
2665    fn test_with_fetch_ndv_below_row_count_unchanged() {
2666        // NDV=5 and LIMIT 10: NDV should stay at 5
2667        let stats = Statistics {
2668            num_rows: Precision::Exact(1000),
2669            total_byte_size: Precision::Exact(8000),
2670            column_statistics: vec![ColumnStatistics {
2671                distinct_count: Precision::Inexact(5),
2672                ..Default::default()
2673            }],
2674        };
2675
2676        let result = stats.with_fetch(Some(10), 0, 1).unwrap();
2677        assert_eq!(result.num_rows, Precision::Exact(10));
2678        assert_eq!(
2679            result.column_statistics[0].distinct_count,
2680            Precision::Inexact(5)
2681        );
2682    }
2683
2684    #[test]
2685    fn test_try_merge_iter_basic() {
2686        let schema = Arc::new(Schema::new(vec![
2687            Field::new("col1", DataType::Int32, false),
2688            Field::new("col2", DataType::Int32, false),
2689        ]));
2690
2691        let stats1 = Statistics {
2692            num_rows: Precision::Exact(10),
2693            total_byte_size: Precision::Exact(100),
2694            column_statistics: vec![
2695                ColumnStatistics {
2696                    null_count: Precision::Exact(1),
2697                    max_value: Precision::Exact(ScalarValue::Int32(Some(100))),
2698                    min_value: Precision::Exact(ScalarValue::Int32(Some(1))),
2699                    sum_value: Precision::Exact(ScalarValue::Int32(Some(500))),
2700                    distinct_count: Precision::Absent,
2701                    byte_size: Precision::Exact(40),
2702                },
2703                ColumnStatistics {
2704                    null_count: Precision::Exact(2),
2705                    max_value: Precision::Exact(ScalarValue::Int32(Some(200))),
2706                    min_value: Precision::Exact(ScalarValue::Int32(Some(10))),
2707                    sum_value: Precision::Exact(ScalarValue::Int32(Some(1000))),
2708                    distinct_count: Precision::Absent,
2709                    byte_size: Precision::Exact(40),
2710                },
2711            ],
2712        };
2713
2714        let stats2 = Statistics {
2715            num_rows: Precision::Exact(15),
2716            total_byte_size: Precision::Exact(150),
2717            column_statistics: vec![
2718                ColumnStatistics {
2719                    null_count: Precision::Exact(2),
2720                    max_value: Precision::Exact(ScalarValue::Int32(Some(120))),
2721                    min_value: Precision::Exact(ScalarValue::Int32(Some(-10))),
2722                    sum_value: Precision::Exact(ScalarValue::Int32(Some(600))),
2723                    distinct_count: Precision::Absent,
2724                    byte_size: Precision::Exact(60),
2725                },
2726                ColumnStatistics {
2727                    null_count: Precision::Exact(3),
2728                    max_value: Precision::Exact(ScalarValue::Int32(Some(180))),
2729                    min_value: Precision::Exact(ScalarValue::Int32(Some(5))),
2730                    sum_value: Precision::Exact(ScalarValue::Int32(Some(1200))),
2731                    distinct_count: Precision::Absent,
2732                    byte_size: Precision::Exact(60),
2733                },
2734            ],
2735        };
2736
2737        let items = vec![&stats1, &stats2];
2738        let summary_stats = Statistics::try_merge_iter(items, &schema).unwrap();
2739
2740        assert_eq!(summary_stats.num_rows, Precision::Exact(25));
2741        assert_eq!(summary_stats.total_byte_size, Precision::Exact(250));
2742
2743        let col1_stats = &summary_stats.column_statistics[0];
2744        assert_eq!(col1_stats.null_count, Precision::Exact(3));
2745        assert_eq!(
2746            col1_stats.max_value,
2747            Precision::Exact(ScalarValue::Int32(Some(120)))
2748        );
2749        assert_eq!(
2750            col1_stats.min_value,
2751            Precision::Exact(ScalarValue::Int32(Some(-10)))
2752        );
2753        assert_eq!(
2754            col1_stats.sum_value,
2755            Precision::Exact(ScalarValue::Int64(Some(1100)))
2756        );
2757
2758        let col2_stats = &summary_stats.column_statistics[1];
2759        assert_eq!(col2_stats.null_count, Precision::Exact(5));
2760        assert_eq!(
2761            col2_stats.max_value,
2762            Precision::Exact(ScalarValue::Int32(Some(200)))
2763        );
2764        assert_eq!(
2765            col2_stats.min_value,
2766            Precision::Exact(ScalarValue::Int32(Some(5)))
2767        );
2768        assert_eq!(
2769            col2_stats.sum_value,
2770            Precision::Exact(ScalarValue::Int64(Some(2200)))
2771        );
2772    }
2773
2774    #[test]
2775    fn test_try_merge_iter_mixed_precision() {
2776        let schema = Arc::new(Schema::new(vec![Field::new(
2777            "col1",
2778            DataType::Int32,
2779            false,
2780        )]));
2781
2782        let stats1 = Statistics {
2783            num_rows: Precision::Exact(10),
2784            total_byte_size: Precision::Inexact(100),
2785            column_statistics: vec![ColumnStatistics {
2786                null_count: Precision::Exact(1),
2787                max_value: Precision::Exact(ScalarValue::Int32(Some(100))),
2788                min_value: Precision::Inexact(ScalarValue::Int32(Some(1))),
2789                sum_value: Precision::Exact(ScalarValue::Int32(Some(500))),
2790                distinct_count: Precision::Absent,
2791                byte_size: Precision::Exact(40),
2792            }],
2793        };
2794
2795        let stats2 = Statistics {
2796            num_rows: Precision::Inexact(15),
2797            total_byte_size: Precision::Exact(150),
2798            column_statistics: vec![ColumnStatistics {
2799                null_count: Precision::Inexact(2),
2800                max_value: Precision::Inexact(ScalarValue::Int32(Some(120))),
2801                min_value: Precision::Exact(ScalarValue::Int32(Some(-10))),
2802                sum_value: Precision::Absent,
2803                distinct_count: Precision::Absent,
2804                byte_size: Precision::Inexact(60),
2805            }],
2806        };
2807
2808        let items = vec![&stats1, &stats2];
2809        let summary_stats = Statistics::try_merge_iter(items, &schema).unwrap();
2810
2811        assert_eq!(summary_stats.num_rows, Precision::Inexact(25));
2812        assert_eq!(summary_stats.total_byte_size, Precision::Inexact(250));
2813
2814        let col_stats = &summary_stats.column_statistics[0];
2815        assert_eq!(col_stats.null_count, Precision::Inexact(3));
2816        assert_eq!(
2817            col_stats.max_value,
2818            Precision::Inexact(ScalarValue::Int32(Some(120)))
2819        );
2820        assert_eq!(
2821            col_stats.min_value,
2822            Precision::Inexact(ScalarValue::Int32(Some(-10)))
2823        );
2824        // sum_value becomes Absent because stats2 has Absent sum
2825        assert_eq!(col_stats.sum_value, Precision::Absent);
2826    }
2827
2828    #[test]
2829    fn test_try_merge_iter_empty() {
2830        let schema = Arc::new(Schema::new(vec![Field::new(
2831            "col1",
2832            DataType::Int32,
2833            false,
2834        )]));
2835
2836        let items: Vec<&Statistics> = vec![];
2837        let summary_stats = Statistics::try_merge_iter(items, &schema).unwrap();
2838
2839        assert_eq!(summary_stats.num_rows, Precision::Absent);
2840        assert_eq!(summary_stats.total_byte_size, Precision::Absent);
2841        assert_eq!(summary_stats.column_statistics.len(), 1);
2842        assert_eq!(
2843            summary_stats.column_statistics[0].null_count,
2844            Precision::Absent
2845        );
2846    }
2847
2848    #[test]
2849    fn test_try_merge_iter_single_item() {
2850        let schema = Arc::new(Schema::new(vec![Field::new(
2851            "col1",
2852            DataType::Int32,
2853            false,
2854        )]));
2855
2856        let stats = Statistics {
2857            num_rows: Precision::Exact(10),
2858            total_byte_size: Precision::Exact(100),
2859            column_statistics: vec![ColumnStatistics {
2860                null_count: Precision::Exact(1),
2861                max_value: Precision::Exact(ScalarValue::Int32(Some(100))),
2862                min_value: Precision::Exact(ScalarValue::Int32(Some(1))),
2863                sum_value: Precision::Exact(ScalarValue::Int32(Some(500))),
2864                distinct_count: Precision::Exact(10),
2865                byte_size: Precision::Exact(40),
2866            }],
2867        };
2868
2869        let items = vec![&stats];
2870        let summary_stats = Statistics::try_merge_iter(items, &schema).unwrap();
2871
2872        assert_eq!(summary_stats, stats);
2873    }
2874
2875    #[test]
2876    fn test_try_merge_iter_mismatched_columns() {
2877        let schema = Arc::new(Schema::new(vec![Field::new(
2878            "col1",
2879            DataType::Int32,
2880            false,
2881        )]));
2882
2883        let stats1 = Statistics::default();
2884        let stats2 =
2885            Statistics::default().add_column_statistics(ColumnStatistics::new_unknown());
2886
2887        let items = vec![&stats1, &stats2];
2888        let e = Statistics::try_merge_iter(items, &schema).unwrap_err();
2889        assert_contains!(
2890            e.to_string(),
2891            "Cannot merge statistics with different number of columns: 0 vs 1"
2892        );
2893    }
2894
2895    #[test]
2896    fn test_try_merge_iter_three_items() {
2897        // Verify that merging three items works correctly
2898        let schema = Arc::new(Schema::new(vec![Field::new(
2899            "col1",
2900            DataType::Int64,
2901            false,
2902        )]));
2903
2904        let stats1 = Statistics {
2905            num_rows: Precision::Exact(10),
2906            total_byte_size: Precision::Exact(100),
2907            column_statistics: vec![ColumnStatistics {
2908                null_count: Precision::Exact(1),
2909                max_value: Precision::Exact(ScalarValue::Int64(Some(100))),
2910                min_value: Precision::Exact(ScalarValue::Int64(Some(10))),
2911                sum_value: Precision::Exact(ScalarValue::Int64(Some(500))),
2912                distinct_count: Precision::Exact(8),
2913                byte_size: Precision::Exact(80),
2914            }],
2915        };
2916
2917        let stats2 = Statistics {
2918            num_rows: Precision::Exact(20),
2919            total_byte_size: Precision::Exact(200),
2920            column_statistics: vec![ColumnStatistics {
2921                null_count: Precision::Exact(2),
2922                max_value: Precision::Exact(ScalarValue::Int64(Some(200))),
2923                min_value: Precision::Exact(ScalarValue::Int64(Some(5))),
2924                sum_value: Precision::Exact(ScalarValue::Int64(Some(1000))),
2925                distinct_count: Precision::Exact(15),
2926                byte_size: Precision::Exact(160),
2927            }],
2928        };
2929
2930        let stats3 = Statistics {
2931            num_rows: Precision::Exact(30),
2932            total_byte_size: Precision::Exact(300),
2933            column_statistics: vec![ColumnStatistics {
2934                null_count: Precision::Exact(3),
2935                max_value: Precision::Exact(ScalarValue::Int64(Some(150))),
2936                min_value: Precision::Exact(ScalarValue::Int64(Some(1))),
2937                sum_value: Precision::Exact(ScalarValue::Int64(Some(2000))),
2938                distinct_count: Precision::Exact(25),
2939                byte_size: Precision::Exact(240),
2940            }],
2941        };
2942
2943        let items = vec![&stats1, &stats2, &stats3];
2944        let summary_stats = Statistics::try_merge_iter(items, &schema).unwrap();
2945
2946        assert_eq!(summary_stats.num_rows, Precision::Exact(60));
2947        assert_eq!(summary_stats.total_byte_size, Precision::Exact(600));
2948
2949        let col_stats = &summary_stats.column_statistics[0];
2950        assert_eq!(col_stats.null_count, Precision::Exact(6));
2951        assert_eq!(
2952            col_stats.max_value,
2953            Precision::Exact(ScalarValue::Int64(Some(200)))
2954        );
2955        assert_eq!(
2956            col_stats.min_value,
2957            Precision::Exact(ScalarValue::Int64(Some(1)))
2958        );
2959        assert_eq!(
2960            col_stats.sum_value,
2961            Precision::Exact(ScalarValue::Int64(Some(3500)))
2962        );
2963        assert_eq!(col_stats.byte_size, Precision::Exact(480));
2964        // Overlap-based NDV merge (pairwise left-to-right):
2965        // stats1+stats2: [10,100]+[5,200] -> NDV=16, then +stats3: [5,200]+[1,150] -> NDV=29
2966        assert_eq!(col_stats.distinct_count, Precision::Inexact(29));
2967    }
2968
2969    #[test]
2970    fn test_try_merge_iter_float_types() {
2971        let schema = Arc::new(Schema::new(vec![Field::new(
2972            "col1",
2973            DataType::Float64,
2974            false,
2975        )]));
2976
2977        let stats1 = Statistics {
2978            num_rows: Precision::Exact(10),
2979            total_byte_size: Precision::Exact(80),
2980            column_statistics: vec![ColumnStatistics {
2981                null_count: Precision::Exact(0),
2982                max_value: Precision::Exact(ScalarValue::Float64(Some(99.9))),
2983                min_value: Precision::Exact(ScalarValue::Float64(Some(1.1))),
2984                sum_value: Precision::Exact(ScalarValue::Float64(Some(500.5))),
2985                distinct_count: Precision::Absent,
2986                byte_size: Precision::Exact(80),
2987            }],
2988        };
2989
2990        let stats2 = Statistics {
2991            num_rows: Precision::Exact(10),
2992            total_byte_size: Precision::Exact(80),
2993            column_statistics: vec![ColumnStatistics {
2994                null_count: Precision::Exact(0),
2995                max_value: Precision::Exact(ScalarValue::Float64(Some(200.0))),
2996                min_value: Precision::Exact(ScalarValue::Float64(Some(0.5))),
2997                sum_value: Precision::Exact(ScalarValue::Float64(Some(1000.0))),
2998                distinct_count: Precision::Absent,
2999                byte_size: Precision::Exact(80),
3000            }],
3001        };
3002
3003        let items = vec![&stats1, &stats2];
3004        let summary_stats = Statistics::try_merge_iter(items, &schema).unwrap();
3005
3006        let col_stats = &summary_stats.column_statistics[0];
3007        assert_eq!(
3008            col_stats.max_value,
3009            Precision::Exact(ScalarValue::Float64(Some(200.0)))
3010        );
3011        assert_eq!(
3012            col_stats.min_value,
3013            Precision::Exact(ScalarValue::Float64(Some(0.5)))
3014        );
3015        assert_eq!(
3016            col_stats.sum_value,
3017            Precision::Exact(ScalarValue::Float64(Some(1500.5)))
3018        );
3019    }
3020
3021    #[test]
3022    fn test_try_merge_iter_string_types() {
3023        let schema =
3024            Arc::new(Schema::new(vec![Field::new("col1", DataType::Utf8, false)]));
3025
3026        let stats1 = Statistics {
3027            num_rows: Precision::Exact(10),
3028            total_byte_size: Precision::Exact(100),
3029            column_statistics: vec![ColumnStatistics {
3030                null_count: Precision::Exact(0),
3031                max_value: Precision::Exact(ScalarValue::Utf8(Some("dog".to_string()))),
3032                min_value: Precision::Exact(ScalarValue::Utf8(Some("ant".to_string()))),
3033                sum_value: Precision::Absent,
3034                distinct_count: Precision::Absent,
3035                byte_size: Precision::Exact(100),
3036            }],
3037        };
3038
3039        let stats2 = Statistics {
3040            num_rows: Precision::Exact(10),
3041            total_byte_size: Precision::Exact(100),
3042            column_statistics: vec![ColumnStatistics {
3043                null_count: Precision::Exact(0),
3044                max_value: Precision::Exact(ScalarValue::Utf8(Some("zebra".to_string()))),
3045                min_value: Precision::Exact(ScalarValue::Utf8(Some("bat".to_string()))),
3046                sum_value: Precision::Absent,
3047                distinct_count: Precision::Absent,
3048                byte_size: Precision::Exact(100),
3049            }],
3050        };
3051
3052        let items = vec![&stats1, &stats2];
3053        let summary_stats = Statistics::try_merge_iter(items, &schema).unwrap();
3054
3055        let col_stats = &summary_stats.column_statistics[0];
3056        assert_eq!(
3057            col_stats.max_value,
3058            Precision::Exact(ScalarValue::Utf8(Some("zebra".to_string())))
3059        );
3060        assert_eq!(
3061            col_stats.min_value,
3062            Precision::Exact(ScalarValue::Utf8(Some("ant".to_string())))
3063        );
3064        assert_eq!(col_stats.sum_value, Precision::Absent);
3065    }
3066
3067    #[test]
3068    fn test_try_merge_iter_all_inexact() {
3069        let schema = Arc::new(Schema::new(vec![Field::new(
3070            "col1",
3071            DataType::Int32,
3072            false,
3073        )]));
3074
3075        let stats1 = Statistics {
3076            num_rows: Precision::Inexact(10),
3077            total_byte_size: Precision::Inexact(100),
3078            column_statistics: vec![ColumnStatistics {
3079                null_count: Precision::Inexact(1),
3080                max_value: Precision::Inexact(ScalarValue::Int32(Some(100))),
3081                min_value: Precision::Inexact(ScalarValue::Int32(Some(1))),
3082                sum_value: Precision::Inexact(ScalarValue::Int32(Some(500))),
3083                distinct_count: Precision::Absent,
3084                byte_size: Precision::Inexact(40),
3085            }],
3086        };
3087
3088        let stats2 = Statistics {
3089            num_rows: Precision::Inexact(20),
3090            total_byte_size: Precision::Inexact(200),
3091            column_statistics: vec![ColumnStatistics {
3092                null_count: Precision::Inexact(2),
3093                max_value: Precision::Inexact(ScalarValue::Int32(Some(200))),
3094                min_value: Precision::Inexact(ScalarValue::Int32(Some(-5))),
3095                sum_value: Precision::Inexact(ScalarValue::Int32(Some(1000))),
3096                distinct_count: Precision::Absent,
3097                byte_size: Precision::Inexact(60),
3098            }],
3099        };
3100
3101        let items = vec![&stats1, &stats2];
3102        let summary_stats = Statistics::try_merge_iter(items, &schema).unwrap();
3103
3104        assert_eq!(summary_stats.num_rows, Precision::Inexact(30));
3105        assert_eq!(summary_stats.total_byte_size, Precision::Inexact(300));
3106
3107        let col_stats = &summary_stats.column_statistics[0];
3108        assert_eq!(col_stats.null_count, Precision::Inexact(3));
3109        assert_eq!(
3110            col_stats.max_value,
3111            Precision::Inexact(ScalarValue::Int32(Some(200)))
3112        );
3113        assert_eq!(
3114            col_stats.min_value,
3115            Precision::Inexact(ScalarValue::Int32(Some(-5)))
3116        );
3117        assert_eq!(
3118            col_stats.sum_value,
3119            Precision::Inexact(ScalarValue::Int64(Some(1500)))
3120        );
3121    }
3122
3123    #[test]
3124    fn test_precision_min_in_place() {
3125        // Exact vs Exact: keeps the smaller
3126        let mut lhs = Precision::Exact(10);
3127        precision_min(&mut lhs, &Precision::Exact(20));
3128        assert_eq!(lhs, Precision::Exact(10));
3129
3130        let mut lhs = Precision::Exact(20);
3131        precision_min(&mut lhs, &Precision::Exact(10));
3132        assert_eq!(lhs, Precision::Exact(10));
3133
3134        // Equal exact values
3135        let mut lhs = Precision::Exact(5);
3136        precision_min(&mut lhs, &Precision::Exact(5));
3137        assert_eq!(lhs, Precision::Exact(5));
3138
3139        // Mixed exact/inexact: result is Inexact with smaller value
3140        let mut lhs = Precision::Exact(10);
3141        precision_min(&mut lhs, &Precision::Inexact(20));
3142        assert_eq!(lhs, Precision::Inexact(10));
3143
3144        let mut lhs = Precision::Inexact(10);
3145        precision_min(&mut lhs, &Precision::Exact(5));
3146        assert_eq!(lhs, Precision::Inexact(5));
3147
3148        // Inexact vs Inexact
3149        let mut lhs = Precision::Inexact(30);
3150        precision_min(&mut lhs, &Precision::Inexact(20));
3151        assert_eq!(lhs, Precision::Inexact(20));
3152
3153        // Absent makes result Absent
3154        let mut lhs = Precision::Exact(10);
3155        precision_min(&mut lhs, &Precision::Absent);
3156        assert_eq!(lhs, Precision::Absent);
3157
3158        let mut lhs = Precision::<i32>::Absent;
3159        precision_min(&mut lhs, &Precision::Exact(10));
3160        assert_eq!(lhs, Precision::Absent);
3161    }
3162
3163    #[test]
3164    fn test_precision_max_in_place() {
3165        // Exact vs Exact: keeps the larger
3166        let mut lhs = Precision::Exact(10);
3167        precision_max(&mut lhs, &Precision::Exact(20));
3168        assert_eq!(lhs, Precision::Exact(20));
3169
3170        let mut lhs = Precision::Exact(20);
3171        precision_max(&mut lhs, &Precision::Exact(10));
3172        assert_eq!(lhs, Precision::Exact(20));
3173
3174        // Equal exact values
3175        let mut lhs = Precision::Exact(5);
3176        precision_max(&mut lhs, &Precision::Exact(5));
3177        assert_eq!(lhs, Precision::Exact(5));
3178
3179        // Mixed exact/inexact: result is Inexact with larger value
3180        let mut lhs = Precision::Exact(10);
3181        precision_max(&mut lhs, &Precision::Inexact(20));
3182        assert_eq!(lhs, Precision::Inexact(20));
3183
3184        let mut lhs = Precision::Inexact(10);
3185        precision_max(&mut lhs, &Precision::Exact(5));
3186        assert_eq!(lhs, Precision::Inexact(10));
3187
3188        // Inexact vs Inexact
3189        let mut lhs = Precision::Inexact(20);
3190        precision_max(&mut lhs, &Precision::Inexact(30));
3191        assert_eq!(lhs, Precision::Inexact(30));
3192
3193        // Absent makes result Absent
3194        let mut lhs = Precision::Exact(10);
3195        precision_max(&mut lhs, &Precision::Absent);
3196        assert_eq!(lhs, Precision::Absent);
3197
3198        let mut lhs = Precision::<i32>::Absent;
3199        precision_max(&mut lhs, &Precision::Exact(10));
3200        assert_eq!(lhs, Precision::Absent);
3201    }
3202
3203    #[test]
3204    fn test_cast_sum_value_to_sum_type_in_place_widens_int32() {
3205        let mut value = Precision::Exact(ScalarValue::Int32(Some(42)));
3206        cast_sum_value_to_sum_type_in_place(&mut value);
3207        assert_eq!(value, Precision::Exact(ScalarValue::Int64(Some(42))));
3208    }
3209
3210    #[test]
3211    fn test_cast_sum_value_to_sum_type_in_place_preserves_int64() {
3212        // Int64 is already the sum type for Int64, no widening needed
3213        let mut value = Precision::Exact(ScalarValue::Int64(Some(100)));
3214        cast_sum_value_to_sum_type_in_place(&mut value);
3215        assert_eq!(value, Precision::Exact(ScalarValue::Int64(Some(100))));
3216    }
3217
3218    #[test]
3219    fn test_cast_sum_value_to_sum_type_in_place_inexact() {
3220        let mut value = Precision::Inexact(ScalarValue::Int32(Some(42)));
3221        cast_sum_value_to_sum_type_in_place(&mut value);
3222        assert_eq!(value, Precision::Inexact(ScalarValue::Int64(Some(42))));
3223    }
3224
3225    #[test]
3226    fn test_cast_sum_value_to_sum_type_in_place_absent() {
3227        let mut value = Precision::<ScalarValue>::Absent;
3228        cast_sum_value_to_sum_type_in_place(&mut value);
3229        assert_eq!(value, Precision::Absent);
3230    }
3231
3232    #[test]
3233    fn test_precision_add_for_sum_in_place_same_type() {
3234        // Int64 + Int64: no widening needed, straight add
3235        let mut lhs = Precision::Exact(ScalarValue::Int64(Some(10)));
3236        let rhs = Precision::Exact(ScalarValue::Int64(Some(20)));
3237        precision_add_for_sum_in_place(&mut lhs, &rhs);
3238        assert_eq!(lhs, Precision::Exact(ScalarValue::Int64(Some(30))));
3239    }
3240
3241    #[test]
3242    fn test_precision_add_for_sum_in_place_widens_rhs() {
3243        // lhs is already Int64 (widened), rhs is Int32 -> gets cast to Int64
3244        let mut lhs = Precision::Exact(ScalarValue::Int64(Some(10)));
3245        let rhs = Precision::Exact(ScalarValue::Int32(Some(5)));
3246        precision_add_for_sum_in_place(&mut lhs, &rhs);
3247        assert_eq!(lhs, Precision::Exact(ScalarValue::Int64(Some(15))));
3248    }
3249
3250    #[test]
3251    fn test_precision_add_for_sum_in_place_inexact() {
3252        let mut lhs = Precision::Inexact(ScalarValue::Int64(Some(10)));
3253        let rhs = Precision::Inexact(ScalarValue::Int32(Some(5)));
3254        precision_add_for_sum_in_place(&mut lhs, &rhs);
3255        assert_eq!(lhs, Precision::Inexact(ScalarValue::Int64(Some(15))));
3256    }
3257
3258    #[test]
3259    fn test_precision_add_for_sum_in_place_absent_rhs() {
3260        let mut lhs = Precision::Exact(ScalarValue::Int64(Some(10)));
3261        precision_add_for_sum_in_place(&mut lhs, &Precision::Absent);
3262        assert_eq!(lhs, Precision::Absent);
3263    }
3264}