Skip to main content

nexcore_dataframe/
agg.rs

1//! Scalar aggregation functions on columns.
2
3use crate::column::{Column, ColumnData};
4use crate::error::DataFrameError;
5use crate::scalar::Scalar;
6
7impl Column {
8    /// Sum of non-null numeric values. Returns Null for empty/non-numeric columns.
9    #[must_use]
10    pub fn sum(&self) -> Scalar {
11        match self.data() {
12            ColumnData::Int64(v) => {
13                let total: i64 = v.iter().filter_map(|o| *o).sum();
14                Scalar::Int64(total)
15            }
16            ColumnData::UInt64(v) => {
17                let total: u64 = v.iter().filter_map(|o| *o).sum();
18                Scalar::UInt64(total)
19            }
20            ColumnData::Float64(v) => {
21                let total: f64 = v.iter().filter_map(|o| *o).sum();
22                Scalar::Float64(total)
23            }
24            ColumnData::Bool(_) | ColumnData::String(_) => Scalar::Null,
25        }
26    }
27
28    /// Mean of non-null numeric values. Returns Null if no non-null values.
29    #[must_use]
30    pub fn mean(&self) -> Scalar {
31        let count = self.non_null_count();
32        if count == 0 {
33            return Scalar::Null;
34        }
35        // count > 0 is verified above; int→f64 widening casts are safe for practical values
36        #[allow(
37            clippy::as_conversions,
38            reason = "i64/u64→f64 widening cast for numeric mean; count→f64 safe as count <= usize::MAX << 2^53"
39        )]
40        match self.data() {
41            ColumnData::Int64(v) => {
42                let total: f64 = v.iter().filter_map(|o| o.map(|n| n as f64)).sum();
43                Scalar::Float64(total / count as f64)
44            }
45            ColumnData::UInt64(v) => {
46                let total: f64 = v.iter().filter_map(|o| o.map(|n| n as f64)).sum();
47                Scalar::Float64(total / count as f64)
48            }
49            ColumnData::Float64(v) => {
50                let total: f64 = v.iter().filter_map(|o| *o).sum();
51                Scalar::Float64(total / count as f64)
52            }
53            ColumnData::Bool(_) | ColumnData::String(_) => Scalar::Null,
54        }
55    }
56
57    /// Minimum non-null value. Returns Null if no non-null values.
58    #[must_use]
59    pub fn min(&self) -> Scalar {
60        let mut result = Scalar::Null;
61        for i in 0..self.len() {
62            if let Some(val) = self.get(i) {
63                if val.is_null() {
64                    continue;
65                }
66                if result.is_null() || val.compare(&result) == std::cmp::Ordering::Less {
67                    result = val;
68                }
69            }
70        }
71        result
72    }
73
74    /// Maximum non-null value. Returns Null if no non-null values.
75    #[must_use]
76    pub fn max(&self) -> Scalar {
77        let mut result = Scalar::Null;
78        for i in 0..self.len() {
79            if let Some(val) = self.get(i) {
80                if val.is_null() {
81                    continue;
82                }
83                if result.is_null() || val.compare(&result) == std::cmp::Ordering::Greater {
84                    result = val;
85                }
86            }
87        }
88        result
89    }
90
91    /// Median of non-null numeric values. Returns Null if no non-null values.
92    #[must_use]
93    pub fn median(&self) -> Scalar {
94        // int→f64 widening casts are safe for practical column values
95        #[allow(
96            clippy::as_conversions,
97            reason = "i64/u64→f64 widening cast for median computation; precision loss only beyond ±2^53"
98        )]
99        let mut vals: Vec<f64> = match self.data() {
100            ColumnData::Int64(v) => v.iter().filter_map(|o| o.map(|n| n as f64)).collect(),
101            ColumnData::UInt64(v) => v.iter().filter_map(|o| o.map(|n| n as f64)).collect(),
102            ColumnData::Float64(v) => v.iter().filter_map(|o| *o).collect(),
103            ColumnData::Bool(_) | ColumnData::String(_) => return Scalar::Null,
104        };
105        if vals.is_empty() {
106            return Scalar::Null;
107        }
108        vals.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
109        let mid = vals.len() / 2;
110        // vals is non-empty (checked above); mid = len/2 so mid < len
111        // for even len: mid >= 1 (since len >= 2 when even and non-empty), mid-1 < mid < len
112        // for odd len: mid < len
113        #[allow(
114            clippy::indexing_slicing,
115            reason = "mid = len/2 so mid < len; for even len mid >= 1 since len >= 2"
116        )]
117        #[allow(
118            clippy::arithmetic_side_effects,
119            reason = "mid = len/2 >= 1 when len is even and non-empty (len >= 2); subtraction cannot underflow"
120        )]
121        if vals.len() % 2 == 0 {
122            Scalar::Float64((vals[mid - 1] + vals[mid]) / 2.0)
123        } else {
124            Scalar::Float64(vals[mid])
125        }
126    }
127
128    /// Standard deviation (population) of non-null numeric values.
129    #[must_use]
130    pub fn std_dev(&self) -> Scalar {
131        let mean = match self.mean() {
132            Scalar::Float64(m) => m,
133            Scalar::Null
134            | Scalar::Bool(_)
135            | Scalar::Int64(_)
136            | Scalar::UInt64(_)
137            | Scalar::String(_) => {
138                return Scalar::Null;
139            }
140        };
141        let count = self.non_null_count();
142        if count == 0 {
143            return Scalar::Null;
144        }
145        // int→f64 widening casts; count→f64 safe as count << 2^53
146        #[allow(
147            clippy::as_conversions,
148            reason = "i64/u64→f64 widening cast for variance computation; count→f64 safe since Vec capacity is bounded by usize << 2^53"
149        )]
150        let variance: f64 = match self.data() {
151            ColumnData::Int64(v) => {
152                v.iter()
153                    .filter_map(|o| o.map(|n| (n as f64 - mean).powi(2)))
154                    .sum::<f64>()
155                    / count as f64
156            }
157            ColumnData::UInt64(v) => {
158                v.iter()
159                    .filter_map(|o| o.map(|n| (n as f64 - mean).powi(2)))
160                    .sum::<f64>()
161                    / count as f64
162            }
163            ColumnData::Float64(v) => {
164                v.iter()
165                    .filter_map(|o| o.map(|n| (n - mean).powi(2)))
166                    .sum::<f64>()
167                    / count as f64
168            }
169            ColumnData::Bool(_) | ColumnData::String(_) => return Scalar::Null,
170        };
171        Scalar::Float64(variance.sqrt())
172    }
173
174    /// Count of unique non-null values.
175    #[must_use]
176    pub fn n_unique(&self) -> usize {
177        // HashSet used purely for deduplication — returns only the count, not the set.
178        // Order of the set never matters here; O(1) insert beats BTreeSet O(log n).
179        #[allow(
180            clippy::disallowed_types,
181            reason = "HashSet used for O(1) deduplication; only the count is returned, set order is irrelevant"
182        )]
183        use std::collections::HashSet;
184        #[allow(
185            clippy::disallowed_types,
186            reason = "HashSet::new() for n_unique deduplication; see inline allow above"
187        )]
188        let mut seen = HashSet::new();
189        for i in 0..self.len() {
190            if let Some(val) = self.get(i) {
191                if !val.is_null() {
192                    seen.insert(format!("{val}"));
193                }
194            }
195        }
196        seen.len()
197    }
198
199    /// First non-null value in the column.
200    #[must_use]
201    pub fn first(&self) -> Scalar {
202        for i in 0..self.len() {
203            if let Some(val) = self.get(i) {
204                if !val.is_null() {
205                    return val;
206                }
207            }
208        }
209        Scalar::Null
210    }
211
212    /// Last non-null value in the column.
213    #[must_use]
214    pub fn last(&self) -> Scalar {
215        for i in (0..self.len()).rev() {
216            if let Some(val) = self.get(i) {
217                if !val.is_null() {
218                    return val;
219                }
220            }
221        }
222        Scalar::Null
223    }
224
225    /// Get a quantile (0.0 to 1.0) from non-null numeric values.
226    pub fn quantile(&self, q: f64) -> Result<Scalar, DataFrameError> {
227        if !(0.0..=1.0).contains(&q) {
228            return Err(DataFrameError::Other(format!(
229                "quantile must be between 0.0 and 1.0, got {q}"
230            )));
231        }
232        // int→f64 widening casts are safe for practical column values
233        #[allow(
234            clippy::as_conversions,
235            reason = "i64/u64→f64 widening for quantile; (len-1)→f64 safe since Vec len << 2^53; floor/ceil→usize: pos is in [0, len-1] so fits usize"
236        )]
237        let mut vals: Vec<f64> = match self.data() {
238            ColumnData::Int64(v) => v.iter().filter_map(|o| o.map(|n| n as f64)).collect(),
239            ColumnData::UInt64(v) => v.iter().filter_map(|o| o.map(|n| n as f64)).collect(),
240            ColumnData::Float64(v) => v.iter().filter_map(|o| *o).collect(),
241            ColumnData::Bool(_) | ColumnData::String(_) => return Ok(Scalar::Null),
242        };
243        if vals.is_empty() {
244            return Ok(Scalar::Null);
245        }
246        vals.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
247        // vals.len() >= 1 (checked above); q in [0,1]; pos in [0, len-1]
248        // lower = floor(pos) <= upper = ceil(pos) <= len-1, so both indices are valid
249        // vals.len() >= 1, so vals.len() - 1 cannot underflow
250        // All `as` casts here are safe: len-1 < 2^53 (Vec bound); pos in [0, len-1] so
251        // floor/ceil fit usize; lower <= len-1 so lower fits f64 exactly.
252        #[allow(
253            clippy::as_conversions,
254            clippy::arithmetic_side_effects,
255            reason = "len-1 safe: vals non-empty (is_empty check); len-1→f64 exact (Vec << 2^53); \
256                      floor/ceil→usize: pos in [0,len-1] fits usize; lower→f64 exact (lower <= len-1 << 2^53)"
257        )]
258        let pos = q * (vals.len() - 1) as f64;
259        #[allow(
260            clippy::as_conversions,
261            reason = "f64→usize: pos.floor()/ceil() are in [0, vals.len()-1] which fits usize on all platforms"
262        )]
263        let lower = pos.floor() as usize;
264        #[allow(
265            clippy::as_conversions,
266            reason = "f64→usize: pos.ceil() is in [0, vals.len()-1] which fits usize on all platforms"
267        )]
268        let upper = pos.ceil() as usize;
269        #[allow(
270            clippy::indexing_slicing,
271            reason = "lower and upper are floor/ceil of q*(len-1) in [0,len-1]; both are valid indices into vals"
272        )]
273        if lower == upper {
274            Ok(Scalar::Float64(vals[lower]))
275        } else {
276            #[allow(
277                clippy::as_conversions,
278                reason = "lower→f64: lower <= len-1 << 2^53, fits exactly"
279            )]
280            let frac = pos - lower as f64;
281            Ok(Scalar::Float64(
282                vals[lower] * (1.0 - frac) + vals[upper] * frac,
283            ))
284        }
285    }
286}
287
/// Unit tests for the scalar aggregations on `Column`.
///
/// Besides the happy paths, these cover null-skipping, the `Scalar::Null`
/// results for empty and non-numeric columns, and the interpolation branch of
/// `quantile`.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn sum_i64() {
        let c = Column::from_i64s("x", vec![1, 2, 3]);
        assert_eq!(c.sum(), Scalar::Int64(6));
    }

    #[test]
    fn sum_f64() {
        let c = Column::from_f64s("x", vec![1.0, 2.5, 3.5]);
        assert_eq!(c.sum(), Scalar::Float64(7.0));
    }

    #[test]
    fn sum_with_nulls() {
        let c = Column::new_i64("x", vec![Some(10), None, Some(20)]);
        assert_eq!(c.sum(), Scalar::Int64(30));
    }

    #[test]
    fn sum_string_returns_null() {
        let c = Column::from_strs("x", &["a", "b"]);
        assert_eq!(c.sum(), Scalar::Null);
    }

    #[test]
    fn mean_i64() {
        let c = Column::from_i64s("x", vec![2, 4, 6]);
        assert_eq!(c.mean(), Scalar::Float64(4.0));
    }

    #[test]
    fn mean_empty() {
        let c = Column::new_i64("x", vec![]);
        assert_eq!(c.mean(), Scalar::Null);
    }

    // Nulls must be excluded from both the total and the divisor.
    #[test]
    fn mean_skips_nulls() {
        let c = Column::new_i64("x", vec![Some(2), None, Some(4)]);
        assert_eq!(c.mean(), Scalar::Float64(3.0));
    }

    #[test]
    fn mean_string_returns_null() {
        let c = Column::from_strs("x", &["a", "b"]);
        assert_eq!(c.mean(), Scalar::Null);
    }

    #[test]
    fn min_max() {
        let c = Column::from_i64s("x", vec![3, 1, 4, 1, 5]);
        assert_eq!(c.min(), Scalar::Int64(1));
        assert_eq!(c.max(), Scalar::Int64(5));
    }

    #[test]
    fn min_max_with_nulls() {
        let c = Column::new_i64("x", vec![Some(3), None, Some(1)]);
        assert_eq!(c.min(), Scalar::Int64(1));
        assert_eq!(c.max(), Scalar::Int64(3));
    }

    #[test]
    fn median_odd() {
        let c = Column::from_i64s("x", vec![3, 1, 2]);
        assert_eq!(c.median(), Scalar::Float64(2.0));
    }

    #[test]
    fn median_even() {
        let c = Column::from_i64s("x", vec![1, 2, 3, 4]);
        assert_eq!(c.median(), Scalar::Float64(2.5));
    }

    // Only the two non-null values (1 and 5) take part, giving their average.
    #[test]
    fn median_with_nulls() {
        let c = Column::new_i64("x", vec![Some(5), None, Some(1)]);
        assert_eq!(c.median(), Scalar::Float64(3.0));
    }

    #[test]
    fn median_empty_and_non_numeric_return_null() {
        let empty = Column::new_i64("x", vec![]);
        assert_eq!(empty.median(), Scalar::Null);
        let strings = Column::from_strs("x", &["a", "b"]);
        assert_eq!(strings.median(), Scalar::Null);
    }

    #[test]
    fn std_dev_basic() {
        let c = Column::from_f64s("x", vec![2.0, 4.0, 4.0, 4.0, 5.0, 5.0, 7.0, 9.0]);
        // A non-Float64 result becomes NaN, which fails the tolerance check below.
        let sd = match c.std_dev() {
            Scalar::Float64(v) => v,
            _ => f64::NAN,
        };
        assert!((sd - 2.0).abs() < 0.01);
    }

    #[test]
    fn std_dev_constant_column_is_zero() {
        let c = Column::from_i64s("x", vec![3, 3, 3]);
        assert_eq!(c.std_dev(), Scalar::Float64(0.0));
    }

    #[test]
    fn std_dev_empty_and_non_numeric_return_null() {
        let empty = Column::new_i64("x", vec![]);
        assert_eq!(empty.std_dev(), Scalar::Null);
        let strings = Column::from_strs("x", &["a", "b"]);
        assert_eq!(strings.std_dev(), Scalar::Null);
    }

    #[test]
    fn n_unique_basic() {
        let c = Column::from_strs("x", &["a", "b", "a", "c"]);
        assert_eq!(c.n_unique(), 3);
    }

    #[test]
    fn n_unique_ignores_nulls() {
        let c = Column::new_i64("x", vec![Some(1), None, Some(1), Some(2)]);
        assert_eq!(c.n_unique(), 2);
    }

    #[test]
    fn first_last() {
        let c = Column::new_i64("x", vec![None, Some(10), Some(20), None]);
        assert_eq!(c.first(), Scalar::Int64(10));
        assert_eq!(c.last(), Scalar::Int64(20));
    }

    #[test]
    fn first_last_all_null() {
        let c = Column::new_i64("x", vec![None, None]);
        assert_eq!(c.first(), Scalar::Null);
        assert_eq!(c.last(), Scalar::Null);
    }

    #[test]
    fn quantile_basic() {
        let c = Column::from_i64s("x", vec![1, 2, 3, 4, 5]);
        let q50 = c.quantile(0.5);
        assert_eq!(q50.ok(), Some(Scalar::Float64(3.0)));
        let q0 = c.quantile(0.0);
        assert_eq!(q0.ok(), Some(Scalar::Float64(1.0)));
        let q100 = c.quantile(1.0);
        assert_eq!(q100.ok(), Some(Scalar::Float64(5.0)));
    }

    // pos = 0.25 * 3 = 0.75, so the result is 1 * 0.25 + 2 * 0.75 = 1.75.
    #[test]
    fn quantile_interpolates_between_ranks() {
        let c = Column::from_i64s("x", vec![1, 2, 3, 4]);
        assert_eq!(c.quantile(0.25).ok(), Some(Scalar::Float64(1.75)));
    }

    #[test]
    fn quantile_empty_and_non_numeric_return_null() {
        let empty = Column::new_i64("x", vec![]);
        assert_eq!(empty.quantile(0.5).ok(), Some(Scalar::Null));
        let strings = Column::from_strs("x", &["a", "b"]);
        assert_eq!(strings.quantile(0.5).ok(), Some(Scalar::Null));
    }

    #[test]
    fn quantile_invalid() {
        let c = Column::from_i64s("x", vec![1, 2, 3]);
        assert!(c.quantile(1.5).is_err());
        assert!(c.quantile(-0.1).is_err());
    }
}