1use crate::column::{Column, ColumnData};
4use crate::error::DataFrameError;
5use crate::scalar::Scalar;
6
7impl Column {
8 #[must_use]
10 pub fn sum(&self) -> Scalar {
11 match self.data() {
12 ColumnData::Int64(v) => {
13 let total: i64 = v.iter().filter_map(|o| *o).sum();
14 Scalar::Int64(total)
15 }
16 ColumnData::UInt64(v) => {
17 let total: u64 = v.iter().filter_map(|o| *o).sum();
18 Scalar::UInt64(total)
19 }
20 ColumnData::Float64(v) => {
21 let total: f64 = v.iter().filter_map(|o| *o).sum();
22 Scalar::Float64(total)
23 }
24 ColumnData::Bool(_) | ColumnData::String(_) => Scalar::Null,
25 }
26 }
27
28 #[must_use]
30 pub fn mean(&self) -> Scalar {
31 let count = self.non_null_count();
32 if count == 0 {
33 return Scalar::Null;
34 }
35 #[allow(
37 clippy::as_conversions,
38 reason = "i64/u64→f64 widening cast for numeric mean; count→f64 safe as count <= usize::MAX << 2^53"
39 )]
40 match self.data() {
41 ColumnData::Int64(v) => {
42 let total: f64 = v.iter().filter_map(|o| o.map(|n| n as f64)).sum();
43 Scalar::Float64(total / count as f64)
44 }
45 ColumnData::UInt64(v) => {
46 let total: f64 = v.iter().filter_map(|o| o.map(|n| n as f64)).sum();
47 Scalar::Float64(total / count as f64)
48 }
49 ColumnData::Float64(v) => {
50 let total: f64 = v.iter().filter_map(|o| *o).sum();
51 Scalar::Float64(total / count as f64)
52 }
53 ColumnData::Bool(_) | ColumnData::String(_) => Scalar::Null,
54 }
55 }
56
57 #[must_use]
59 pub fn min(&self) -> Scalar {
60 let mut result = Scalar::Null;
61 for i in 0..self.len() {
62 if let Some(val) = self.get(i) {
63 if val.is_null() {
64 continue;
65 }
66 if result.is_null() || val.compare(&result) == std::cmp::Ordering::Less {
67 result = val;
68 }
69 }
70 }
71 result
72 }
73
74 #[must_use]
76 pub fn max(&self) -> Scalar {
77 let mut result = Scalar::Null;
78 for i in 0..self.len() {
79 if let Some(val) = self.get(i) {
80 if val.is_null() {
81 continue;
82 }
83 if result.is_null() || val.compare(&result) == std::cmp::Ordering::Greater {
84 result = val;
85 }
86 }
87 }
88 result
89 }
90
91 #[must_use]
93 pub fn median(&self) -> Scalar {
94 #[allow(
96 clippy::as_conversions,
97 reason = "i64/u64→f64 widening cast for median computation; precision loss only beyond ±2^53"
98 )]
99 let mut vals: Vec<f64> = match self.data() {
100 ColumnData::Int64(v) => v.iter().filter_map(|o| o.map(|n| n as f64)).collect(),
101 ColumnData::UInt64(v) => v.iter().filter_map(|o| o.map(|n| n as f64)).collect(),
102 ColumnData::Float64(v) => v.iter().filter_map(|o| *o).collect(),
103 ColumnData::Bool(_) | ColumnData::String(_) => return Scalar::Null,
104 };
105 if vals.is_empty() {
106 return Scalar::Null;
107 }
108 vals.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
109 let mid = vals.len() / 2;
110 #[allow(
114 clippy::indexing_slicing,
115 reason = "mid = len/2 so mid < len; for even len mid >= 1 since len >= 2"
116 )]
117 #[allow(
118 clippy::arithmetic_side_effects,
119 reason = "mid = len/2 >= 1 when len is even and non-empty (len >= 2); subtraction cannot underflow"
120 )]
121 if vals.len() % 2 == 0 {
122 Scalar::Float64((vals[mid - 1] + vals[mid]) / 2.0)
123 } else {
124 Scalar::Float64(vals[mid])
125 }
126 }
127
128 #[must_use]
130 pub fn std_dev(&self) -> Scalar {
131 let mean = match self.mean() {
132 Scalar::Float64(m) => m,
133 Scalar::Null
134 | Scalar::Bool(_)
135 | Scalar::Int64(_)
136 | Scalar::UInt64(_)
137 | Scalar::String(_) => {
138 return Scalar::Null;
139 }
140 };
141 let count = self.non_null_count();
142 if count == 0 {
143 return Scalar::Null;
144 }
145 #[allow(
147 clippy::as_conversions,
148 reason = "i64/u64→f64 widening cast for variance computation; count→f64 safe since Vec capacity is bounded by usize << 2^53"
149 )]
150 let variance: f64 = match self.data() {
151 ColumnData::Int64(v) => {
152 v.iter()
153 .filter_map(|o| o.map(|n| (n as f64 - mean).powi(2)))
154 .sum::<f64>()
155 / count as f64
156 }
157 ColumnData::UInt64(v) => {
158 v.iter()
159 .filter_map(|o| o.map(|n| (n as f64 - mean).powi(2)))
160 .sum::<f64>()
161 / count as f64
162 }
163 ColumnData::Float64(v) => {
164 v.iter()
165 .filter_map(|o| o.map(|n| (n - mean).powi(2)))
166 .sum::<f64>()
167 / count as f64
168 }
169 ColumnData::Bool(_) | ColumnData::String(_) => return Scalar::Null,
170 };
171 Scalar::Float64(variance.sqrt())
172 }
173
174 #[must_use]
176 pub fn n_unique(&self) -> usize {
177 #[allow(
180 clippy::disallowed_types,
181 reason = "HashSet used for O(1) deduplication; only the count is returned, set order is irrelevant"
182 )]
183 use std::collections::HashSet;
184 #[allow(
185 clippy::disallowed_types,
186 reason = "HashSet::new() for n_unique deduplication; see inline allow above"
187 )]
188 let mut seen = HashSet::new();
189 for i in 0..self.len() {
190 if let Some(val) = self.get(i) {
191 if !val.is_null() {
192 seen.insert(format!("{val}"));
193 }
194 }
195 }
196 seen.len()
197 }
198
199 #[must_use]
201 pub fn first(&self) -> Scalar {
202 for i in 0..self.len() {
203 if let Some(val) = self.get(i) {
204 if !val.is_null() {
205 return val;
206 }
207 }
208 }
209 Scalar::Null
210 }
211
212 #[must_use]
214 pub fn last(&self) -> Scalar {
215 for i in (0..self.len()).rev() {
216 if let Some(val) = self.get(i) {
217 if !val.is_null() {
218 return val;
219 }
220 }
221 }
222 Scalar::Null
223 }
224
225 pub fn quantile(&self, q: f64) -> Result<Scalar, DataFrameError> {
227 if !(0.0..=1.0).contains(&q) {
228 return Err(DataFrameError::Other(format!(
229 "quantile must be between 0.0 and 1.0, got {q}"
230 )));
231 }
232 #[allow(
234 clippy::as_conversions,
235 reason = "i64/u64→f64 widening for quantile; (len-1)→f64 safe since Vec len << 2^53; floor/ceil→usize: pos is in [0, len-1] so fits usize"
236 )]
237 let mut vals: Vec<f64> = match self.data() {
238 ColumnData::Int64(v) => v.iter().filter_map(|o| o.map(|n| n as f64)).collect(),
239 ColumnData::UInt64(v) => v.iter().filter_map(|o| o.map(|n| n as f64)).collect(),
240 ColumnData::Float64(v) => v.iter().filter_map(|o| *o).collect(),
241 ColumnData::Bool(_) | ColumnData::String(_) => return Ok(Scalar::Null),
242 };
243 if vals.is_empty() {
244 return Ok(Scalar::Null);
245 }
246 vals.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
247 #[allow(
253 clippy::as_conversions,
254 clippy::arithmetic_side_effects,
255 reason = "len-1 safe: vals non-empty (is_empty check); len-1→f64 exact (Vec << 2^53); \
256 floor/ceil→usize: pos in [0,len-1] fits usize; lower→f64 exact (lower <= len-1 << 2^53)"
257 )]
258 let pos = q * (vals.len() - 1) as f64;
259 #[allow(
260 clippy::as_conversions,
261 reason = "f64→usize: pos.floor()/ceil() are in [0, vals.len()-1] which fits usize on all platforms"
262 )]
263 let lower = pos.floor() as usize;
264 #[allow(
265 clippy::as_conversions,
266 reason = "f64→usize: pos.ceil() is in [0, vals.len()-1] which fits usize on all platforms"
267 )]
268 let upper = pos.ceil() as usize;
269 #[allow(
270 clippy::indexing_slicing,
271 reason = "lower and upper are floor/ceil of q*(len-1) in [0,len-1]; both are valid indices into vals"
272 )]
273 if lower == upper {
274 Ok(Scalar::Float64(vals[lower]))
275 } else {
276 #[allow(
277 clippy::as_conversions,
278 reason = "lower→f64: lower <= len-1 << 2^53, fits exactly"
279 )]
280 let frac = pos - lower as f64;
281 Ok(Scalar::Float64(
282 vals[lower] * (1.0 - frac) + vals[upper] * frac,
283 ))
284 }
285 }
286}
287
// Unit tests for the `Column` aggregations. Besides the happy paths they pin
// the null-handling and empty-input behavior of each aggregation.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn sum_i64() {
        let c = Column::from_i64s("x", vec![1, 2, 3]);
        assert_eq!(c.sum(), Scalar::Int64(6));
    }

    #[test]
    fn sum_f64() {
        let c = Column::from_f64s("x", vec![1.0, 2.5, 3.5]);
        assert_eq!(c.sum(), Scalar::Float64(7.0));
    }

    #[test]
    fn sum_with_nulls() {
        let c = Column::new_i64("x", vec![Some(10), None, Some(20)]);
        assert_eq!(c.sum(), Scalar::Int64(30));
    }

    #[test]
    fn sum_empty_is_zero() {
        let c = Column::new_i64("x", vec![]);
        assert_eq!(c.sum(), Scalar::Int64(0));
    }

    #[test]
    fn sum_string_returns_null() {
        let c = Column::from_strs("x", &["a", "b"]);
        assert_eq!(c.sum(), Scalar::Null);
    }

    #[test]
    fn mean_i64() {
        let c = Column::from_i64s("x", vec![2, 4, 6]);
        assert_eq!(c.mean(), Scalar::Float64(4.0));
    }

    #[test]
    fn mean_with_nulls() {
        // Nulls must not count towards the divisor.
        let c = Column::new_i64("x", vec![Some(1), None, Some(3)]);
        assert_eq!(c.mean(), Scalar::Float64(2.0));
    }

    #[test]
    fn mean_empty() {
        let c = Column::new_i64("x", vec![]);
        assert_eq!(c.mean(), Scalar::Null);
    }

    #[test]
    fn mean_string_returns_null() {
        let c = Column::from_strs("x", &["a", "b"]);
        assert_eq!(c.mean(), Scalar::Null);
    }

    #[test]
    fn min_max() {
        let c = Column::from_i64s("x", vec![3, 1, 4, 1, 5]);
        assert_eq!(c.min(), Scalar::Int64(1));
        assert_eq!(c.max(), Scalar::Int64(5));
    }

    #[test]
    fn min_max_with_nulls() {
        let c = Column::new_i64("x", vec![Some(3), None, Some(1)]);
        assert_eq!(c.min(), Scalar::Int64(1));
        assert_eq!(c.max(), Scalar::Int64(3));
    }

    #[test]
    fn min_max_empty() {
        let c = Column::new_i64("x", vec![]);
        assert_eq!(c.min(), Scalar::Null);
        assert_eq!(c.max(), Scalar::Null);
    }

    #[test]
    fn median_odd() {
        let c = Column::from_i64s("x", vec![3, 1, 2]);
        assert_eq!(c.median(), Scalar::Float64(2.0));
    }

    #[test]
    fn median_even() {
        let c = Column::from_i64s("x", vec![1, 2, 3, 4]);
        assert_eq!(c.median(), Scalar::Float64(2.5));
    }

    #[test]
    fn median_unsorted_f64() {
        let c = Column::from_f64s("x", vec![2.5, 0.5, 1.5, 3.5]);
        assert_eq!(c.median(), Scalar::Float64(2.0));
    }

    #[test]
    fn median_with_nulls() {
        let c = Column::new_i64("x", vec![Some(5), None, Some(1), Some(3)]);
        assert_eq!(c.median(), Scalar::Float64(3.0));
    }

    #[test]
    fn median_empty_and_non_numeric() {
        assert_eq!(Column::new_i64("x", vec![]).median(), Scalar::Null);
        assert_eq!(Column::from_strs("x", &["a", "b"]).median(), Scalar::Null);
    }

    #[test]
    fn std_dev_basic() {
        let c = Column::from_f64s("x", vec![2.0, 4.0, 4.0, 4.0, 5.0, 5.0, 7.0, 9.0]);
        // A non-Float64 result maps to NaN, which fails the tolerance check below.
        let sd = match c.std_dev() {
            Scalar::Float64(v) => v,
            _ => f64::NAN,
        };
        assert!((sd - 2.0).abs() < 0.01);
    }

    #[test]
    fn std_dev_single_value_is_zero() {
        let c = Column::from_f64s("x", vec![5.0]);
        assert_eq!(c.std_dev(), Scalar::Float64(0.0));
    }

    #[test]
    fn std_dev_empty_and_non_numeric() {
        assert_eq!(Column::new_i64("x", vec![]).std_dev(), Scalar::Null);
        assert_eq!(Column::from_strs("x", &["a", "b"]).std_dev(), Scalar::Null);
    }

    #[test]
    fn n_unique_basic() {
        let c = Column::from_strs("x", &["a", "b", "a", "c"]);
        assert_eq!(c.n_unique(), 3);
    }

    #[test]
    fn n_unique_ignores_nulls() {
        let c = Column::new_i64("x", vec![Some(1), None, Some(1), Some(2), None]);
        assert_eq!(c.n_unique(), 2);
    }

    #[test]
    fn first_last() {
        let c = Column::new_i64("x", vec![None, Some(10), Some(20), None]);
        assert_eq!(c.first(), Scalar::Int64(10));
        assert_eq!(c.last(), Scalar::Int64(20));
    }

    #[test]
    fn first_last_all_null() {
        let c = Column::new_i64("x", vec![None, None]);
        assert_eq!(c.first(), Scalar::Null);
        assert_eq!(c.last(), Scalar::Null);
    }

    #[test]
    fn quantile_basic() {
        let c = Column::from_i64s("x", vec![1, 2, 3, 4, 5]);
        let q50 = c.quantile(0.5);
        assert_eq!(q50.ok(), Some(Scalar::Float64(3.0)));
        let q0 = c.quantile(0.0);
        assert_eq!(q0.ok(), Some(Scalar::Float64(1.0)));
        let q100 = c.quantile(1.0);
        assert_eq!(q100.ok(), Some(Scalar::Float64(5.0)));
    }

    #[test]
    fn quantile_interpolates() {
        // pos = 0.25 * 3 = 0.75, so the result is 1 * 0.25 + 2 * 0.75.
        let c = Column::from_i64s("x", vec![4, 1, 3, 2]);
        assert_eq!(c.quantile(0.25).ok(), Some(Scalar::Float64(1.75)));
    }

    #[test]
    fn quantile_empty_and_non_numeric() {
        let empty = Column::new_i64("x", vec![]);
        assert_eq!(empty.quantile(0.5).ok(), Some(Scalar::Null));
        let strs = Column::from_strs("x", &["a", "b"]);
        assert_eq!(strs.quantile(0.5).ok(), Some(Scalar::Null));
    }

    #[test]
    fn quantile_invalid() {
        let c = Column::from_i64s("x", vec![1, 2, 3]);
        assert!(c.quantile(1.5).is_err());
        assert!(c.quantile(-0.1).is_err());
        assert!(c.quantile(f64::NAN).is_err());
    }
}