sql_cli/sql/functions/
statistics.rs

1use crate::data::datatable::DataValue;
2use crate::sql::functions::{ArgCount, FunctionCategory, FunctionSignature, SqlFunction};
3use anyhow::{anyhow, Result};
4
5// Note: These are placeholder implementations for statistical functions.
6// In a full SQL engine, these would be handled as aggregate functions
7// that accumulate values row by row during query execution.
8// For now, these implementations work with single values or
9// would need to be integrated with the aggregate system.
10
11/// MEDIAN() - Returns the middle value (placeholder)
12pub struct MedianFunction;
13
14impl SqlFunction for MedianFunction {
15    fn signature(&self) -> FunctionSignature {
16        FunctionSignature {
17            name: "MEDIAN",
18            category: FunctionCategory::Statistical,
19            arg_count: ArgCount::Fixed(1),
20            description:
21                "Returns the median (middle value) of a numeric column (aggregate function)",
22            returns: "Numeric value representing the median",
23            examples: vec![
24                "SELECT MEDIAN(salary) FROM employees",
25                "SELECT department, MEDIAN(age) FROM users GROUP BY department",
26            ],
27        }
28    }
29
30    fn evaluate(&self, args: &[DataValue]) -> Result<DataValue> {
31        if args.len() != 1 {
32            return Err(anyhow!("MEDIAN requires exactly 1 argument"));
33        }
34
35        // For now, just return the input as this needs aggregate handling
36        match &args[0] {
37            DataValue::Integer(i) => Ok(DataValue::Float(*i as f64)),
38            DataValue::Float(f) => Ok(DataValue::Float(*f)),
39            _ => Ok(DataValue::Null),
40        }
41    }
42}
43
44/// PERCENTILE() - Returns the nth percentile value (placeholder)
45pub struct PercentileFunction;
46
47impl SqlFunction for PercentileFunction {
48    fn signature(&self) -> FunctionSignature {
49        FunctionSignature {
50            name: "PERCENTILE",
51            category: FunctionCategory::Statistical,
52            arg_count: ArgCount::Fixed(2),
53            description: "Returns the nth percentile of values (0-100) (aggregate function)",
54            returns: "Numeric value at the specified percentile",
55            examples: vec![
56                "SELECT PERCENTILE(score, 75) FROM tests", // 75th percentile
57                "SELECT PERCENTILE(income, 50) FROM users", // 50th percentile (median)
58                "SELECT PERCENTILE(response_time, 95) FROM requests", // 95th percentile
59            ],
60        }
61    }
62
63    fn evaluate(&self, args: &[DataValue]) -> Result<DataValue> {
64        if args.len() != 2 {
65            return Err(anyhow!(
66                "PERCENTILE requires exactly 2 arguments: (value, percentile)"
67            ));
68        }
69
70        // Extract the percentile value (0-100)
71        let percentile = match &args[1] {
72            DataValue::Integer(i) => *i as f64,
73            DataValue::Float(f) => *f,
74            _ => return Err(anyhow!("Percentile must be a number between 0 and 100")),
75        };
76
77        if percentile < 0.0 || percentile > 100.0 {
78            return Err(anyhow!("Percentile must be between 0 and 100"));
79        }
80
81        // For now, just return the input as this needs aggregate handling
82        match &args[0] {
83            DataValue::Integer(i) => Ok(DataValue::Float(*i as f64)),
84            DataValue::Float(f) => Ok(DataValue::Float(*f)),
85            _ => Ok(DataValue::Null),
86        }
87    }
88}
89
90/// MODE() - Returns the most frequent value (placeholder)
91pub struct ModeFunction;
92
93impl SqlFunction for ModeFunction {
94    fn signature(&self) -> FunctionSignature {
95        FunctionSignature {
96            name: "MODE",
97            category: FunctionCategory::Statistical,
98            arg_count: ArgCount::Fixed(1),
99            description: "Returns the most frequently occurring value (aggregate function)",
100            returns: "The mode value (most frequent)",
101            examples: vec![
102                "SELECT MODE(category) FROM products",
103                "SELECT MODE(rating) FROM reviews",
104                "SELECT department, MODE(job_level) FROM employees GROUP BY department",
105            ],
106        }
107    }
108
109    fn evaluate(&self, args: &[DataValue]) -> Result<DataValue> {
110        if args.len() != 1 {
111            return Err(anyhow!("MODE requires exactly 1 argument"));
112        }
113
114        // For now, just return the input as this needs aggregate handling
115        Ok(args[0].clone())
116    }
117}
118
119/// VARIANCE() - Population variance (placeholder)
120pub struct VarianceFunction;
121
122impl SqlFunction for VarianceFunction {
123    fn signature(&self) -> FunctionSignature {
124        FunctionSignature {
125            name: "VARIANCE",
126            category: FunctionCategory::Statistical,
127            arg_count: ArgCount::Fixed(1),
128            description:
129                "Calculates the population variance of numeric values (aggregate function)",
130            returns: "Numeric value representing the variance",
131            examples: vec![
132                "SELECT VARIANCE(price) FROM products",
133                "SELECT category, VARIANCE(quantity) FROM inventory GROUP BY category",
134            ],
135        }
136    }
137
138    fn evaluate(&self, args: &[DataValue]) -> Result<DataValue> {
139        if args.len() != 1 {
140            return Err(anyhow!("VARIANCE requires exactly 1 argument"));
141        }
142
143        // For now, just return 0 as this needs aggregate handling
144        match &args[0] {
145            DataValue::Integer(_) | DataValue::Float(_) => Ok(DataValue::Float(0.0)),
146            _ => Ok(DataValue::Null),
147        }
148    }
149}
150
151/// VAR_SAMP() - Sample variance (placeholder)
152pub struct VarSampFunction;
153
154impl SqlFunction for VarSampFunction {
155    fn signature(&self) -> FunctionSignature {
156        FunctionSignature {
157            name: "VAR_SAMP",
158            category: FunctionCategory::Statistical,
159            arg_count: ArgCount::Fixed(1),
160            description: "Calculates the sample variance of numeric values (aggregate function)",
161            returns: "Numeric value representing the sample variance",
162            examples: vec![
163                "SELECT VAR_SAMP(score) FROM test_results",
164                "SELECT class, VAR_SAMP(height) FROM students GROUP BY class",
165            ],
166        }
167    }
168
169    fn evaluate(&self, args: &[DataValue]) -> Result<DataValue> {
170        if args.len() != 1 {
171            return Err(anyhow!("VAR_SAMP requires exactly 1 argument"));
172        }
173
174        // For now, just return 0 as this needs aggregate handling
175        match &args[0] {
176            DataValue::Integer(_) | DataValue::Float(_) => Ok(DataValue::Float(0.0)),
177            _ => Ok(DataValue::Null),
178        }
179    }
180}
181
182/// VAR_POP() - Population variance (alias for VARIANCE) (placeholder)
183pub struct VarPopFunction;
184
185impl SqlFunction for VarPopFunction {
186    fn signature(&self) -> FunctionSignature {
187        FunctionSignature {
188            name: "VAR_POP",
189            category: FunctionCategory::Statistical,
190            arg_count: ArgCount::Fixed(1),
191            description:
192                "Calculates the population variance (same as VARIANCE) (aggregate function)",
193            returns: "Numeric value representing the population variance",
194            examples: vec!["SELECT VAR_POP(amount) FROM transactions"],
195        }
196    }
197
198    fn evaluate(&self, args: &[DataValue]) -> Result<DataValue> {
199        // Delegate to VARIANCE function
200        VarianceFunction.evaluate(args)
201    }
202}
203
204/// CORR() - Correlation coefficient (placeholder)
205pub struct CorrelationFunction;
206
207impl SqlFunction for CorrelationFunction {
208    fn signature(&self) -> FunctionSignature {
209        FunctionSignature {
210            name: "CORR",
211            category: FunctionCategory::Statistical,
212            arg_count: ArgCount::Fixed(2),
213            description: "Calculates the Pearson correlation coefficient between two columns (aggregate function)",
214            returns: "Numeric value between -1 and 1",
215            examples: vec![
216                "SELECT CORR(height, weight) FROM people",
217                "SELECT CORR(temperature, ice_cream_sales) FROM daily_data",
218                "SELECT month, CORR(advertising_spend, revenue) FROM sales GROUP BY month",
219            ],
220        }
221    }
222
223    fn evaluate(&self, args: &[DataValue]) -> Result<DataValue> {
224        if args.len() != 2 {
225            return Err(anyhow!("CORR requires exactly 2 arguments"));
226        }
227
228        // For now, just return 0 as this needs aggregate handling
229        match (&args[0], &args[1]) {
230            (DataValue::Integer(_), DataValue::Integer(_))
231            | (DataValue::Float(_), DataValue::Float(_))
232            | (DataValue::Integer(_), DataValue::Float(_))
233            | (DataValue::Float(_), DataValue::Integer(_)) => Ok(DataValue::Float(0.0)),
234            _ => Ok(DataValue::Null),
235        }
236    }
237}
238
239// Additional useful statistical functions that work on single values
240
241/// SKEW() - Calculate skewness of a single value from mean
242pub struct SkewFunction;
243
244impl SqlFunction for SkewFunction {
245    fn signature(&self) -> FunctionSignature {
246        FunctionSignature {
247            name: "SKEW",
248            category: FunctionCategory::Statistical,
249            arg_count: ArgCount::Fixed(3),
250            description: "Calculate skewness contribution: ((x - mean) / stddev)^3",
251            returns: "Skewness contribution of a single value",
252            examples: vec![
253                "SELECT SKEW(value, mean, stddev) FROM measurements",
254                "SELECT SKEW(100, 85, 15) -- returns ~1.0 for value one std dev above mean",
255            ],
256        }
257    }
258
259    fn evaluate(&self, args: &[DataValue]) -> Result<DataValue> {
260        if args.len() != 3 {
261            return Err(anyhow!("SKEW requires 3 arguments: value, mean, stddev"));
262        }
263
264        let value = match &args[0] {
265            DataValue::Integer(i) => *i as f64,
266            DataValue::Float(f) => *f,
267            _ => return Ok(DataValue::Null),
268        };
269
270        let mean = match &args[1] {
271            DataValue::Integer(i) => *i as f64,
272            DataValue::Float(f) => *f,
273            _ => return Ok(DataValue::Null),
274        };
275
276        let stddev = match &args[2] {
277            DataValue::Integer(i) => *i as f64,
278            DataValue::Float(f) => *f,
279            _ => return Ok(DataValue::Null),
280        };
281
282        if stddev == 0.0 {
283            return Ok(DataValue::Null);
284        }
285
286        let z_score = (value - mean) / stddev;
287        Ok(DataValue::Float(z_score.powi(3)))
288    }
289}
290
291/// KURTOSIS() - Calculate kurtosis of a single value from mean
292pub struct KurtosisFunction;
293
294impl SqlFunction for KurtosisFunction {
295    fn signature(&self) -> FunctionSignature {
296        FunctionSignature {
297            name: "KURTOSIS",
298            category: FunctionCategory::Statistical,
299            arg_count: ArgCount::Fixed(3),
300            description: "Calculate kurtosis contribution: ((x - mean) / stddev)^4",
301            returns: "Kurtosis contribution of a single value",
302            examples: vec![
303                "SELECT KURTOSIS(value, mean, stddev) FROM measurements",
304                "SELECT KURTOSIS(100, 85, 15) -- returns 1.0 for value one std dev above mean",
305            ],
306        }
307    }
308
309    fn evaluate(&self, args: &[DataValue]) -> Result<DataValue> {
310        if args.len() != 3 {
311            return Err(anyhow!(
312                "KURTOSIS requires 3 arguments: value, mean, stddev"
313            ));
314        }
315
316        let value = match &args[0] {
317            DataValue::Integer(i) => *i as f64,
318            DataValue::Float(f) => *f,
319            _ => return Ok(DataValue::Null),
320        };
321
322        let mean = match &args[1] {
323            DataValue::Integer(i) => *i as f64,
324            DataValue::Float(f) => *f,
325            _ => return Ok(DataValue::Null),
326        };
327
328        let stddev = match &args[2] {
329            DataValue::Integer(i) => *i as f64,
330            DataValue::Float(f) => *f,
331            _ => return Ok(DataValue::Null),
332        };
333
334        if stddev == 0.0 {
335            return Ok(DataValue::Null);
336        }
337
338        let z_score = (value - mean) / stddev;
339        Ok(DataValue::Float(z_score.powi(4)))
340    }
341}