Skip to main content

dataprof_core/
profile.rs

1use std::collections::HashMap;
2
3use serde::{Deserialize, Serialize};
4
5use crate::classification::DataType;
6use crate::pattern::Pattern;
7
8/// Profiling statistics for a single column.
9#[derive(Debug, Clone, Serialize, Deserialize)]
10pub struct ColumnProfile {
11    pub name: String,
12    pub data_type: DataType,
13    pub null_count: usize,
14    pub total_count: usize,
15    pub unique_count: Option<usize>,
16    pub stats: ColumnStats,
17    pub patterns: Vec<Pattern>,
18}
19
20/// Quartile statistics for numeric distributions.
21#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
22pub struct Quartiles {
23    pub q1: f64,
24    pub q2: f64,
25    pub q3: f64,
26    pub iqr: f64,
27}
28
29/// A value and its frequency count within a column.
30#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
31pub struct FrequencyItem {
32    pub value: String,
33    pub count: usize,
34    #[serde(serialize_with = "crate::serde_helpers::round_2")]
35    pub percentage: f64,
36}
37
38/// Statistics for numeric (integer or float) columns.
39#[derive(Debug, Clone, Serialize, Deserialize)]
40pub struct NumericStats {
41    #[serde(serialize_with = "crate::serde_helpers::round_2")]
42    pub min: f64,
43    #[serde(serialize_with = "crate::serde_helpers::round_2")]
44    pub max: f64,
45    #[serde(serialize_with = "crate::serde_helpers::round_4")]
46    pub mean: f64,
47    #[serde(serialize_with = "crate::serde_helpers::round_4")]
48    pub std_dev: f64,
49    #[serde(serialize_with = "crate::serde_helpers::round_4")]
50    pub variance: f64,
51    #[serde(
52        skip_serializing_if = "Option::is_none",
53        serialize_with = "crate::serde_helpers::round_2_opt"
54    )]
55    pub median: Option<f64>,
56    #[serde(
57        skip_serializing_if = "Option::is_none",
58        serialize_with = "crate::serde_helpers::quartiles::serialize"
59    )]
60    pub quartiles: Option<Quartiles>,
61    #[serde(
62        skip_serializing_if = "Option::is_none",
63        serialize_with = "crate::serde_helpers::round_2_opt"
64    )]
65    pub mode: Option<f64>,
66    #[serde(
67        skip_serializing_if = "Option::is_none",
68        serialize_with = "crate::serde_helpers::round_2_opt"
69    )]
70    pub coefficient_of_variation: Option<f64>,
71    #[serde(
72        skip_serializing_if = "Option::is_none",
73        serialize_with = "crate::serde_helpers::round_4_opt"
74    )]
75    pub skewness: Option<f64>,
76    #[serde(
77        skip_serializing_if = "Option::is_none",
78        serialize_with = "crate::serde_helpers::round_4_opt"
79    )]
80    pub kurtosis: Option<f64>,
81    #[serde(skip_serializing_if = "Option::is_none")]
82    pub is_approximate: Option<bool>,
83    /// Number of values flagged as IQR-based outliers in this column.
84    ///
85    /// Uses the same Tukey-style detection (Q1 − k·IQR, Q3 + k·IQR with
86    /// k = 1.5 by default) that feeds the global `accuracy.outlier_ratio`.
87    /// `None` when outlier detection didn't run (sample below the configured
88    /// minimum or non-numeric column).
89    #[serde(default, skip_serializing_if = "Option::is_none")]
90    pub outlier_count: Option<usize>,
91}
92
93impl NumericStats {
94    pub fn empty() -> Self {
95        Self {
96            min: 0.0,
97            max: 0.0,
98            mean: 0.0,
99            std_dev: 0.0,
100            variance: 0.0,
101            median: None,
102            quartiles: None,
103            mode: None,
104            coefficient_of_variation: None,
105            skewness: None,
106            kurtosis: None,
107            is_approximate: None,
108            outlier_count: None,
109        }
110    }
111}
112
113/// Statistics for text/string columns.
114#[derive(Debug, Clone, Serialize, Deserialize)]
115pub struct TextStats {
116    pub min_length: usize,
117    pub max_length: usize,
118    #[serde(serialize_with = "crate::serde_helpers::round_2")]
119    pub avg_length: f64,
120    #[serde(skip_serializing_if = "Option::is_none")]
121    pub most_frequent: Option<Vec<FrequencyItem>>,
122    #[serde(skip_serializing_if = "Option::is_none")]
123    pub least_frequent: Option<Vec<FrequencyItem>>,
124}
125
126impl TextStats {
127    pub fn empty() -> Self {
128        Self {
129            min_length: 0,
130            max_length: 0,
131            avg_length: 0.0,
132            most_frequent: None,
133            least_frequent: None,
134        }
135    }
136
137    pub fn from_lengths(min_length: usize, max_length: usize, avg_length: f64) -> Self {
138        Self {
139            min_length: if min_length == usize::MAX {
140                0
141            } else {
142                min_length
143            },
144            max_length,
145            avg_length,
146            most_frequent: None,
147            least_frequent: None,
148        }
149    }
150}
151
152/// Statistics for date/datetime columns.
153#[derive(Debug, Clone, Serialize, Deserialize)]
154pub struct DateTimeStats {
155    pub min_datetime: String,
156    pub max_datetime: String,
157    #[serde(serialize_with = "crate::serde_helpers::round_2")]
158    pub duration_days: f64,
159    pub year_distribution: HashMap<i32, usize>,
160    pub month_distribution: HashMap<u32, usize>,
161    pub day_of_week_distribution: HashMap<String, usize>,
162    #[serde(skip_serializing_if = "Option::is_none")]
163    pub hour_distribution: Option<HashMap<u32, usize>>,
164}
165
166impl DateTimeStats {
167    pub fn empty() -> Self {
168        Self {
169            min_datetime: String::new(),
170            max_datetime: String::new(),
171            duration_days: 0.0,
172            year_distribution: HashMap::new(),
173            month_distribution: HashMap::new(),
174            day_of_week_distribution: HashMap::new(),
175            hour_distribution: None,
176        }
177    }
178}
179
180/// Statistics for boolean columns.
181#[derive(Debug, Clone, Serialize, Deserialize)]
182pub struct BooleanStats {
183    pub true_count: usize,
184    pub false_count: usize,
185    #[serde(serialize_with = "crate::serde_helpers::round_4")]
186    pub true_ratio: f64,
187}
188
189/// Type-specific statistics for a column, determined by the inferred data type.
190#[derive(Debug, Clone, Serialize, Deserialize)]
191pub enum ColumnStats {
192    Numeric(NumericStats),
193    Text(TextStats),
194    DateTime(DateTimeStats),
195    Boolean(BooleanStats),
196    None,
197}
198
#[cfg(test)]
mod tests {
    use super::*;

    /// Serializes `profile` to a JSON string and parses it back, panicking if
    /// either step fails.
    fn json_roundtrip(profile: &ColumnProfile) -> ColumnProfile {
        let encoded = serde_json::to_string(profile).unwrap();
        serde_json::from_str(&encoded).unwrap()
    }

    /// Returns `true` when `actual` is within `0.01` of `expected`.
    fn close(actual: f64, expected: f64) -> bool {
        (actual - expected).abs() < 0.01
    }

    /// A fully populated numeric profile keeps its identity fields and core
    /// statistics across a JSON round trip.
    #[test]
    fn test_column_profile_json_roundtrip() {
        let quartiles = Quartiles {
            q1: 25.0,
            q2: 50.0,
            q3: 75.0,
            iqr: 50.0,
        };
        let numeric = NumericStats {
            min: 1.0,
            max: 100.0,
            mean: 50.5,
            std_dev: 28.87,
            variance: 833.25,
            median: Some(50.0),
            quartiles: Some(quartiles),
            mode: Some(42.0),
            coefficient_of_variation: Some(57.17),
            skewness: Some(0.0),
            kurtosis: Some(-1.2),
            is_approximate: Some(false),
            outlier_count: Some(0),
        };
        let original = ColumnProfile {
            name: "test_col".to_string(),
            data_type: DataType::Integer,
            null_count: 2,
            total_count: 10,
            unique_count: Some(8),
            stats: ColumnStats::Numeric(numeric),
            patterns: vec![],
        };

        let restored = json_roundtrip(&original);

        assert_eq!(restored.name, "test_col");
        assert_eq!(restored.data_type, DataType::Integer);
        assert_eq!(restored.total_count, 10);
        assert_eq!(restored.null_count, 2);

        let stats = match &restored.stats {
            ColumnStats::Numeric(inner) => inner,
            _ => panic!("Expected Numeric stats after roundtrip"),
        };
        assert!(close(stats.min, 1.0));
        assert!(close(stats.max, 100.0));
        assert!(close(stats.mean, 50.5));
        assert!(stats.median.is_some());
        assert!(stats.quartiles.is_some());
    }

    /// A text profile keeps its data type and length bounds across a JSON
    /// round trip.
    #[test]
    fn test_text_stats_json_roundtrip() {
        let text = TextStats {
            min_length: 3,
            max_length: 7,
            avg_length: 5.0,
            most_frequent: None,
            least_frequent: None,
        };
        let original = ColumnProfile {
            name: "name".to_string(),
            data_type: DataType::String,
            null_count: 0,
            total_count: 3,
            unique_count: Some(3),
            stats: ColumnStats::Text(text),
            patterns: vec![],
        };

        let restored = json_roundtrip(&original);

        assert_eq!(restored.data_type, DataType::String);
        match &restored.stats {
            ColumnStats::Text(stats) => {
                assert_eq!(stats.min_length, 3);
                assert_eq!(stats.max_length, 7);
            }
            _ => panic!("Expected Text stats after roundtrip"),
        }
    }
}