1use std::collections::HashMap;
2
3use serde::{Deserialize, Serialize};
4
5use crate::classification::DataType;
6use crate::pattern::Pattern;
7
8#[derive(Debug, Clone, Serialize, Deserialize)]
10pub struct ColumnProfile {
11 pub name: String,
12 pub data_type: DataType,
13 pub null_count: usize,
14 pub total_count: usize,
15 pub unique_count: Option<usize>,
16 pub stats: ColumnStats,
17 pub patterns: Vec<Pattern>,
18}
19
20#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
22pub struct Quartiles {
23 pub q1: f64,
24 pub q2: f64,
25 pub q3: f64,
26 pub iqr: f64,
27}
28
29#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
31pub struct FrequencyItem {
32 pub value: String,
33 pub count: usize,
34 #[serde(serialize_with = "crate::serde_helpers::round_2")]
35 pub percentage: f64,
36}
37
38#[derive(Debug, Clone, Serialize, Deserialize)]
40pub struct NumericStats {
41 #[serde(serialize_with = "crate::serde_helpers::round_2")]
42 pub min: f64,
43 #[serde(serialize_with = "crate::serde_helpers::round_2")]
44 pub max: f64,
45 #[serde(serialize_with = "crate::serde_helpers::round_4")]
46 pub mean: f64,
47 #[serde(serialize_with = "crate::serde_helpers::round_4")]
48 pub std_dev: f64,
49 #[serde(serialize_with = "crate::serde_helpers::round_4")]
50 pub variance: f64,
51 #[serde(
52 skip_serializing_if = "Option::is_none",
53 serialize_with = "crate::serde_helpers::round_2_opt"
54 )]
55 pub median: Option<f64>,
56 #[serde(
57 skip_serializing_if = "Option::is_none",
58 serialize_with = "crate::serde_helpers::quartiles::serialize"
59 )]
60 pub quartiles: Option<Quartiles>,
61 #[serde(
62 skip_serializing_if = "Option::is_none",
63 serialize_with = "crate::serde_helpers::round_2_opt"
64 )]
65 pub mode: Option<f64>,
66 #[serde(
67 skip_serializing_if = "Option::is_none",
68 serialize_with = "crate::serde_helpers::round_2_opt"
69 )]
70 pub coefficient_of_variation: Option<f64>,
71 #[serde(
72 skip_serializing_if = "Option::is_none",
73 serialize_with = "crate::serde_helpers::round_4_opt"
74 )]
75 pub skewness: Option<f64>,
76 #[serde(
77 skip_serializing_if = "Option::is_none",
78 serialize_with = "crate::serde_helpers::round_4_opt"
79 )]
80 pub kurtosis: Option<f64>,
81 #[serde(skip_serializing_if = "Option::is_none")]
82 pub is_approximate: Option<bool>,
83 #[serde(default, skip_serializing_if = "Option::is_none")]
90 pub outlier_count: Option<usize>,
91}
92
93impl NumericStats {
94 pub fn empty() -> Self {
95 Self {
96 min: 0.0,
97 max: 0.0,
98 mean: 0.0,
99 std_dev: 0.0,
100 variance: 0.0,
101 median: None,
102 quartiles: None,
103 mode: None,
104 coefficient_of_variation: None,
105 skewness: None,
106 kurtosis: None,
107 is_approximate: None,
108 outlier_count: None,
109 }
110 }
111}
112
113#[derive(Debug, Clone, Serialize, Deserialize)]
115pub struct TextStats {
116 pub min_length: usize,
117 pub max_length: usize,
118 #[serde(serialize_with = "crate::serde_helpers::round_2")]
119 pub avg_length: f64,
120 #[serde(skip_serializing_if = "Option::is_none")]
121 pub most_frequent: Option<Vec<FrequencyItem>>,
122 #[serde(skip_serializing_if = "Option::is_none")]
123 pub least_frequent: Option<Vec<FrequencyItem>>,
124}
125
126impl TextStats {
127 pub fn empty() -> Self {
128 Self {
129 min_length: 0,
130 max_length: 0,
131 avg_length: 0.0,
132 most_frequent: None,
133 least_frequent: None,
134 }
135 }
136
137 pub fn from_lengths(min_length: usize, max_length: usize, avg_length: f64) -> Self {
138 Self {
139 min_length: if min_length == usize::MAX {
140 0
141 } else {
142 min_length
143 },
144 max_length,
145 avg_length,
146 most_frequent: None,
147 least_frequent: None,
148 }
149 }
150}
151
152#[derive(Debug, Clone, Serialize, Deserialize)]
154pub struct DateTimeStats {
155 pub min_datetime: String,
156 pub max_datetime: String,
157 #[serde(serialize_with = "crate::serde_helpers::round_2")]
158 pub duration_days: f64,
159 pub year_distribution: HashMap<i32, usize>,
160 pub month_distribution: HashMap<u32, usize>,
161 pub day_of_week_distribution: HashMap<String, usize>,
162 #[serde(skip_serializing_if = "Option::is_none")]
163 pub hour_distribution: Option<HashMap<u32, usize>>,
164}
165
166impl DateTimeStats {
167 pub fn empty() -> Self {
168 Self {
169 min_datetime: String::new(),
170 max_datetime: String::new(),
171 duration_days: 0.0,
172 year_distribution: HashMap::new(),
173 month_distribution: HashMap::new(),
174 day_of_week_distribution: HashMap::new(),
175 hour_distribution: None,
176 }
177 }
178}
179
180#[derive(Debug, Clone, Serialize, Deserialize)]
182pub struct BooleanStats {
183 pub true_count: usize,
184 pub false_count: usize,
185 #[serde(serialize_with = "crate::serde_helpers::round_4")]
186 pub true_ratio: f64,
187}
188
189#[derive(Debug, Clone, Serialize, Deserialize)]
191pub enum ColumnStats {
192 Numeric(NumericStats),
193 Text(TextStats),
194 DateTime(DateTimeStats),
195 Boolean(BooleanStats),
196 None,
197}
198
#[cfg(test)]
mod tests {
    use super::*;

    /// Serializes `profile` to a JSON string and parses it straight back.
    /// Panics (failing the test) if either direction errors.
    fn json_roundtrip(profile: &ColumnProfile) -> ColumnProfile {
        let encoded = serde_json::to_string(profile).unwrap();
        serde_json::from_str(&encoded).unwrap()
    }

    /// A fully populated numeric profile survives a JSON round trip.
    #[test]
    fn test_column_profile_json_roundtrip() {
        let quartiles = Quartiles {
            q1: 25.0,
            q2: 50.0,
            q3: 75.0,
            iqr: 50.0,
        };
        let numeric = NumericStats {
            min: 1.0,
            max: 100.0,
            mean: 50.5,
            std_dev: 28.87,
            variance: 833.25,
            median: Some(50.0),
            quartiles: Some(quartiles),
            mode: Some(42.0),
            coefficient_of_variation: Some(57.17),
            skewness: Some(0.0),
            kurtosis: Some(-1.2),
            is_approximate: Some(false),
            outlier_count: Some(0),
        };
        let original = ColumnProfile {
            name: "test_col".to_string(),
            data_type: DataType::Integer,
            null_count: 2,
            total_count: 10,
            unique_count: Some(8),
            stats: ColumnStats::Numeric(numeric),
            patterns: vec![],
        };

        let restored = json_roundtrip(&original);

        assert_eq!(restored.name, "test_col");
        assert_eq!(restored.data_type, DataType::Integer);
        assert_eq!(restored.total_count, 10);
        assert_eq!(restored.null_count, 2);

        match &restored.stats {
            ColumnStats::Numeric(stats) => {
                // Floats go through the rounding serializers, so compare with a tolerance.
                assert!((stats.min - 1.0).abs() < 0.01);
                assert!((stats.max - 100.0).abs() < 0.01);
                assert!((stats.mean - 50.5).abs() < 0.01);
                assert!(stats.median.is_some());
                assert!(stats.quartiles.is_some());
            }
            _ => panic!("Expected Numeric stats after roundtrip"),
        }
    }

    /// A text profile without frequency listings survives a JSON round trip.
    #[test]
    fn test_text_stats_json_roundtrip() {
        let text = TextStats {
            min_length: 3,
            max_length: 7,
            avg_length: 5.0,
            most_frequent: None,
            least_frequent: None,
        };
        let original = ColumnProfile {
            name: "name".to_string(),
            data_type: DataType::String,
            null_count: 0,
            total_count: 3,
            unique_count: Some(3),
            stats: ColumnStats::Text(text),
            patterns: vec![],
        };

        let restored = json_roundtrip(&original);

        assert_eq!(restored.data_type, DataType::String);
        match &restored.stats {
            ColumnStats::Text(stats) => {
                assert_eq!(stats.min_length, 3);
                assert_eq!(stats.max_length, 7);
            }
            _ => panic!("Expected Text stats after roundtrip"),
        }
    }
}