// xls_rs/profiling/types.rs

//! Types for data profiling operations.
2
use serde::{Deserialize, Serialize};
4
5/// Column profile information
6#[derive(Debug, Clone, Serialize, Deserialize)]
7pub struct ColumnProfile {
8    pub name: String,
9    pub data_type: DataType,
10    pub null_count: usize,
11    pub null_percentage: f64,
12    pub unique_count: usize,
13    pub unique_percentage: f64,
14    pub distinct_values: Vec<String>,
15    pub top_values: Vec<ValueFrequency>,
16    pub length_stats: Option<LengthStats>,
17    pub numeric_stats: Option<NumericStats>,
18    pub date_stats: Option<DateStats>,
19    pub text_stats: Option<TextStats>,
20    pub quality_score: f64,
21}
22
23/// Data type classification
24#[derive(Debug, Clone, Serialize, Deserialize)]
25pub enum DataType {
26    String,
27    Integer,
28    Float,
29    Boolean,
30    Date,
31    DateTime,
32    Email,
33    Url,
34    Phone,
35    Unknown,
36}
37
38/// Value frequency information
39#[derive(Debug, Clone, Serialize, Deserialize)]
40pub struct ValueFrequency {
41    pub value: String,
42    pub count: usize,
43    pub percentage: f64,
44}
45
46/// Length statistics for text columns
47#[derive(Debug, Clone, Serialize, Deserialize)]
48pub struct LengthStats {
49    pub min_length: usize,
50    pub max_length: usize,
51    pub avg_length: f64,
52    pub median_length: usize,
53    pub std_dev_length: f64,
54}
55
56/// Numeric statistics
57#[derive(Debug, Clone, Serialize, Deserialize)]
58pub struct NumericStats {
59    pub min: f64,
60    pub max: f64,
61    pub mean: f64,
62    pub median: f64,
63    pub mode: Vec<String>,
64    pub std_dev: f64,
65    pub variance: f64,
66    pub q1: f64,
67    pub q3: f64,
68    pub iqr: f64,
69    pub skewness: f64,
70    pub kurtosis: f64,
71}
72
73/// Date statistics
74#[derive(Debug, Clone, Serialize, Deserialize)]
75pub struct DateStats {
76    pub min_date: String,
77    pub max_date: String,
78    pub date_range_days: i64,
79    pub most_common_year: u32,
80    pub most_common_month: u32,
81    pub most_common_day_of_week: String,
82}
83
84/// Text statistics
85#[derive(Debug, Clone, Serialize, Deserialize)]
86pub struct TextStats {
87    pub avg_word_count: f64,
88    pub max_word_count: usize,
89    pub min_word_count: usize,
90    pub contains_numbers: bool,
91    pub contains_special_chars: bool,
92    pub all_uppercase: usize,
93    pub all_lowercase: usize,
94    pub title_case: usize,
95    pub mixed_case: usize,
96}
97
98/// Overall data profile
99#[derive(Debug, Clone, Serialize, Deserialize)]
100pub struct DataProfile {
101    pub file_path: String,
102    pub total_rows: usize,
103    pub total_columns: usize,
104    pub total_cells: usize,
105    pub null_cells: usize,
106    pub null_percentage: f64,
107    pub duplicate_rows: usize,
108    pub duplicate_percentage: f64,
109    pub columns: Vec<ColumnProfile>,
110    pub data_quality_score: f64,
111    pub recommendations: Vec<String>,
112    pub profiling_timestamp: String,
113}