rsv_lib/utils/
column_stats.rs

1use super::{
2    column_type::{ColumnType, ColumnTypes},
3    row_split::CsvRowSplitter,
4    util,
5};
6use ahash::HashSet;
7use calamine::Data;
8use rayon::prelude::*;
9use std::fmt::Display;
10use tabled::{builder::Builder, settings::Style, Table};
11
12#[derive(Debug)]
13pub struct ColumnStats {
14    max_col: usize,
15    cols: Vec<usize>,
16    pub stat: Vec<CStat>,
17    pub rows: usize,
18}
19
20#[derive(Debug)]
21pub struct CStat {
22    col_index: usize,
23    col_type: ColumnType,
24    name: String,
25    min: f64,
26    max: f64,
27    min_string: String,
28    max_string: String,
29    mean: f64,
30    unique: usize,
31    null: usize,
32    total: f64,
33    unique_hashset: HashSet<String>,
34}
35
36impl ColumnStats {
37    pub fn new(col_type: &ColumnTypes, col_name: &[String]) -> Self {
38        let mut s = ColumnStats {
39            max_col: 0,
40            cols: vec![],
41            stat: vec![],
42            rows: 0,
43        };
44        col_type
45            .iter()
46            .for_each(|c| s.push(c.col_index, c.col_type.clone(), &col_name[c.col_index]));
47
48        s
49    }
50
51    fn push(&mut self, col_index: usize, col_type: ColumnType, name: &str) {
52        let stat = CStat {
53            col_index,
54            col_type,
55            name: name.to_owned(),
56            min: f64::MAX,
57            max: f64::MIN,
58            min_string: String::new(),
59            max_string: String::new(),
60            mean: 0.0,
61            total: 0.0,
62            unique: 0,
63            null: 0,
64            unique_hashset: HashSet::default(),
65        };
66        self.cols.push(col_index);
67        self.stat.push(stat);
68
69        if col_index > self.max_col {
70            self.max_col = col_index
71        }
72    }
73
74    pub fn parse_line_by_fields(&mut self, v: &[&str]) {
75        if self.max_col >= v.len() {
76            println!("[info] ignore a bad line: {v:?}");
77            return;
78        }
79
80        self.cols
81            .iter()
82            .zip(&mut self.stat)
83            .for_each(|(&i, c)| c.parse(v[i]));
84
85        self.rows += 1;
86    }
87
88    pub fn parse_line(&mut self, line: &str, sep: char, quote: char) {
89        let v = CsvRowSplitter::new(line, sep, quote).collect::<Vec<_>>();
90        self.parse_line_by_fields(&v);
91    }
92
93    pub fn parse_excel_row(&mut self, v: &[Data]) {
94        if self.max_col >= v.len() {
95            println!("[info] ignore a bad line: {v:?}");
96            return;
97        }
98
99        self.cols.iter().zip(&mut self.stat).for_each(|(&i, c)| {
100            let t = &v[i];
101            match t {
102                Data::String(v) => c.parse(v),
103                _ => c.parse(&t.to_string()),
104            };
105        });
106
107        self.rows += 1;
108    }
109
110    pub fn cal_unique_and_mean(&mut self) {
111        self.stat.iter_mut().for_each(|s| {
112            s.unique = s.unique_hashset.len();
113
114            match s.col_type {
115                ColumnType::Float | ColumnType::Int => {
116                    let n = self.rows - s.null;
117                    if n != 0 {
118                        s.mean = s.total / n as f64;
119                    }
120                }
121                _ => {}
122            }
123        })
124    }
125
126    fn iter(&self) -> impl Iterator<Item = &CStat> {
127        self.stat.iter()
128    }
129
130    pub fn merge(&mut self, other: ColumnStats) {
131        self.rows += other.rows;
132
133        // parallel update
134        other
135            .stat
136            .into_par_iter()
137            .zip(&mut self.stat)
138            .for_each(|(o, c)| c.merge(o));
139    }
140
141    fn print_table_vertical(&self) -> Table {
142        let mut builder = Builder::default();
143
144        // header
145        let r = ["col", "type", "min", "max", "mean", "unique", "null"];
146        builder.push_record(r);
147
148        // columns
149        self.iter().for_each(|c| {
150            let mut r = vec![];
151            r.push(c.name.to_owned());
152            r.push(format!("{}", c.col_type));
153            r.push(c.min_fmt());
154            r.push(c.max_fmt());
155            r.push(c.mean_fmt());
156            r.push(c.unique_fmt());
157            r.push(c.null.to_string());
158            builder.push_record(r);
159        });
160
161        // build
162        let mut table = builder.build();
163
164        // style
165        table.with(Style::sharp());
166
167        table
168    }
169
170    pub fn print(&self) {
171        let table = self.print_table_vertical();
172        println!("{table}");
173    }
174}
175
176impl Clone for ColumnStats {
177    fn clone(&self) -> Self {
178        let mut o = ColumnStats {
179            max_col: self.max_col,
180            stat: vec![],
181            cols: vec![],
182            rows: 0,
183        };
184
185        self.iter()
186            .for_each(|c| o.push(c.col_index, c.col_type.clone(), &c.name));
187
188        o
189    }
190}
191
192impl Display for ColumnStats {
193    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
194        let t = self.print_table_vertical().to_string();
195        f.write_str(&t)?;
196
197        Ok(())
198    }
199}
200
201impl CStat {
202    pub fn parse(&mut self, f: &str) {
203        if util::is_null(f) {
204            self.null += 1;
205            return;
206        }
207        match self.col_type {
208            ColumnType::Int => {
209                if let Ok(v) = f.parse::<i64>() {
210                    self.update_number_stat(v as f64)
211                } else if let Ok(v) = f.parse::<f64>() {
212                    self.set_as_float();
213                    self.update_number_stat(v)
214                } else {
215                    self.set_as_string();
216                    self.update_string_stat(f)
217                }
218            }
219            ColumnType::Float => {
220                if let Ok(v) = f.parse::<f64>() {
221                    self.update_number_stat(v)
222                } else {
223                    self.set_as_string();
224                    self.update_string_stat(f)
225                }
226            }
227            ColumnType::String => self.update_string_stat(f),
228            ColumnType::Null => {}
229        }
230        // ignore unique for FLOAT type
231        if !self.is_float() {
232            self.insert_unique(f);
233        }
234    }
235
236    fn insert_unique(&mut self, v: &str) {
237        // quicker compared with no check
238        if !self.unique_hashset.contains(v) {
239            self.unique_hashset.insert(v.to_owned());
240        }
241    }
242
243    fn set_as_float(&mut self) {
244        self.col_type = ColumnType::Float
245    }
246
247    fn set_as_string(&mut self) {
248        self.col_type = ColumnType::String
249    }
250
251    fn is_int(&self) -> bool {
252        self.col_type == ColumnType::Int
253    }
254
255    fn is_float(&self) -> bool {
256        self.col_type == ColumnType::Float
257    }
258
259    fn is_string(&self) -> bool {
260        self.col_type == ColumnType::String
261    }
262
263    fn update_number_stat(&mut self, v: f64) {
264        if v > self.max {
265            self.max = v
266        }
267        if v < self.min {
268            self.min = v
269        }
270        self.total += v;
271    }
272
273    fn update_string_stat(&mut self, v: &str) {
274        if self.min_string.is_empty() || v < &self.min_string {
275            self.min_string = v.to_owned();
276        }
277        if v > &self.max_string {
278            self.max_string = v.to_owned();
279        }
280    }
281
282    fn merge(&mut self, o: CStat) {
283        if self.col_type != o.col_type {
284            self.col_type = o.col_type
285        }
286        if o.min < self.min {
287            self.min = o.min;
288        }
289        if o.max > self.max {
290            self.max = o.max
291        }
292        if self.min_string.is_empty() || o.min_string < self.min_string {
293            self.min_string = o.min_string
294        }
295        if o.max_string > self.max_string {
296            self.max_string = o.max_string
297        }
298        self.null += o.null;
299        self.total += o.total;
300        self.unique_hashset.extend(o.unique_hashset)
301    }
302
303    fn mean_fmt(&self) -> String {
304        if self.is_string() {
305            "-".to_owned()
306        } else {
307            format!("{:.2}", self.mean)
308        }
309    }
310
311    fn min_fmt(&self) -> String {
312        if self.is_string() {
313            self.min_string.to_owned()
314        } else if self.is_int() {
315            format!("{:.0}", if self.min == f64::MAX { 0.0 } else { self.min })
316        } else {
317            format!("{:.2}", if self.min == f64::MAX { 0.0 } else { self.min })
318        }
319    }
320
321    fn max_fmt(&self) -> String {
322        if self.is_string() {
323            self.max_string.to_owned()
324        } else if self.is_int() {
325            format!("{:.0}", if self.max == f64::MIN { 0.0 } else { self.max })
326        } else {
327            format!("{:.2}", if self.max == f64::MIN { 0.0 } else { self.max })
328        }
329    }
330
331    fn unique_fmt(&self) -> String {
332        if self.is_float() {
333            "-".to_owned()
334        } else {
335            self.unique.to_string()
336        }
337    }
338}