rsv_lib/utils/
column_stats.rs1use super::{
2 column_type::{ColumnType, ColumnTypes},
3 row_split::CsvRowSplitter,
4 util,
5};
6use ahash::HashSet;
7use calamine::Data;
8use rayon::prelude::*;
9use std::fmt::Display;
10use tabled::{builder::Builder, settings::Style, Table};
11
12#[derive(Debug)]
13pub struct ColumnStats {
14 max_col: usize,
15 cols: Vec<usize>,
16 pub stat: Vec<CStat>,
17 pub rows: usize,
18}
19
20#[derive(Debug)]
21pub struct CStat {
22 col_index: usize,
23 col_type: ColumnType,
24 name: String,
25 min: f64,
26 max: f64,
27 min_string: String,
28 max_string: String,
29 mean: f64,
30 unique: usize,
31 null: usize,
32 total: f64,
33 unique_hashset: HashSet<String>,
34}
35
36impl ColumnStats {
37 pub fn new(col_type: &ColumnTypes, col_name: &[String]) -> Self {
38 let mut s = ColumnStats {
39 max_col: 0,
40 cols: vec![],
41 stat: vec![],
42 rows: 0,
43 };
44 col_type
45 .iter()
46 .for_each(|c| s.push(c.col_index, c.col_type.clone(), &col_name[c.col_index]));
47
48 s
49 }
50
51 fn push(&mut self, col_index: usize, col_type: ColumnType, name: &str) {
52 let stat = CStat {
53 col_index,
54 col_type,
55 name: name.to_owned(),
56 min: f64::MAX,
57 max: f64::MIN,
58 min_string: String::new(),
59 max_string: String::new(),
60 mean: 0.0,
61 total: 0.0,
62 unique: 0,
63 null: 0,
64 unique_hashset: HashSet::default(),
65 };
66 self.cols.push(col_index);
67 self.stat.push(stat);
68
69 if col_index > self.max_col {
70 self.max_col = col_index
71 }
72 }
73
74 pub fn parse_line_by_fields(&mut self, v: &[&str]) {
75 if self.max_col >= v.len() {
76 println!("[info] ignore a bad line: {v:?}");
77 return;
78 }
79
80 self.cols
81 .iter()
82 .zip(&mut self.stat)
83 .for_each(|(&i, c)| c.parse(v[i]));
84
85 self.rows += 1;
86 }
87
88 pub fn parse_line(&mut self, line: &str, sep: char, quote: char) {
89 let v = CsvRowSplitter::new(line, sep, quote).collect::<Vec<_>>();
90 self.parse_line_by_fields(&v);
91 }
92
93 pub fn parse_excel_row(&mut self, v: &[Data]) {
94 if self.max_col >= v.len() {
95 println!("[info] ignore a bad line: {v:?}");
96 return;
97 }
98
99 self.cols.iter().zip(&mut self.stat).for_each(|(&i, c)| {
100 let t = &v[i];
101 match t {
102 Data::String(v) => c.parse(v),
103 _ => c.parse(&t.to_string()),
104 };
105 });
106
107 self.rows += 1;
108 }
109
110 pub fn cal_unique_and_mean(&mut self) {
111 self.stat.iter_mut().for_each(|s| {
112 s.unique = s.unique_hashset.len();
113
114 match s.col_type {
115 ColumnType::Float | ColumnType::Int => {
116 let n = self.rows - s.null;
117 if n != 0 {
118 s.mean = s.total / n as f64;
119 }
120 }
121 _ => {}
122 }
123 })
124 }
125
126 fn iter(&self) -> impl Iterator<Item = &CStat> {
127 self.stat.iter()
128 }
129
130 pub fn merge(&mut self, other: ColumnStats) {
131 self.rows += other.rows;
132
133 other
135 .stat
136 .into_par_iter()
137 .zip(&mut self.stat)
138 .for_each(|(o, c)| c.merge(o));
139 }
140
141 fn print_table_vertical(&self) -> Table {
142 let mut builder = Builder::default();
143
144 let r = ["col", "type", "min", "max", "mean", "unique", "null"];
146 builder.push_record(r);
147
148 self.iter().for_each(|c| {
150 let mut r = vec![];
151 r.push(c.name.to_owned());
152 r.push(format!("{}", c.col_type));
153 r.push(c.min_fmt());
154 r.push(c.max_fmt());
155 r.push(c.mean_fmt());
156 r.push(c.unique_fmt());
157 r.push(c.null.to_string());
158 builder.push_record(r);
159 });
160
161 let mut table = builder.build();
163
164 table.with(Style::sharp());
166
167 table
168 }
169
170 pub fn print(&self) {
171 let table = self.print_table_vertical();
172 println!("{table}");
173 }
174}
175
176impl Clone for ColumnStats {
177 fn clone(&self) -> Self {
178 let mut o = ColumnStats {
179 max_col: self.max_col,
180 stat: vec![],
181 cols: vec![],
182 rows: 0,
183 };
184
185 self.iter()
186 .for_each(|c| o.push(c.col_index, c.col_type.clone(), &c.name));
187
188 o
189 }
190}
191
192impl Display for ColumnStats {
193 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
194 let t = self.print_table_vertical().to_string();
195 f.write_str(&t)?;
196
197 Ok(())
198 }
199}
200
201impl CStat {
202 pub fn parse(&mut self, f: &str) {
203 if util::is_null(f) {
204 self.null += 1;
205 return;
206 }
207 match self.col_type {
208 ColumnType::Int => {
209 if let Ok(v) = f.parse::<i64>() {
210 self.update_number_stat(v as f64)
211 } else if let Ok(v) = f.parse::<f64>() {
212 self.set_as_float();
213 self.update_number_stat(v)
214 } else {
215 self.set_as_string();
216 self.update_string_stat(f)
217 }
218 }
219 ColumnType::Float => {
220 if let Ok(v) = f.parse::<f64>() {
221 self.update_number_stat(v)
222 } else {
223 self.set_as_string();
224 self.update_string_stat(f)
225 }
226 }
227 ColumnType::String => self.update_string_stat(f),
228 ColumnType::Null => {}
229 }
230 if !self.is_float() {
232 self.insert_unique(f);
233 }
234 }
235
236 fn insert_unique(&mut self, v: &str) {
237 if !self.unique_hashset.contains(v) {
239 self.unique_hashset.insert(v.to_owned());
240 }
241 }
242
243 fn set_as_float(&mut self) {
244 self.col_type = ColumnType::Float
245 }
246
247 fn set_as_string(&mut self) {
248 self.col_type = ColumnType::String
249 }
250
251 fn is_int(&self) -> bool {
252 self.col_type == ColumnType::Int
253 }
254
255 fn is_float(&self) -> bool {
256 self.col_type == ColumnType::Float
257 }
258
259 fn is_string(&self) -> bool {
260 self.col_type == ColumnType::String
261 }
262
263 fn update_number_stat(&mut self, v: f64) {
264 if v > self.max {
265 self.max = v
266 }
267 if v < self.min {
268 self.min = v
269 }
270 self.total += v;
271 }
272
273 fn update_string_stat(&mut self, v: &str) {
274 if self.min_string.is_empty() || v < &self.min_string {
275 self.min_string = v.to_owned();
276 }
277 if v > &self.max_string {
278 self.max_string = v.to_owned();
279 }
280 }
281
282 fn merge(&mut self, o: CStat) {
283 if self.col_type != o.col_type {
284 self.col_type = o.col_type
285 }
286 if o.min < self.min {
287 self.min = o.min;
288 }
289 if o.max > self.max {
290 self.max = o.max
291 }
292 if self.min_string.is_empty() || o.min_string < self.min_string {
293 self.min_string = o.min_string
294 }
295 if o.max_string > self.max_string {
296 self.max_string = o.max_string
297 }
298 self.null += o.null;
299 self.total += o.total;
300 self.unique_hashset.extend(o.unique_hashset)
301 }
302
303 fn mean_fmt(&self) -> String {
304 if self.is_string() {
305 "-".to_owned()
306 } else {
307 format!("{:.2}", self.mean)
308 }
309 }
310
311 fn min_fmt(&self) -> String {
312 if self.is_string() {
313 self.min_string.to_owned()
314 } else if self.is_int() {
315 format!("{:.0}", if self.min == f64::MAX { 0.0 } else { self.min })
316 } else {
317 format!("{:.2}", if self.min == f64::MAX { 0.0 } else { self.min })
318 }
319 }
320
321 fn max_fmt(&self) -> String {
322 if self.is_string() {
323 self.max_string.to_owned()
324 } else if self.is_int() {
325 format!("{:.0}", if self.max == f64::MIN { 0.0 } else { self.max })
326 } else {
327 format!("{:.2}", if self.max == f64::MIN { 0.0 } else { self.max })
328 }
329 }
330
331 fn unique_fmt(&self) -> String {
332 if self.is_float() {
333 "-".to_owned()
334 } else {
335 self.unique.to_string()
336 }
337 }
338}