rsv_lib/utils/
column_type.rs

1use super::{
2    cli_result::CliResult, column::Columns, reader::ExcelReader, row_split::CsvRowSplitter,
3    util::is_null,
4};
5use crate::utils::column;
6use calamine::{Data, DataType};
7use rayon::prelude::{IntoParallelIterator, ParallelIterator};
8use std::{
9    error::Error,
10    fmt::Display,
11    fs::File,
12    io::{BufRead, BufReader},
13    path::Path,
14};
15use xlsxwriter::Worksheet;
16
17#[derive(Debug)]
18pub struct ColumnTypes(Vec<CType>);
19
20#[derive(Debug)]
21pub struct CType {
22    pub col_index: usize,
23    pub col_type: ColumnType,
24    pub max_length: usize, // for excel export
25}
26
/// Inferred data type of a column.
///
/// The enum is fieldless, so it is `Copy` and `Eq`: values can be passed around
/// and compared without cloning.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ColumnType {
    /// Every inspected value parsed as an integer.
    Int,
    /// Every inspected value parsed as a number, at least one not as an integer.
    Float,
    /// At least one inspected value was not numeric.
    String,
    /// No value has been inspected yet (the starting state of inference).
    Null,
}
34
35impl ColumnTypes {
36    fn push(&mut self, col_index: usize, col_type: ColumnType, max_length: usize) {
37        self.0.push(CType {
38            col_index,
39            col_type,
40            max_length,
41        })
42    }
43
44    pub fn iter(&self) -> impl Iterator<Item = &CType> {
45        self.0.iter()
46    }
47
48    // parallel guess based on columns
49    pub fn guess_from_csv(
50        path: &Path,
51        sep: char,
52        quote: char,
53        no_header: bool,
54        cols: &column::Columns,
55    ) -> Result<Option<Self>, Box<dyn Error>> {
56        // reader
57        let rdr = BufReader::new(File::open(path)?).lines();
58        let lines = rdr
59            .skip(1 - no_header as usize)
60            .take(5000)
61            .filter_map(|i| i.ok())
62            .collect::<Vec<_>>();
63
64        if lines.is_empty() {
65            return Ok(None);
66        }
67
68        // split
69        let lines = lines
70            .iter()
71            .map(|r| CsvRowSplitter::new(r, sep, quote).collect::<Vec<_>>())
72            .collect::<Vec<_>>();
73
74        let guess = cols
75            .col_vec_or_length_of(lines[0].len())
76            .into_par_iter()
77            .map(|n| (n, parse_col_type_at(n, &lines), max_length_at(n, &lines)))
78            .collect::<Vec<_>>()
79            .iter()
80            .fold(ColumnTypes(vec![]), |mut a, b| {
81                a.push(b.0, b.1.clone(), b.2);
82                a
83            });
84
85        Ok(Some(guess))
86    }
87
88    // sequential guess given that excel is usually small
89    pub fn guess_from_excel(range: &ExcelReader, no_header: bool, cols: &Columns) -> Option<Self> {
90        let lines = range
91            .iter()
92            .skip(1 - no_header as usize)
93            .take(5000)
94            .collect::<Vec<_>>();
95
96        if lines.is_empty() {
97            return None;
98        }
99
100        let mut guess = ColumnTypes(vec![]);
101        for c in cols.col_vec_or_length_of(lines[0].len()) {
102            // max_length is meaningless for excel, so set default to 0
103            guess.push(c, parse_excel_col_type_at(c, &lines), 0)
104        }
105
106        Some(guess)
107    }
108
109    // sequential guess given that io is usually small
110    pub fn guess_from_io(v: &[Vec<&str>], cols: &Columns) -> Self {
111        let v = if v.len() < 5000 { v } else { &v[..5000] };
112
113        let mut guess = ColumnTypes(vec![]);
114        for c in cols.col_vec_or_length_of(v[0].len()) {
115            guess.push(c, parse_col_type_at(c, v), max_length_at(c, v))
116        }
117
118        guess
119    }
120
121    pub fn update_excel_column_width(&self, sheet: &mut Worksheet) -> CliResult {
122        for c in self.iter() {
123            sheet.set_column(
124                c.col_index as u16,
125                c.col_index as u16,
126                c.excel_col_width(),
127                None,
128            )?;
129        }
130
131        Ok(())
132    }
133}
134
135fn parse_col_type_at(n: usize, v: &[Vec<&str>]) -> ColumnType {
136    let mut ctype = ColumnType::Null;
137    for r in v {
138        if ctype.is_string() {
139            break;
140        }
141        let f = r[n];
142        if is_null(f) {
143            continue;
144        }
145        ctype.update(f);
146    }
147
148    ctype
149}
150
151fn parse_excel_col_type_at(n: usize, v: &[&[Data]]) -> ColumnType {
152    let mut ctype = ColumnType::Null;
153    for &r in v {
154        if ctype.is_string() {
155            break;
156        }
157        ctype.update_by_excel_cell(&r[n]);
158    }
159
160    ctype
161}
162
/// Returns the largest byte length of the field at index `n` across the rows in `v`.
///
/// Rows with no field at index `n` (ragged input) are ignored instead of
/// causing an out-of-bounds panic. Returns 0 when no row has such a field.
fn max_length_at(n: usize, v: &[Vec<&str>]) -> usize {
    v.iter()
        .filter_map(|r| r.get(n))
        .map(|f| f.len())
        .max()
        .unwrap_or(0)
}
166
167impl Display for ColumnType {
168    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
169        match *self {
170            ColumnType::Float => f.write_str("float")?,
171            ColumnType::Int => f.write_str("int")?,
172            ColumnType::String => f.write_str("string")?,
173            ColumnType::Null => f.write_str("null")?,
174        }
175
176        Ok(())
177    }
178}
179
180impl ColumnType {
181    pub fn is_string(&self) -> bool {
182        self == &ColumnType::String
183    }
184
185    pub fn is_number(&self) -> bool {
186        self == &ColumnType::Int || self == &ColumnType::Float
187    }
188
189    pub fn update(&mut self, f: &str) {
190        match self {
191            ColumnType::Null => {
192                *self = if f.parse::<i64>().is_ok() {
193                    ColumnType::Int
194                } else if f.parse::<f64>().is_ok() {
195                    ColumnType::Float
196                } else {
197                    ColumnType::String
198                }
199            }
200            ColumnType::Int => {
201                if f.parse::<i64>().is_err() {
202                    *self = if f.parse::<f64>().is_ok() {
203                        ColumnType::Float
204                    } else {
205                        ColumnType::String
206                    }
207                }
208            }
209            ColumnType::Float => {
210                if f.parse::<f64>().is_err() {
211                    *self = ColumnType::String
212                }
213            }
214            _ => {}
215        }
216    }
217
218    pub fn update_by_excel_cell(&mut self, f: &Data) {
219        match self {
220            ColumnType::Null => {
221                *self = if f.is_int() {
222                    ColumnType::Int
223                } else if f.is_float() {
224                    ColumnType::Float
225                } else {
226                    ColumnType::String
227                };
228            }
229            ColumnType::Int => {
230                if !f.is_int() {
231                    *self = if f.is_float() {
232                        ColumnType::Float
233                    } else {
234                        ColumnType::String
235                    }
236                };
237            }
238            ColumnType::Float => {
239                if !f.is_float() {
240                    *self = ColumnType::String;
241                }
242            }
243            _ => {}
244        }
245    }
246}
247
248impl CType {
249    pub fn excel_col_width(&self) -> f64 {
250        let w = self.max_length as f64;
251        // set min-width and max-width
252        w.clamp(6.0, 60.0)
253    }
254}