rsv_lib/utils/
column_type.rs1use super::{
2 cli_result::CliResult, column::Columns, reader::ExcelReader, row_split::CsvRowSplitter,
3 util::is_null,
4};
5use crate::utils::column;
6use calamine::{Data, DataType};
7use rayon::prelude::{IntoParallelIterator, ParallelIterator};
8use std::{
9 error::Error,
10 fmt::Display,
11 fs::File,
12 io::{BufRead, BufReader},
13 path::Path,
14};
15use xlsxwriter::Worksheet;
16
17#[derive(Debug)]
18pub struct ColumnTypes(Vec<CType>);
19
20#[derive(Debug)]
21pub struct CType {
22 pub col_index: usize,
23 pub col_type: ColumnType,
24 pub max_length: usize, }
26
/// Value type guessed for a column.
///
/// A guess starts at [`ColumnType::Null`] and is only ever widened:
/// `Null` -> `Int` -> `Float` -> `String`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ColumnType {
    /// Every classified value so far was an integer.
    Int,
    /// Every classified value so far was numeric, and at least one was not an integer.
    Float,
    /// At least one classified value was not numeric.
    String,
    /// No value has been classified yet.
    Null,
}
34
35impl ColumnTypes {
36 fn push(&mut self, col_index: usize, col_type: ColumnType, max_length: usize) {
37 self.0.push(CType {
38 col_index,
39 col_type,
40 max_length,
41 })
42 }
43
44 pub fn iter(&self) -> impl Iterator<Item = &CType> {
45 self.0.iter()
46 }
47
48 pub fn guess_from_csv(
50 path: &Path,
51 sep: char,
52 quote: char,
53 no_header: bool,
54 cols: &column::Columns,
55 ) -> Result<Option<Self>, Box<dyn Error>> {
56 let rdr = BufReader::new(File::open(path)?).lines();
58 let lines = rdr
59 .skip(1 - no_header as usize)
60 .take(5000)
61 .filter_map(|i| i.ok())
62 .collect::<Vec<_>>();
63
64 if lines.is_empty() {
65 return Ok(None);
66 }
67
68 let lines = lines
70 .iter()
71 .map(|r| CsvRowSplitter::new(r, sep, quote).collect::<Vec<_>>())
72 .collect::<Vec<_>>();
73
74 let guess = cols
75 .col_vec_or_length_of(lines[0].len())
76 .into_par_iter()
77 .map(|n| (n, parse_col_type_at(n, &lines), max_length_at(n, &lines)))
78 .collect::<Vec<_>>()
79 .iter()
80 .fold(ColumnTypes(vec![]), |mut a, b| {
81 a.push(b.0, b.1.clone(), b.2);
82 a
83 });
84
85 Ok(Some(guess))
86 }
87
88 pub fn guess_from_excel(range: &ExcelReader, no_header: bool, cols: &Columns) -> Option<Self> {
90 let lines = range
91 .iter()
92 .skip(1 - no_header as usize)
93 .take(5000)
94 .collect::<Vec<_>>();
95
96 if lines.is_empty() {
97 return None;
98 }
99
100 let mut guess = ColumnTypes(vec![]);
101 for c in cols.col_vec_or_length_of(lines[0].len()) {
102 guess.push(c, parse_excel_col_type_at(c, &lines), 0)
104 }
105
106 Some(guess)
107 }
108
109 pub fn guess_from_io(v: &[Vec<&str>], cols: &Columns) -> Self {
111 let v = if v.len() < 5000 { v } else { &v[..5000] };
112
113 let mut guess = ColumnTypes(vec![]);
114 for c in cols.col_vec_or_length_of(v[0].len()) {
115 guess.push(c, parse_col_type_at(c, v), max_length_at(c, v))
116 }
117
118 guess
119 }
120
121 pub fn update_excel_column_width(&self, sheet: &mut Worksheet) -> CliResult {
122 for c in self.iter() {
123 sheet.set_column(
124 c.col_index as u16,
125 c.col_index as u16,
126 c.excel_col_width(),
127 None,
128 )?;
129 }
130
131 Ok(())
132 }
133}
134
135fn parse_col_type_at(n: usize, v: &[Vec<&str>]) -> ColumnType {
136 let mut ctype = ColumnType::Null;
137 for r in v {
138 if ctype.is_string() {
139 break;
140 }
141 let f = r[n];
142 if is_null(f) {
143 continue;
144 }
145 ctype.update(f);
146 }
147
148 ctype
149}
150
151fn parse_excel_col_type_at(n: usize, v: &[&[Data]]) -> ColumnType {
152 let mut ctype = ColumnType::Null;
153 for &r in v {
154 if ctype.is_string() {
155 break;
156 }
157 ctype.update_by_excel_cell(&r[n]);
158 }
159
160 ctype
161}
162
/// Returns the byte length (`str::len`) of the longest field in column `n`.
///
/// Rows that have no field at index `n` are ignored. Returns `0` when `v` is
/// empty or no row reaches column `n`.
fn max_length_at(n: usize, v: &[Vec<&str>]) -> usize {
    v.iter()
        .filter_map(|row| row.get(n))
        .map(|field| field.len())
        .max()
        .unwrap_or(0)
}
166
167impl Display for ColumnType {
168 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
169 match *self {
170 ColumnType::Float => f.write_str("float")?,
171 ColumnType::Int => f.write_str("int")?,
172 ColumnType::String => f.write_str("string")?,
173 ColumnType::Null => f.write_str("null")?,
174 }
175
176 Ok(())
177 }
178}
179
180impl ColumnType {
181 pub fn is_string(&self) -> bool {
182 self == &ColumnType::String
183 }
184
185 pub fn is_number(&self) -> bool {
186 self == &ColumnType::Int || self == &ColumnType::Float
187 }
188
189 pub fn update(&mut self, f: &str) {
190 match self {
191 ColumnType::Null => {
192 *self = if f.parse::<i64>().is_ok() {
193 ColumnType::Int
194 } else if f.parse::<f64>().is_ok() {
195 ColumnType::Float
196 } else {
197 ColumnType::String
198 }
199 }
200 ColumnType::Int => {
201 if f.parse::<i64>().is_err() {
202 *self = if f.parse::<f64>().is_ok() {
203 ColumnType::Float
204 } else {
205 ColumnType::String
206 }
207 }
208 }
209 ColumnType::Float => {
210 if f.parse::<f64>().is_err() {
211 *self = ColumnType::String
212 }
213 }
214 _ => {}
215 }
216 }
217
218 pub fn update_by_excel_cell(&mut self, f: &Data) {
219 match self {
220 ColumnType::Null => {
221 *self = if f.is_int() {
222 ColumnType::Int
223 } else if f.is_float() {
224 ColumnType::Float
225 } else {
226 ColumnType::String
227 };
228 }
229 ColumnType::Int => {
230 if !f.is_int() {
231 *self = if f.is_float() {
232 ColumnType::Float
233 } else {
234 ColumnType::String
235 }
236 };
237 }
238 ColumnType::Float => {
239 if !f.is_float() {
240 *self = ColumnType::String;
241 }
242 }
243 _ => {}
244 }
245 }
246}
247
248impl CType {
249 pub fn excel_col_width(&self) -> f64 {
250 let w = self.max_length as f64;
251 w.clamp(6.0, 60.0)
253 }
254}