1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
use std::io::prelude::*;
use crate::dataframe::Data;
use crate::parsers::parse_line;
/// Column type produced by schema inference.
///
/// The variants are listed from most to least dominant: when the values of a
/// column disagree, `String` wins over `Float`, `Float` over `Int`, and `Int`
/// over `Bool`.
#[derive(PartialEq, Eq, Hash, Debug, Clone, Copy)]
pub enum DataType {
    /// Chosen as soon as a column contains a `Data::String` value.
    String,
    /// Chosen when a column contains a `Data::Float` value and no string.
    Float,
    /// Chosen when a column contains a `Data::Int` value and no string or float.
    Int,
    /// Starting type of every column; kept when nothing more dominant is seen.
    Bool,
}
/// Combines the type inferred so far for a column with one more value.
///
/// Precedence, highest first, is `String`, `Float`, `Int`, `Bool`: the result
/// is the highest-ranked type found in either `cur_dominant_type` or the
/// variant of `other_type`. A value that is none of `Data::String`,
/// `Data::Float` or `Data::Int` never raises the current type.
fn get_dominant_data_type(
    cur_dominant_type: &DataType,
    other_type: &Data,
) -> DataType {
    // Check each rank from the top down; the first rank present on either
    // side decides the result.
    if matches!(other_type, Data::String(_)) || *cur_dominant_type == DataType::String {
        return DataType::String;
    }
    if matches!(other_type, Data::Float(_)) || *cur_dominant_type == DataType::Float {
        return DataType::Float;
    }
    if matches!(other_type, Data::Int(_)) || *cur_dominant_type == DataType::Int {
        return DataType::Int;
    }
    DataType::Bool
}
/// Infers one [`DataType`] per column by sampling the start of `reader`.
///
/// At most the first 500 lines are read. Lines for which `parse_line` returns
/// `None` are skipped but still count toward that limit. Only the rows with
/// the largest field count seen in the sample contribute; narrower rows are
/// ignored. Each column starts as `DataType::Bool` and is widened by
/// `get_dominant_data_type` for every contributing row.
///
/// Returns an empty vector when no sampled line yields any fields.
///
/// # Panics
///
/// Panics if reading one of the sampled lines from `reader` fails.
pub fn infer_schema<T>(reader: T) -> Vec<DataType>
where
    T: BufRead,
{
    // Upper bound on how many lines are pulled from `reader`.
    const MAX_SAMPLED_LINES: usize = 500;

    // Field count of the widest row seen so far, and every row of that width.
    let mut curr_length = 0;
    let mut parsed_lines = Vec::with_capacity(MAX_SAMPLED_LINES);

    // `take` ends the iteration without requesting another line, so nothing
    // past the sample is consumed from the reader.
    for line in reader.lines().take(MAX_SAMPLED_LINES) {
        let line = line.expect("failed to read a line while inferring the schema");
        let parsed = match parse_line(line.as_bytes()) {
            Some(parsed) => parsed,
            None => continue,
        };

        let width = parsed.len();
        if width < curr_length {
            // Narrower than the widest row: not part of the sample.
            continue;
        }
        if width > curr_length {
            // A wider row invalidates everything collected so far.
            parsed_lines.clear();
            curr_length = width;
        }
        parsed_lines.push(parsed);
    }

    (0..curr_length)
        .map(|column| {
            let mut data_type = DataType::Bool;
            for row in &parsed_lines {
                data_type = get_dominant_data_type(&data_type, &row[column]);
                // `String` outranks everything, so further rows cannot change it.
                if data_type == DataType::String {
                    break;
                }
            }
            data_type
        })
        .collect()
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Cursor;

    /// Runs `infer_schema` over an in-memory byte buffer.
    fn schema_of(raw: &[u8]) -> Vec<DataType> {
        infer_schema(Cursor::new(raw))
    }

    #[test]
    fn infer_schema_test() {
        // Two rows of equal width; the empty `<>` column is expected to stay `Bool`.
        assert_eq!(
            schema_of(b"<1><hello><>\n<12><1.2><>"),
            vec![DataType::Int, DataType::String, DataType::Bool]
        );

        // Only the row with the most fields should drive the result.
        assert_eq!(
            schema_of(b"<1>\n<hello><0>\n<1.1><0><2>"),
            vec![DataType::Float, DataType::Bool, DataType::Int]
        );

        // Within a column the more dominant type wins across rows.
        assert_eq!(
            schema_of(b"<0><3><3.3><str>\n<3><5.5><r><h>"),
            vec![
                DataType::Int,
                DataType::Float,
                DataType::String,
                DataType::String
            ]
        );
    }
}