1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
use crate::default_typer::DefaultTyper;
use crate::errors::Result;
use crate::file;
use crate::raw_parser::{read_file_column_names, read_file_data, ParsingOptions};
use crate::schema_inference::{infer_schema, infer_separator};
use crate::typer::Typer;
use std::path::Path;
/// A parsed, column-oriented dataset.
///
/// The schema tag type and the typed column storage are both supplied by the
/// [`Typer`] implementation `T`.
#[derive(Clone, Debug)]
pub struct Dataset<T>
where
    T: Typer,
{
    /// Column names, or `None` when none were read from the file.
    pub column_names: Option<Vec<String>>,
    /// Type tags produced by schema inference.
    pub schema: Vec<T::TypeTag>,
    /// Typed column data produced by the parser.
    pub data: Vec<T::TypedColumn>,
    /// Number of lines in the file, not counting the header line when column
    /// names were read.
    pub row_count: usize,
}
/// Reads the file at `file_path` into a [`Dataset`] typed by a default
/// [`DefaultTyper`], using the default [`ReadingOptions`].
///
/// # Errors
///
/// Propagates any error returned by [`Dataset::read_file`].
pub async fn read_file(file_path: impl AsRef<Path> + Clone) -> Result<Dataset<DefaultTyper>> {
    let default_typer = DefaultTyper::default();
    Dataset::read_file(file_path, ReadingOptions::default(), &default_typer).await
}
impl<T: Typer> Dataset<T> {
    /// Reads the file at `file_path` and builds a [`Dataset`] from it.
    ///
    /// The steps run in this order:
    /// 1. count the lines of the file;
    /// 2. resolve how many lines schema inference may inspect;
    /// 3. take the configured separator, or infer it from the file;
    /// 4. read the column names when `options.read_header` is set;
    /// 5. infer the schema;
    /// 6. parse the data using `typer`.
    ///
    /// `file_path` is cloned for each helper call, hence the `Clone` bound.
    ///
    /// # Errors
    ///
    /// Returns the first error reported by line counting, separator
    /// inference, header reading, schema inference or data parsing.
    pub async fn read_file(
        file_path: impl AsRef<Path> + Clone,
        options: ReadingOptions,
        typer: &T,
    ) -> Result<Dataset<T>> {
        let line_count = file::count_lines(file_path.clone()).await?;

        let schema_inference_line_count = match options.schema_inference_depth {
            SchemaInferenceDepth::Lines(n) => n,
            // The fraction is capped at 1.0 (the whole file) and the product is
            // rounded up, so any positive fraction of a non-empty file yields
            // at least one line.
            SchemaInferenceDepth::Percentage(x) => {
                (x.min(1.0) * line_count as f32).ceil() as usize
            }
        };

        let separator = match options.separator {
            Separator::Value(value) => value,
            Separator::Infer => infer_separator(file_path.clone()).await?,
        };

        let parsing_options = ParsingOptions {
            text_quote: options.text_quote,
            text_quote_escape: options.text_quote_escape,
            separator,
        };

        let column_names = if options.read_header {
            read_file_column_names(file_path.clone(), &parsing_options).await?
        } else {
            None
        };

        // The first line is treated as a header only if names were actually read.
        let skip_first_line = column_names.is_some();

        // `saturating_sub` keeps this from underflowing (panic in debug builds,
        // wrap-around to `usize::MAX` in release) if a header was read while
        // the reported line count is zero.
        let row_count = if skip_first_line {
            line_count.saturating_sub(1)
        } else {
            line_count
        };

        // NOTE(review): schema inference receives a fresh `T::default()` rather
        // than the caller-supplied `typer` — confirm this is intentional.
        let schema = infer_schema(
            file_path.clone(),
            skip_first_line,
            schema_inference_line_count,
            &parsing_options,
            T::default(),
        )
        .await?;

        // NOTE(review): the raw `line_count` (header line included) is passed
        // here, not `row_count` — confirm against `read_file_data`'s contract.
        // This is the last use of `file_path`, so it is moved instead of cloned.
        let data = read_file_data(
            file_path,
            &schema,
            &parsing_options,
            line_count,
            skip_first_line,
            typer,
        )
        .await?;

        Ok(Dataset {
            column_names,
            schema,
            row_count,
            data,
        })
    }
}
/// A [`Dataset`] whose schema and columns are typed by [`DefaultTyper`].
pub type TypedDataset = Dataset<DefaultTyper>;
/// How the field separator of the input file is determined.
#[derive(Debug, Clone)]
pub enum Separator {
    /// Use exactly this separator string.
    Value(String),
    /// Infer the separator from the file being read.
    Infer,
}
/// How much of the file schema inference is allowed to inspect.
#[derive(Debug, Clone, Copy)]
pub enum SchemaInferenceDepth {
    /// A fraction of the file's line count (e.g. `0.01` for 1%); values above
    /// `1.0` are treated as `1.0` when the depth is resolved.
    Percentage(f32),
    /// A fixed number of lines.
    Lines(usize),
}
/// Settings that control how [`Dataset::read_file`] reads a file.
#[derive(Debug, Clone)]
pub struct ReadingOptions {
    /// Whether to try to read column names from the file.
    pub read_header: bool,
    /// How many lines schema inference may inspect.
    pub schema_inference_depth: SchemaInferenceDepth,
    /// The field separator, either given explicitly or to be inferred.
    pub separator: Separator,
    /// Text-quote string handed to the parser.
    pub text_quote: String,
    /// Text-quote escape string handed to the parser.
    pub text_quote_escape: String,
}
impl Default for ReadingOptions {
    /// Default settings: read the header, infer the schema from a `0.01`
    /// fraction of the lines, infer the separator, quote text with `"` and
    /// escape quotes with `\`.
    fn default() -> Self {
        Self {
            separator: Separator::Infer,
            schema_inference_depth: SchemaInferenceDepth::Percentage(0.01),
            read_header: true,
            text_quote: String::from("\""),
            text_quote_escape: String::from("\\"),
        }
    }
}