use std::fs::File;
use std::sync::Arc;

use arrow::csv;
use arrow::datatypes::Schema;
use arrow::record_batch::RecordBatch;
use serde::{Deserialize, Serialize};

use super::error::Result;
/// A pull-based source of record batches: callers repeatedly ask for the
/// next batch until the source is exhausted.
pub trait DataSource {
    /// The schema shared by every batch this source produces.
    fn schema(&self) -> &Arc<Schema>;

    /// Returns the next batch, or `None` once the source is exhausted.
    fn next(&mut self) -> Result<Option<RecordBatch>>;
}
/// A `DataSource` backed by a CSV file.
pub struct CsvDataSource {
    schema: Arc<Schema>,
    reader: csv::Reader<File>,
}
impl CsvDataSource {
    /// Opens `filename` and prepares a reader that yields batches of
    /// `batch_size` rows. Panics if the file cannot be opened.
    pub fn new(filename: &str, schema: Arc<Schema>, batch_size: usize) -> Self {
        let file = File::open(filename).expect("failed to open CSV file");
        // Arguments: input, schema, has_header, batch_size, projection.
        let reader = csv::Reader::new(file, schema.clone(), true, batch_size, None);
        Self { schema, reader }
    }

    /// Wraps an already-configured CSV reader.
    pub fn from_reader(schema: Arc<Schema>, reader: csv::Reader<File>) -> Self {
        Self { schema, reader }
    }
}
impl DataSource for CsvDataSource {
    fn schema(&self) -> &Arc<Schema> {
        &self.schema
    }

    fn next(&mut self) -> Result<Option<RecordBatch>> {
        // Delegate to the arrow CSV reader; `?` converts its error type
        // into this crate's error via the `From` impl behind `Result`.
        Ok(self.reader.next()?)
    }
}
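
// A minimal usage sketch, not part of the original module: it assumes a file
// at the hypothetical path "data/example.csv" whose layout matches `schema`,
// and drains the source one batch at a time to count rows.
#[allow(dead_code)]
fn count_rows(schema: Arc<Schema>) -> Result<usize> {
    let mut source = CsvDataSource::new("data/example.csv", schema, 1024);
    let mut rows = 0;
    while let Some(batch) = source.next()? {
        rows += batch.num_rows();
    }
    Ok(rows)
}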
/// Serializable description of a data source, e.g. for shipping part of a
/// query plan across process boundaries. `Arc<Schema>` is used instead of
/// `Rc<Schema>` to match the rest of the module and stay `Send`; note that
/// serde only provides impls for `Arc` when its `rc` feature is enabled.
#[derive(Serialize, Deserialize, Clone)]
pub enum DataSourceMeta {
    CsvFile {
        filename: String,
        schema: Arc<Schema>,
        has_header: bool,
        projection: Option<Vec<usize>>,
    },
    ParquetFile {
        filename: String,
        schema: Arc<Schema>,
        projection: Option<Vec<usize>>,
    },
}
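
// A sketch of an assumed helper, not part of the original module: it shows
// how a deserialized `DataSourceMeta` might be turned back into a concrete
// `DataSource`. `has_header` and `projection` are dropped here because
// `CsvDataSource::new` hard-codes them; a fuller version would thread them
// through. The Parquet branch is left unimplemented since this module only
// defines a CSV source.
#[allow(dead_code)]
pub fn open_data_source(meta: &DataSourceMeta, batch_size: usize) -> Box<dyn DataSource> {
    match meta {
        DataSourceMeta::CsvFile { filename, schema, .. } => {
            Box::new(CsvDataSource::new(filename, schema.clone(), batch_size))
        }
        DataSourceMeta::ParquetFile { filename, .. } => {
            unimplemented!("no Parquet source in this module: {}", filename)
        }
    }
}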