exon_sdf/config.rs
1// Copyright 2024 WHERE TRUE Technologies.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15use std::sync::Arc;
16
17use arrow::datatypes::SchemaRef;
18use object_store::ObjectStore;
19
20/// Configuration for a SDF data source.
21#[derive(Debug)]
22pub struct SDFConfig {
23 /// The number of rows to read at a time.
24 pub batch_size: usize,
25
26 /// The schema of the file.
27 pub file_schema: SchemaRef,
28
29 /// The object store to use.
30 pub object_store: Arc<dyn ObjectStore>,
31
32 /// Any projections to apply to the resulting batches.
33 pub projection: Option<Vec<usize>>,
34
35 /// The limit of rows to read.
36 pub limit: Option<usize>,
37}
38
39impl SDFConfig {
40 pub fn new(
41 object_store: Arc<dyn ObjectStore>,
42 batch_size: usize,
43 file_schema: SchemaRef,
44 ) -> Self {
45 SDFConfig {
46 object_store,
47 batch_size,
48 file_schema,
49 projection: None,
50 limit: None,
51 }
52 }
53
54 /// Get the effective batch size, which is the minimum of the batch size
55 /// and the limit.
56 pub fn effective_batch_size(&self) -> usize {
57 self.limit
58 .map_or(self.batch_size, |limit| self.batch_size.min(limit))
59 }
60
61 /// Set the limit.
62 pub fn with_limit_opt(mut self, limit: Option<usize>) -> Self {
63 self.limit = limit;
64 self
65 }
66
67 /// Get the projection.
68 pub fn projection(&self) -> Vec<usize> {
69 self.projection
70 .clone()
71 .unwrap_or_else(|| (0..self.file_schema.fields().len()).collect())
72 }
73
74 /// Get the projected schema.
75 pub fn projected_schema(&self) -> arrow::error::Result<SchemaRef> {
76 let schema = self.file_schema.project(&self.projection())?;
77
78 Ok(Arc::new(schema))
79 }
80
81 /// Create a new SDF configuration with a given projection.
82 pub fn with_projection(mut self, projection: Vec<usize>) -> Self {
83 // Only include fields that are in the file schema.
84 // TODO: make this cleaner, i.e. projection should probably come
85 // pre-filtered.
86 let file_projection = projection
87 .iter()
88 .filter(|f| **f < self.file_schema.fields().len())
89 .cloned()
90 .collect::<Vec<_>>();
91
92 self.projection = Some(file_projection);
93 self
94 }
95}