exon_sdf/
config.rs

1// Copyright 2024 WHERE TRUE Technologies.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15use std::sync::Arc;
16
17use arrow::datatypes::SchemaRef;
18use object_store::ObjectStore;
19
/// Configuration for an SDF data source.
///
/// Groups together the object store, the file schema, the batch size, and
/// the optional projection and row limit used when reading.
#[derive(Debug)]
pub struct SDFConfig {
    /// The number of rows to read at a time. `effective_batch_size` caps
    /// this value at `limit` when a limit is set.
    pub batch_size: usize,

    /// The schema of the file. Projection indices refer to positions in
    /// this schema's fields.
    pub file_schema: SchemaRef,

    /// The object store to use.
    pub object_store: Arc<dyn ObjectStore>,

    /// Any projections to apply to the resulting batches, expressed as
    /// indices into the fields of `file_schema`. When this is `None`,
    /// `projection()` returns the indices of every field.
    pub projection: Option<Vec<usize>>,

    /// The limit of rows to read, or `None` when no limit is set.
    pub limit: Option<usize>,
}
38
39impl SDFConfig {
40    pub fn new(
41        object_store: Arc<dyn ObjectStore>,
42        batch_size: usize,
43        file_schema: SchemaRef,
44    ) -> Self {
45        SDFConfig {
46            object_store,
47            batch_size,
48            file_schema,
49            projection: None,
50            limit: None,
51        }
52    }
53
54    /// Get the effective batch size, which is the minimum of the batch size
55    /// and the limit.
56    pub fn effective_batch_size(&self) -> usize {
57        self.limit
58            .map_or(self.batch_size, |limit| self.batch_size.min(limit))
59    }
60
61    /// Set the limit.
62    pub fn with_limit_opt(mut self, limit: Option<usize>) -> Self {
63        self.limit = limit;
64        self
65    }
66
67    /// Get the projection.
68    pub fn projection(&self) -> Vec<usize> {
69        self.projection
70            .clone()
71            .unwrap_or_else(|| (0..self.file_schema.fields().len()).collect())
72    }
73
74    /// Get the projected schema.
75    pub fn projected_schema(&self) -> arrow::error::Result<SchemaRef> {
76        let schema = self.file_schema.project(&self.projection())?;
77
78        Ok(Arc::new(schema))
79    }
80
81    /// Create a new SDF configuration with a given projection.
82    pub fn with_projection(mut self, projection: Vec<usize>) -> Self {
83        // Only include fields that are in the file schema.
84        // TODO: make this cleaner, i.e. projection should probably come
85        // pre-filtered.
86        let file_projection = projection
87            .iter()
88            .filter(|f| **f < self.file_schema.fields().len())
89            .cloned()
90            .collect::<Vec<_>>();
91
92        self.projection = Some(file_projection);
93        self
94    }
95}