exon_bigwig/zoom_batch_reader/
config.rs

1// Copyright 2023 WHERE TRUE Technologies.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15use std::sync::Arc;
16
17use arrow::{
18    datatypes::{DataType, Field, Schema, SchemaRef},
19    error::Result as ArrowResult,
20};
21use exon_common::{TableSchema, DEFAULT_BATCH_SIZE};
22use noodles::core::Region;
23use object_store::ObjectStore;
24
25pub struct SchemaBuilder {
26    file_fields: Vec<Field>,
27    partition_fields: Vec<Field>,
28}
29
30impl Default for SchemaBuilder {
31    fn default() -> Self {
32        let file_fields = vec![
33            Field::new("name", DataType::Utf8, false),
34            Field::new("start", DataType::Int32, false),
35            Field::new("end", DataType::Int32, false),
36            Field::new("total_items", DataType::Int32, false),
37            Field::new("bases_covered", DataType::Int32, false),
38            Field::new("max_value", DataType::Float64, false),
39            Field::new("min_value", DataType::Float64, false),
40            Field::new("sum_squares", DataType::Float64, false),
41            Field::new("sum", DataType::Float64, false),
42        ];
43
44        Self {
45            file_fields,
46            partition_fields: vec![],
47        }
48    }
49}
50
51impl SchemaBuilder {
52    pub fn new(file_fields: Vec<Field>, partition_fields: Vec<Field>) -> Self {
53        Self {
54            file_fields,
55            partition_fields,
56        }
57    }
58
59    pub fn add_partition_fields(&mut self, fields: Vec<Field>) {
60        self.partition_fields.extend(fields);
61    }
62
63    /// Returns the schema and the projection indexes for the file's schema
64    pub fn build(self) -> TableSchema {
65        let mut fields = self.file_fields.clone();
66        fields.extend_from_slice(&self.partition_fields);
67
68        let schema = Schema::new(fields);
69
70        let projection = (0..self.file_fields.len()).collect::<Vec<_>>();
71
72        TableSchema::new(Arc::new(schema), projection)
73    }
74}
75
/// Configuration for a BigWig datasource.
///
/// Instances are created with `BigWigZoomConfig::new` or
/// `BigWigZoomConfig::new_with_schema` and then adjusted through the
/// consuming `with_*` methods.
#[derive(Debug)]
pub struct BigWigZoomConfig {
    /// The number of records to read at a time.
    ///
    /// Initialised to `DEFAULT_BATCH_SIZE` by `new_with_schema`.
    pub batch_size: usize,

    /// The schema of the BigWig file.
    pub file_schema: SchemaRef,

    /// The object store to use.
    pub object_store: Arc<dyn ObjectStore>,

    /// Any projections to apply to the resulting batches.
    ///
    /// Initialised to `None` by `new_with_schema`.
    pub projection: Option<Vec<usize>>,

    /// The interval to read.
    ///
    /// Initialised to `None` by `new_with_schema`.
    pub interval: Option<Region>,

    /// The reduction to apply.
    ///
    /// Initialised to `400` by `new_with_schema`.
    // NOTE(review): presumably selects which zoom level of the file is read —
    // confirm against the reader that consumes this value.
    pub reduction_level: u32,
}
97
98impl BigWigZoomConfig {
99    /// Create a new BigWig configuration.
100    pub fn new_with_schema(object_store: Arc<dyn ObjectStore>, file_schema: SchemaRef) -> Self {
101        Self {
102            batch_size: DEFAULT_BATCH_SIZE,
103            object_store,
104            file_schema,
105            projection: None,
106            interval: None,
107            reduction_level: 400,
108        }
109    }
110
111    pub fn new(object_store: Arc<dyn ObjectStore>) -> ArrowResult<Self> {
112        let schema = SchemaBuilder::default().build();
113        let file_schema = schema.file_schema()?;
114
115        Ok(Self::new_with_schema(object_store, file_schema))
116    }
117
118    /// Get the reduction level.
119    pub fn reduction_level(&self) -> u32 {
120        self.reduction_level
121    }
122
123    /// Get the interval.
124    pub fn interval(&self) -> Option<&Region> {
125        self.interval.as_ref()
126    }
127
128    /// Set the reduction level.
129    pub fn with_reduction_level(mut self, reduction_level: u32) -> Self {
130        self.reduction_level = reduction_level;
131        self
132    }
133
134    /// Set the interval.
135    pub fn with_interval(mut self, interval: Region) -> Self {
136        self.interval = Some(interval);
137        self
138    }
139
140    /// Set the interval from an optional region.
141    pub fn with_some_interval(mut self, interval: Option<Region>) -> Self {
142        self.interval = interval;
143        self
144    }
145
146    /// Set the batch size.
147    pub fn with_batch_size(mut self, batch_size: usize) -> Self {
148        self.batch_size = batch_size;
149        self
150    }
151
152    /// Set the projection.
153    pub fn with_projection(mut self, projection: Vec<usize>) -> Self {
154        self.projection = Some(projection);
155        self
156    }
157
158    /// Set the projection from an optional vector.
159    pub fn with_some_projection(mut self, projection: Option<Vec<usize>>) -> Self {
160        self.projection = projection;
161        self
162    }
163}