Skip to main content

exon_bigwig/value_batch_reader/
config.rs

1// Copyright 2023 WHERE TRUE Technologies.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15use std::sync::Arc;
16
17use arrow::datatypes::{DataType, Field, Fields, Schema, SchemaRef};
18use exon_common::{TableSchema, DEFAULT_BATCH_SIZE};
19use noodles::core::Region;
20use object_store::ObjectStore;
21
22pub struct SchemaBuilder {
23    file_fields: Vec<Field>,
24    partition_fields: Vec<Field>,
25}
26
27impl Default for SchemaBuilder {
28    fn default() -> Self {
29        let file_fields = vec![
30            Field::new("name", DataType::Utf8, false),
31            Field::new("start", DataType::Int32, false),
32            Field::new("end", DataType::Int32, false),
33            Field::new("value", DataType::Float32, false),
34        ];
35
36        Self {
37            file_fields,
38            partition_fields: vec![],
39        }
40    }
41}
42
43impl SchemaBuilder {
44    pub fn new(file_fields: Vec<Field>, partition_fields: Vec<Field>) -> Self {
45        Self {
46            file_fields,
47            partition_fields,
48        }
49    }
50
51    pub fn add_partition_fields(&mut self, fields: Vec<Field>) {
52        self.partition_fields.extend(fields);
53    }
54
55    /// Returns the schema and the projection indexes for the file's schema
56    pub fn build(self) -> TableSchema {
57        let mut fields = self.file_fields.clone();
58        fields.extend_from_slice(&self.partition_fields);
59
60        let schema = Schema::new(fields);
61
62        let projection = (0..self.file_fields.len()).collect::<Vec<_>>();
63
64        TableSchema::new(Arc::new(schema), projection)
65    }
66}
67
68#[derive(Debug)]
69pub enum ValueReadType {
70    Interval(Region),
71    Scan,
72}
73
74/// Configuration for a BigWig datasource.
75#[derive(Debug)]
76pub struct BigWigValueConfig {
77    /// The number of records to read at a time.
78    pub batch_size: usize,
79
80    /// The schema of the BigWig file.
81    pub file_schema: SchemaRef,
82
83    /// The object store to use.
84    pub object_store: Arc<dyn ObjectStore>,
85
86    /// Any projections to apply to the resulting batches.
87    pub projection: Option<Vec<usize>>,
88
89    /// The type of read to perform.
90    pub read_type: ValueReadType,
91}
92
93impl BigWigValueConfig {
94    /// Create a new BigWig configuration.
95    pub fn new(object_store: Arc<dyn ObjectStore>) -> Self {
96        let file_schema = Schema::new(Fields::from_iter(vec![
97            Field::new("chrom", DataType::Utf8, false),
98            Field::new("start", DataType::Int32, false),
99            Field::new("end", DataType::Int32, false),
100            Field::new("value", DataType::Float32, false),
101        ]));
102
103        Self {
104            batch_size: DEFAULT_BATCH_SIZE,
105            object_store,
106            file_schema: Arc::new(file_schema),
107            projection: None,
108            read_type: ValueReadType::Scan,
109        }
110    }
111
112    /// Create a new BigWig configuration.
113    pub fn new_with_schema(object_store: Arc<dyn ObjectStore>, file_schema: SchemaRef) -> Self {
114        Self {
115            batch_size: DEFAULT_BATCH_SIZE,
116            object_store,
117            file_schema,
118            projection: None,
119            read_type: ValueReadType::Scan,
120        }
121    }
122
123    /// Set the read type to interval.
124    pub fn with_some_interval(mut self, interval: Option<Region>) -> Self {
125        if let Some(interval) = interval {
126            self.read_type = ValueReadType::Interval(interval);
127        } else {
128            self.read_type = ValueReadType::Scan;
129        }
130
131        self
132    }
133
134    /// Set the batch size.
135    pub fn with_batch_size(mut self, batch_size: usize) -> Self {
136        self.batch_size = batch_size;
137        self
138    }
139
140    /// Set the projection.
141    pub fn with_projection(mut self, projection: Vec<usize>) -> Self {
142        self.projection = Some(projection);
143        self
144    }
145
146    /// Set the projection from an optional vector.
147    pub fn with_some_projection(mut self, projection: Option<Vec<usize>>) -> Self {
148        self.projection = projection;
149        self
150    }
151}