Skip to main content

holodeck_lib/
sequence_dict.rs

1use std::collections::HashMap;
2use std::ops::Index;
3
4use noodles::fasta;
5
/// Describes one reference sequence (contig): its position, its name, and
/// its size.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct SequenceMetadata {
    /// Zero-based position of the sequence.
    index: usize,
    /// Name of the contig.
    name: String,
    /// Number of bases in the contig.
    length: usize,
}

impl SequenceMetadata {
    /// Assemble a `SequenceMetadata` from an index, a name, and a length.
    ///
    /// The values are stored as given; no validation is performed.
    #[must_use]
    pub fn new(index: usize, name: String, length: usize) -> Self {
        Self {
            index,
            name,
            length,
        }
    }

    /// Zero-based index of this sequence.
    #[must_use]
    pub fn index(&self) -> usize {
        let Self { index, .. } = self;
        *index
    }

    /// Name of this sequence, borrowed from the metadata.
    #[must_use]
    pub fn name(&self) -> &str {
        self.name.as_str()
    }

    /// Length of this sequence, in bases.
    #[must_use]
    pub fn length(&self) -> usize {
        let Self { length, .. } = self;
        *length
    }
}
42
43/// A lookup table mapping reference sequence names to their indices and lengths.
44///
45/// Constructed from a FASTA index, this type consolidates all name-index-length
46/// mapping into a single shared structure.
47#[derive(Debug, Clone)]
48pub struct SequenceDictionary {
49    /// Sequences in index order.
50    sequences: Vec<SequenceMetadata>,
51    /// Name to index mapping.
52    name_to_index: HashMap<String, usize>,
53}
54
55impl SequenceDictionary {
56    /// Return the number of sequences.
57    #[must_use]
58    pub fn len(&self) -> usize {
59        self.sequences.len()
60    }
61
62    /// Return `true` if the dictionary contains no sequences.
63    #[must_use]
64    pub fn is_empty(&self) -> bool {
65        self.sequences.is_empty()
66    }
67
68    /// Look up a sequence by its 0-based index.
69    #[must_use]
70    pub fn get_by_index(&self, index: usize) -> Option<&SequenceMetadata> {
71        self.sequences.get(index)
72    }
73
74    /// Look up a sequence by name.
75    #[must_use]
76    pub fn get_by_name(&self, name: &str) -> Option<&SequenceMetadata> {
77        self.name_to_index.get(name).map(|&i| &self.sequences[i])
78    }
79
80    /// Iterate over all sequences in index order.
81    pub fn iter(&self) -> impl Iterator<Item = &SequenceMetadata> {
82        self.sequences.iter()
83    }
84
85    /// Return contig names in index order.
86    #[must_use]
87    pub fn names(&self) -> Vec<&str> {
88        self.sequences.iter().map(|s| s.name.as_str()).collect()
89    }
90
91    /// Return the total length of all sequences combined.
92    #[must_use]
93    pub fn total_length(&self) -> u64 {
94        self.sequences.iter().map(|s| s.length as u64).sum()
95    }
96
97    /// Build a dictionary from a pre-constructed list of entries.
98    ///
99    /// Useful for testing and for cases where the dictionary is constructed
100    /// from sources other than a FASTA index. Entries are re-indexed by their
101    /// position in the vector (the `index` field of each `SequenceMetadata`
102    /// is ignored and replaced).
103    #[must_use]
104    pub fn from_entries(mut sequences: Vec<SequenceMetadata>) -> Self {
105        for (i, seq) in sequences.iter_mut().enumerate() {
106            seq.index = i;
107        }
108        let name_to_index =
109            sequences.iter().enumerate().map(|(i, s)| (s.name.clone(), i)).collect();
110        Self { sequences, name_to_index }
111    }
112}
113
114impl Index<usize> for SequenceDictionary {
115    type Output = SequenceMetadata;
116
117    fn index(&self, index: usize) -> &SequenceMetadata {
118        &self.sequences[index]
119    }
120}
121
122impl Index<&str> for SequenceDictionary {
123    type Output = SequenceMetadata;
124
125    fn index(&self, name: &str) -> &SequenceMetadata {
126        let i = self.name_to_index[name];
127        &self.sequences[i]
128    }
129}
130
131impl From<&fasta::fai::Index> for SequenceDictionary {
132    fn from(index: &fasta::fai::Index) -> Self {
133        let mut sequences = Vec::new();
134        let mut name_to_index = HashMap::new();
135
136        for (i, record) in index.as_ref().iter().enumerate() {
137            let name = String::from_utf8_lossy(record.name().as_ref()).to_string();
138            #[expect(clippy::cast_possible_truncation, reason = "FASTA index lengths fit in usize")]
139            let length = record.length() as usize;
140            name_to_index.insert(name.clone(), i);
141            sequences.push(SequenceMetadata { index: i, name, length });
142        }
143
144        Self { sequences, name_to_index }
145    }
146}
147
#[cfg(test)]
mod tests {
    use super::*;

    /// Build a dictionary from a list of (name, length) pairs for testing.
    ///
    /// Goes through the public constructors (`SequenceMetadata::new` and
    /// `SequenceDictionary::from_entries`) so every test also exercises them
    /// instead of hand-filling private fields.
    fn make_dict(contigs: &[(&str, usize)]) -> SequenceDictionary {
        let entries = contigs
            .iter()
            .enumerate()
            .map(|(i, &(name, length))| SequenceMetadata::new(i, name.to_string(), length))
            .collect();
        SequenceDictionary::from_entries(entries)
    }

    #[test]
    fn test_len_and_is_empty() {
        let dict = make_dict(&[("chr1", 1000), ("chr2", 2000)]);
        assert_eq!(dict.len(), 2);
        assert!(!dict.is_empty());

        let empty = make_dict(&[]);
        assert!(empty.is_empty());
        assert_eq!(empty.len(), 0);
    }

    #[test]
    fn test_get_by_index() {
        let dict = make_dict(&[("chr1", 1000), ("chr2", 2000)]);
        let meta = dict.get_by_index(0).unwrap();
        assert_eq!(meta.name(), "chr1");
        assert_eq!(meta.length(), 1000);
        assert_eq!(meta.index(), 0);

        assert!(dict.get_by_index(2).is_none());
    }

    #[test]
    fn test_get_by_name() {
        let dict = make_dict(&[("chr1", 1000), ("chr2", 2000)]);
        let meta = dict.get_by_name("chr2").unwrap();
        assert_eq!(meta.index(), 1);
        assert_eq!(meta.length(), 2000);

        assert!(dict.get_by_name("chrZ").is_none());
    }

    #[test]
    fn test_index_by_usize() {
        let dict = make_dict(&[("chr1", 1000), ("chr2", 2000)]);
        assert_eq!(dict[0].name(), "chr1");
        assert_eq!(dict[1].name(), "chr2");
    }

    #[test]
    #[should_panic(expected = "index out of bounds")]
    fn test_index_by_usize_out_of_bounds() {
        let dict = make_dict(&[("chr1", 1000)]);
        let _ = &dict[5];
    }

    #[test]
    fn test_index_by_str() {
        let dict = make_dict(&[("chr1", 1000), ("chr2", 2000)]);
        assert_eq!(dict["chr1"].length(), 1000);
    }

    #[test]
    #[should_panic(expected = "no entry found for key")]
    fn test_index_by_str_unknown() {
        let dict = make_dict(&[("chr1", 1000)]);
        let _ = &dict["nope"];
    }

    #[test]
    fn test_names() {
        let dict = make_dict(&[("chr1", 1000), ("chr2", 2000), ("chrX", 500)]);
        assert_eq!(dict.names(), vec!["chr1", "chr2", "chrX"]);
    }

    #[test]
    fn test_total_length() {
        let dict = make_dict(&[("chr1", 1000), ("chr2", 2000), ("chrX", 500)]);
        assert_eq!(dict.total_length(), 3500);
    }

    #[test]
    fn test_iter() {
        let dict = make_dict(&[("chr1", 1000), ("chr2", 2000)]);
        let names: Vec<&str> = dict.iter().map(SequenceMetadata::name).collect();
        assert_eq!(names, vec!["chr1", "chr2"]);
    }

    #[test]
    fn test_metadata_new_and_accessors() {
        let meta = SequenceMetadata::new(4, "chrM".to_string(), 16_569);
        assert_eq!(meta.index(), 4);
        assert_eq!(meta.name(), "chrM");
        assert_eq!(meta.length(), 16_569);
    }

    #[test]
    fn test_from_entries_reindexes_by_position() {
        // The indices passed in are deliberately wrong: `from_entries` must
        // replace them with each entry's position in the vector.
        let dict = SequenceDictionary::from_entries(vec![
            SequenceMetadata::new(7, "chrB".to_string(), 20),
            SequenceMetadata::new(3, "chrA".to_string(), 10),
        ]);
        let indices: Vec<usize> = dict.iter().map(SequenceMetadata::index).collect();
        assert_eq!(indices, vec![0, 1]);
        assert_eq!(dict.names(), vec!["chrB", "chrA"]);
        assert_eq!(dict["chrA"].index(), 1);
        assert_eq!(dict.get_by_name("chrB").unwrap().length(), 20);
    }

    #[test]
    fn test_from_entries_empty() {
        let dict = SequenceDictionary::from_entries(Vec::new());
        assert!(dict.is_empty());
        assert_eq!(dict.total_length(), 0);
        assert!(dict.names().is_empty());
    }
}