holodeck_lib/
sequence_dict.rs1use std::collections::HashMap;
2use std::ops::Index;
3
4use noodles::fasta;
5
/// Metadata for one sequence: its index, its name and its length.
///
/// All fields are private; use [`SequenceMetadata::index`],
/// [`SequenceMetadata::name`] and [`SequenceMetadata::length`] to read them.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct SequenceMetadata {
    /// Position of the sequence. The dictionary constructors in this file
    /// assign it from a zero-based enumeration of the entries.
    index: usize,
    /// Name of the sequence; also the key used for by-name lookups.
    name: String,
    /// Length of the sequence as reported by whatever produced the entry.
    length: usize,
}
16
17impl SequenceMetadata {
18 #[must_use]
20 pub fn new(index: usize, name: String, length: usize) -> Self {
21 Self { index, name, length }
22 }
23
24 #[must_use]
26 pub fn index(&self) -> usize {
27 self.index
28 }
29
30 #[must_use]
32 pub fn name(&self) -> &str {
33 &self.name
34 }
35
36 #[must_use]
38 pub fn length(&self) -> usize {
39 self.length
40 }
41}
42
43#[derive(Debug, Clone)]
48pub struct SequenceDictionary {
49 sequences: Vec<SequenceMetadata>,
51 name_to_index: HashMap<String, usize>,
53}
54
55impl SequenceDictionary {
56 #[must_use]
58 pub fn len(&self) -> usize {
59 self.sequences.len()
60 }
61
62 #[must_use]
64 pub fn is_empty(&self) -> bool {
65 self.sequences.is_empty()
66 }
67
68 #[must_use]
70 pub fn get_by_index(&self, index: usize) -> Option<&SequenceMetadata> {
71 self.sequences.get(index)
72 }
73
74 #[must_use]
76 pub fn get_by_name(&self, name: &str) -> Option<&SequenceMetadata> {
77 self.name_to_index.get(name).map(|&i| &self.sequences[i])
78 }
79
80 pub fn iter(&self) -> impl Iterator<Item = &SequenceMetadata> {
82 self.sequences.iter()
83 }
84
85 #[must_use]
87 pub fn names(&self) -> Vec<&str> {
88 self.sequences.iter().map(|s| s.name.as_str()).collect()
89 }
90
91 #[must_use]
93 pub fn total_length(&self) -> u64 {
94 self.sequences.iter().map(|s| s.length as u64).sum()
95 }
96
97 #[must_use]
104 pub fn from_entries(mut sequences: Vec<SequenceMetadata>) -> Self {
105 for (i, seq) in sequences.iter_mut().enumerate() {
106 seq.index = i;
107 }
108 let name_to_index =
109 sequences.iter().enumerate().map(|(i, s)| (s.name.clone(), i)).collect();
110 Self { sequences, name_to_index }
111 }
112}
113
114impl Index<usize> for SequenceDictionary {
115 type Output = SequenceMetadata;
116
117 fn index(&self, index: usize) -> &SequenceMetadata {
118 &self.sequences[index]
119 }
120}
121
122impl Index<&str> for SequenceDictionary {
123 type Output = SequenceMetadata;
124
125 fn index(&self, name: &str) -> &SequenceMetadata {
126 let i = self.name_to_index[name];
127 &self.sequences[i]
128 }
129}
130
131impl From<&fasta::fai::Index> for SequenceDictionary {
132 fn from(index: &fasta::fai::Index) -> Self {
133 let mut sequences = Vec::new();
134 let mut name_to_index = HashMap::new();
135
136 for (i, record) in index.as_ref().iter().enumerate() {
137 let name = String::from_utf8_lossy(record.name().as_ref()).to_string();
138 #[expect(clippy::cast_possible_truncation, reason = "FASTA index lengths fit in usize")]
139 let length = record.length() as usize;
140 name_to_index.insert(name.clone(), i);
141 sequences.push(SequenceMetadata { index: i, name, length });
142 }
143
144 Self { sequences, name_to_index }
145 }
146}
147
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a dictionary directly from `(name, length)` pairs, assigning
    /// indices in slice order.
    fn make_dict(contigs: &[(&str, usize)]) -> SequenceDictionary {
        let sequences: Vec<SequenceMetadata> = contigs
            .iter()
            .enumerate()
            .map(|(i, &(name, length))| SequenceMetadata {
                index: i,
                name: name.to_string(),
                length,
            })
            .collect();
        let name_to_index = sequences.iter().map(|s| (s.name.clone(), s.index)).collect();
        SequenceDictionary { sequences, name_to_index }
    }

    #[test]
    fn test_metadata_new_and_accessors() {
        let meta = SequenceMetadata::new(3, "chrM".to_string(), 16_569);
        assert_eq!(meta.index(), 3);
        assert_eq!(meta.name(), "chrM");
        assert_eq!(meta.length(), 16_569);
    }

    #[test]
    fn test_len_and_is_empty() {
        let dict = make_dict(&[("chr1", 1000), ("chr2", 2000)]);
        assert_eq!(dict.len(), 2);
        assert!(!dict.is_empty());

        let empty = make_dict(&[]);
        assert!(empty.is_empty());
        assert_eq!(empty.len(), 0);
    }

    #[test]
    fn test_get_by_index() {
        let dict = make_dict(&[("chr1", 1000), ("chr2", 2000)]);
        let meta = dict.get_by_index(0).unwrap();
        assert_eq!(meta.name(), "chr1");
        assert_eq!(meta.length(), 1000);
        assert_eq!(meta.index(), 0);

        assert!(dict.get_by_index(2).is_none());
    }

    #[test]
    fn test_get_by_name() {
        let dict = make_dict(&[("chr1", 1000), ("chr2", 2000)]);
        let meta = dict.get_by_name("chr2").unwrap();
        assert_eq!(meta.index(), 1);
        assert_eq!(meta.length(), 2000);

        assert!(dict.get_by_name("chrZ").is_none());
    }

    #[test]
    fn test_index_by_usize() {
        let dict = make_dict(&[("chr1", 1000), ("chr2", 2000)]);
        assert_eq!(dict[0].name(), "chr1");
        assert_eq!(dict[1].name(), "chr2");
    }

    #[test]
    #[should_panic(expected = "index out of bounds")]
    fn test_index_by_usize_out_of_bounds() {
        let dict = make_dict(&[("chr1", 1000)]);
        let _ = &dict[5];
    }

    #[test]
    fn test_index_by_str() {
        let dict = make_dict(&[("chr1", 1000), ("chr2", 2000)]);
        assert_eq!(dict["chr1"].length(), 1000);
    }

    #[test]
    #[should_panic(expected = "no entry found for key")]
    fn test_index_by_str_unknown() {
        let dict = make_dict(&[("chr1", 1000)]);
        let _ = &dict["nope"];
    }

    #[test]
    fn test_names() {
        let dict = make_dict(&[("chr1", 1000), ("chr2", 2000), ("chrX", 500)]);
        assert_eq!(dict.names(), vec!["chr1", "chr2", "chrX"]);

        let empty = make_dict(&[]);
        assert!(empty.names().is_empty());
    }

    #[test]
    fn test_total_length() {
        let dict = make_dict(&[("chr1", 1000), ("chr2", 2000), ("chrX", 500)]);
        assert_eq!(dict.total_length(), 3500);

        let empty = make_dict(&[]);
        assert_eq!(empty.total_length(), 0);
    }

    #[test]
    fn test_iter() {
        let dict = make_dict(&[("chr1", 1000), ("chr2", 2000)]);
        let names: Vec<&str> = dict.iter().map(SequenceMetadata::name).collect();
        assert_eq!(names, vec!["chr1", "chr2"]);
    }

    /// `from_entries` must ignore the indices supplied by the caller and
    /// renumber entries by their position in the input vector.
    #[test]
    fn test_from_entries_reassigns_indices() {
        let dict = SequenceDictionary::from_entries(vec![
            SequenceMetadata::new(42, "chr2".to_string(), 2000),
            SequenceMetadata::new(42, "chr1".to_string(), 1000),
        ]);

        assert_eq!(dict.len(), 2);
        assert_eq!(dict.names(), vec!["chr2", "chr1"]);
        assert_eq!(dict[0].index(), 0);
        assert_eq!(dict[1].index(), 1);

        // By-name lookups must agree with the renumbered positions.
        assert_eq!(dict.get_by_name("chr2").unwrap().index(), 0);
        assert_eq!(dict["chr1"].index(), 1);
        assert_eq!(dict["chr1"].length(), 1000);
    }

    #[test]
    fn test_from_entries_empty() {
        let dict = SequenceDictionary::from_entries(Vec::new());
        assert!(dict.is_empty());
        assert_eq!(dict.total_length(), 0);
        assert!(dict.get_by_index(0).is_none());
        assert!(dict.get_by_name("chr1").is_none());
    }
}