read_structure/
read_structure.rs

1//! Read Structures
2//!
3//! Type [`ReadStructure`] describes the structure of a given read.  A read
4//! contains one or more read segments. A read segment describes a contiguous
5//! stretch of bases of the same type (e.g. template bases) of some length and
6//! some offset from the start of the read.
7
8use crate::read_segment;
9use crate::read_segment::ReadSegment;
10use crate::read_segment::ANY_LENGTH_BYTE;
11use crate::segment_type::SegmentType;
12use crate::ErrorMessageParts;
13use crate::ReadStructureError;
14use std::convert::TryFrom;
15use std::ops::Index;
16use std::string;
17use std::string::ToString;
18
/// The read structure composed of one or more [`ReadSegment`]s.
///
/// Values built through [`ReadStructure::new`] (which the string parser and the slice
/// conversion in this module also go through) always contain at least one segment, and only
/// the final segment may have an indefinite length.
#[derive(Debug, Clone, PartialEq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct ReadStructure {
    /// The elements that make up the [`ReadStructure`], in read order.  Their offsets are
    /// assigned by [`ReadStructure::new`].
    elements: Vec<ReadSegment>,
    /// The combined length of fixed length segments, i.e. the sum of the lengths of all
    /// segments that have one (an indefinite-length segment contributes nothing).
    length_of_fixed_segments: usize,
}
28
29impl ReadStructure {
30    /// Builds a new [`ReadStructure`] from a vector of [`ReadSegment`]s.  The offsets
31    /// for the [`ReadSegment`]s are not updated.
32    // pub fn new(elements: Vec<ReadSegment>) -> Result<Self, ReadStructureError> {
33    //     let min_len = elements.iter().map(|elem| elem.length.unwrap_or(0)).sum();
34    //     Ok(ReadStructure { elements, length_of_fixed_segments: min_len })
35    // }
36
37    /// Builds a new [`ReadStructure`] from a vector of [`ReadSegment`]s.
38    ///
39    /// # Errors
40    ///
41    /// Returns `Err` if the any segment but the last has an indefinite length, or no elements
42    /// exist.
43    #[allow(clippy::missing_panics_doc)]
44    pub fn new(mut segments: Vec<ReadSegment>) -> Result<Self, ReadStructureError> {
45        if segments.is_empty() {
46            return Err(ReadStructureError::ReadStructureContainsZeroElements);
47        }
48
49        let mut num_indefinite = 0;
50        let mut length_of_fixed_segments = 0;
51        for s in &segments {
52            if let Some(len) = s.length {
53                length_of_fixed_segments += len;
54            } else {
55                num_indefinite += 1;
56            }
57        }
58
59        if segments.last().unwrap().has_length() {
60            if num_indefinite != 0 {
61                return Err(
62                    ReadStructureError::ReadStructureNonTerminalIndefiniteLengthReadSegment(
63                        *segments.iter().find(|s| !s.has_length()).unwrap(),
64                    ),
65                );
66            }
67        } else if num_indefinite > 1 {
68            return Err(ReadStructureError::ReadStructureNonTerminalIndefiniteLengthReadSegment(
69                *segments.iter().find(|s| !s.has_length()).unwrap(),
70            ));
71        }
72
73        let mut off: usize = 0;
74        for segment in &mut segments {
75            segment.offset = off;
76            off += segment.length.unwrap_or(0);
77        }
78        Ok(ReadStructure { elements: segments, length_of_fixed_segments })
79    }
80
81    /// Returns `true` if the [`ReadStructure`] has a fixed (i.e. non-variable) length,
82    /// `false` if there are segments but no fixed length.
83    pub fn has_fixed_length(&self) -> bool {
84        self.elements.last().unwrap().has_length()
85    }
86
87    /// Returns the fixed length if there is one.
88    pub fn fixed_length(&self) -> Option<usize> {
89        if self.has_fixed_length() {
90            Some(self.length_of_fixed_segments)
91        } else {
92            None
93        }
94    }
95
96    /// Returns the number of segments in this read structure.
97    pub fn number_of_segments(&self) -> usize {
98        self.elements.len()
99    }
100
101    /// Returns the underlying elements in this read structure.
102    pub fn segments(&self) -> &[ReadSegment] {
103        &self.elements
104    }
105
106    /// Returns an iterator over the read segments
107    pub fn iter(&self) -> impl Iterator<Item = &ReadSegment> {
108        self.elements.iter()
109    }
110
111    /// Returns the [`ReadSegment`]s in this read structure of the given kind.
112    pub fn segments_by_type(&self, kind: SegmentType) -> impl Iterator<Item = &ReadSegment> {
113        self.elements.iter().filter(move |seg| seg.kind == kind)
114    }
115
116    /// Returns the template [`ReadSegment`]s in this read structure
117    pub fn templates(&self) -> impl Iterator<Item = &ReadSegment> {
118        self.segments_by_type(SegmentType::Template)
119    }
120
121    /// Returns the sample barcode [`ReadSegment`]s in this read structure
122    pub fn sample_barcodes(&self) -> impl Iterator<Item = &ReadSegment> {
123        self.segments_by_type(SegmentType::SampleBarcode)
124    }
125
126    /// Returns the molecular barcode [`ReadSegment`]s in this read structure
127    pub fn molecular_barcodes(&self) -> impl Iterator<Item = &ReadSegment> {
128        self.segments_by_type(SegmentType::MolecularBarcode)
129    }
130
131    /// Returns the skip [`ReadSegment`]s in this read structure
132    pub fn skips(&self) -> impl Iterator<Item = &ReadSegment> {
133        self.segments_by_type(SegmentType::Skip)
134    }
135
136    /// Returns the cellular barcode [`ReadSegment`]s in this read structure
137    pub fn cellular_barcodes(&self) -> impl Iterator<Item = &ReadSegment> {
138        self.segments_by_type(SegmentType::CellularBarcode)
139    }
140
141    /// Returns the first [`ReadSegment`] in this read structure
142    pub fn first(&self) -> Option<&ReadSegment> {
143        self.elements.first()
144    }
145
146    /// Returns the last [`ReadSegment`] in this read structure
147    pub fn last(&self) -> Option<&ReadSegment> {
148        self.elements.last()
149    }
150}
151
152impl IntoIterator for ReadStructure {
153    type Item = ReadSegment;
154
155    type IntoIter = std::vec::IntoIter<Self::Item>;
156
157    fn into_iter(self) -> Self::IntoIter {
158        self.elements.into_iter()
159    }
160}
161
162impl Index<usize> for ReadStructure {
163    type Output = ReadSegment;
164
165    /// Returns the [`ReadSegment`] at the given index in the read structure.
166    fn index(&self, idx: usize) -> &Self::Output {
167        &self.elements[idx]
168    }
169}
170
171impl std::fmt::Display for ReadStructure {
172    /// Formats this read structure as a string.
173    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
174        for e in &self.elements {
175            write!(f, "{}", e)?;
176        }
177        Ok(())
178    }
179}
180
181impl std::str::FromStr for ReadStructure {
182    type Err = ReadStructureError;
183
184    /// Returns a new read structure from a string, or `Err` if parsing failed.
185    fn from_str(rs: &str) -> Result<Self, Self::Err> {
186        let mut offset = 0;
187        let mut i = 0;
188        let mut segs: Vec<ReadSegment> = Vec::new();
189        let chars: Vec<char> = rs.to_uppercase().chars().filter(|c| !c.is_whitespace()).collect();
190        while i < chars.len() {
191            // Stash the beginning position of our parsing so we can highlight what we're having trouble with
192            let parse_i = i;
193
194            // Parse out the length segment which many be 1 or more digits or the AnyLengthChar
195            let length = if chars[i] as u8 == ANY_LENGTH_BYTE {
196                i += 1;
197                None
198            } else if chars[i].is_digit(10) {
199                let mut len: usize = 0;
200                while i < chars.len() && chars[i].is_digit(10) {
201                    // Unwrap is save since we've checked `is_digit` already
202                    let digit = chars[i].to_digit(10).unwrap() as usize;
203                    len = (len * 10) + digit;
204                    i += 1;
205                }
206                Some(len)
207            } else {
208                return Err(ReadStructureError::ReadStructureMissingLengthInformation(
209                    ErrorMessageParts::new(&chars, parse_i, parse_i + 1),
210                ));
211            };
212
213            // Parse out the operator and make a segment
214            if chars.len() == i {
215                return Err(ReadStructureError::ReadStructureMissingOperator(
216                    ErrorMessageParts::new(&chars, parse_i, i),
217                ));
218            } else if let Ok(kind) = SegmentType::try_from(chars[i]) {
219                if length.map_or(false, |l| l == 0) {
220                    return Err(ReadStructureError::ReadSegmentLengthZero(ErrorMessageParts::new(
221                        &chars, parse_i, i,
222                    )));
223                }
224                i += 1;
225                segs.push(ReadSegment { offset, length, kind });
226                offset += length.unwrap_or(0);
227            } else {
228                return Err(ReadStructureError::ReadStructureHadUnknownType(
229                    ErrorMessageParts::new(&chars, parse_i, i + 1),
230                ));
231            }
232        }
233
234        ReadStructure::new(segs)
235    }
236}
237
238impl TryFrom<&[ReadSegment]> for ReadStructure {
239    type Error = ReadStructureError;
240    /// Builds a new read structure from a slice of elements.
241    fn try_from(elements: &[ReadSegment]) -> Result<Self, Self::Error> {
242        Self::new(elements.to_vec())
243    }
244}
245
// Unit tests for parsing, formatting, indexing and filtering of `ReadStructure`.
#[cfg(test)]
mod test {
    use crate::read_structure::ReadStructure;
    use std::str::FromStr;

    // Valid read structures must format back to exactly the string they were parsed from.
    #[test]
    fn test_read_structure_from_str() {
        let rss =
            ["1T", "1B", "1M", "1S", "101T", "5B101T", "123456789T", "10T10B10B10S10M", "5B2C3T"];
        for rs in &rss {
            assert_eq!(ReadStructure::from_str(rs).unwrap().to_string(), *rs);
        }
    }

    // Whitespace (spaces and tabs, anywhere in the input) is dropped by the parser, so both
    // inputs format back to the same compact string.
    #[test]
    fn test_read_structure_from_str_with_whitespace() {
        let rss = ["75T 8B 8B 75T", " 75T 8B 8B\t75T  "];
        for rs in &rss {
            assert_eq!(ReadStructure::from_str(rs).unwrap().to_string(), "75T8B8B75T");
        }
    }

    // The any-length character `+` is accepted when it appears once, on the last segment.
    #[test]
    fn test_read_structure_allow_anylength_char_only_once_and_for_last_segment() {
        assert_eq!(ReadStructure::from_str("5M+T").unwrap().to_string(), "5M+T");
        assert_eq!(ReadStructure::from_str("+M").unwrap().to_string(), "+M");
    }

    // Generates one `#[test]` per `name: input` pair asserting that parsing `input` fails.
    macro_rules! test_read_structure_from_str_err {
        ($($name:ident: $value:expr,)*) => {
        $(
            #[test]
            fn $name() {
                 assert!(ReadStructure::from_str($value).is_err());
            }
        )*
        }
    }

    // Misplaced or repeated `+`.  Despite the `_panic_` in the names, these cases assert
    // that an `Err` is returned; nothing is expected to panic.
    test_read_structure_from_str_err! {
        test_read_structure_allow_any_char_only_once_and_for_last_segment_panic_0: "++M",
        test_read_structure_allow_any_char_only_once_and_for_last_segment_panic_1: "5M++T",
        test_read_structure_allow_any_char_only_once_and_for_last_segment_panic_2: "5M70+T",
        test_read_structure_allow_any_char_only_once_and_for_last_segment_panic_3: "+M+T",
        test_read_structure_allow_any_char_only_once_and_for_last_segment_panic_4: "+M70T",
    }

    // Generates one `#[test]` per `name: (input, expected)` pair asserting that parsing
    // `input` fails with an error whose message ends with `expected`.
    macro_rules! test_read_structure_from_str_invalid {
        ($($name:ident: $value:expr,)*) => {
        $(
            #[test]
            fn $name() {
                let (input, expected) = $value;
                let actual = ReadStructure::from_str(input);
                assert!(actual.unwrap_err().to_string().ends_with(expected));
            }
        )*
        }
    }

    // NOTE(review): the square brackets in the expected suffix presumably mark the part of
    // the input the parser rejected; the message format is defined outside this file.
    test_read_structure_from_str_invalid! {
        test_read_structure_from_str_invalid_0: ("9R", "[9R]"),
        test_read_structure_from_str_invalid_1: ("T", "[T]"),
        test_read_structure_from_str_invalid_2: ("23TT", "23T[T]"),
        test_read_structure_from_str_invalid_3: ("23T2", "23T[2]"),
        test_read_structure_from_str_invalid_4: ("23T2TT23T", "23T2T[T]23T"),
    }

    // Each per-type accessor must yield only segments of its own type, in read order.
    #[test]
    fn test_read_structure_collect_segments() {
        let rs = ReadStructure::from_str("10M9T8B7S3C10M9T8B7S2C").unwrap();
        let templates: String = rs.templates().map(|s| s.to_string()).collect();
        assert_eq!(templates, "9T9T");
        let sample_barcodes: String = rs.sample_barcodes().map(|s| s.to_string()).collect();
        assert_eq!(sample_barcodes, "8B8B");
        let molecular_barcodes: String = rs.molecular_barcodes().map(|s| s.to_string()).collect();
        assert_eq!(molecular_barcodes, "10M10M");
        let skips: String = rs.skips().map(|s| s.to_string()).collect();
        assert_eq!(skips, "7S7S");
        let cellular_barcodes: String = rs.cellular_barcodes().map(|s| s.to_string()).collect();
        assert_eq!(cellular_barcodes, "3C2C");
    }

    // Generates one `#[test]` per `name: (input, expected)` pair asserting that parsing
    // `input` yields `expected` segments ("length" here is the segment count, not bases).
    macro_rules! test_read_structure_length {
        ($($name:ident: $value:expr,)*) => {
        $(
            #[test]
            fn $name() {
                let (input, expected) = $value;
                let actual = ReadStructure::from_str(input).unwrap().number_of_segments();
                assert_eq!(actual, expected);
            }
        )*
        }
    }

    test_read_structure_length! {
        test_read_structure_length_0: ("1T", 1),
        test_read_structure_length_1: ("1B", 1),
        test_read_structure_length_2: ("1M", 1),
        test_read_structure_length_3: ("1S", 1),
        test_read_structure_length_4: ("101T", 1),
        test_read_structure_length_5: ("5B101T", 2),
        test_read_structure_length_6: ("123456789T", 1),
        test_read_structure_length_7: ("10T10B10B10S10M", 5),
    }

    // Generates one `#[test]` per `name: (string, index, exp_string, exp_offset)` tuple:
    // after parsing `string`, the segment at `index` must format as `exp_string` and start
    // at offset `exp_offset`.
    macro_rules! test_read_structure_index {
        ($($name:ident: $value:expr,)*) => {
        $(
            #[test]
            fn $name() {
                let (string, index, exp_string, exp_offset) = $value;
                let read_structure = ReadStructure::from_str(string).unwrap();
                let read_segment = read_structure[index];
                assert_eq!(read_segment.to_string(), exp_string);
                assert_eq!(read_segment.offset, exp_offset);
            }
        )*
        }
    }

    // Expected offsets are the sum of the lengths of all preceding segments.
    test_read_structure_index! {
        test_read_structure_index_0: ("1T", 0, "1T", 0),
        test_read_structure_index_1: ("1B", 0, "1B", 0),
        test_read_structure_index_2: ("1M", 0, "1M", 0),
        test_read_structure_index_3: ("1S", 0, "1S", 0),
        test_read_structure_index_4: ("101T", 0, "101T", 0),
        test_read_structure_index_5: ("5B101T", 0, "5B", 0),
        test_read_structure_index_6: ("5B101T", 1, "101T", 5),
        test_read_structure_index_7: ("123456789T", 0, "123456789T", 0),
        test_read_structure_index_8: ("10T10B10B10S10M", 0, "10T", 0),
        test_read_structure_index_9: ("10T10B10B10S10M", 1, "10B", 10),
        test_read_structure_index_10: ("10T10B10B10S10M", 2, "10B", 20),
        test_read_structure_index_11: ("10T10B10B10S10M", 3, "10S", 30),
        test_read_structure_index_12: ("10T10B10B10S10M", 4, "10M", 40),
        test_read_structure_index_32: ("10T10B10B10S10C10M", 4, "10C", 40),
    }

    // With the `serde` feature enabled, a read structure must survive a JSON round trip
    // unchanged.
    #[test]
    #[cfg(feature = "serde")]
    fn test_serde() {
        let rs = ReadStructure::from_str("10T10B10B10S10M").unwrap();
        let rs_json = serde_json::to_string(&rs).unwrap();
        let rs2 = serde_json::from_str(&rs_json).unwrap();
        assert_eq!(rs, rs2);
    }
}