Skip to main content

read_structure/
read_structure.rs

1//! Read Structures
2//!
3//! Type [`ReadStructure`] describes the structure of a given read.  A read
4//! contains one or more read segments. A read segment describes a contiguous
5//! stretch of bases of the same type (e.g. template bases).
6//!
7//! At most one segment may be the indefinite-length (`+`) segment meaning "the rest
8//! of the read"; it can appear in any position, not just the terminal one.
9
10use crate::ErrorMessageParts;
11use crate::ReadStructureError;
12use crate::read_segment::ANY_LENGTH_BYTE;
13use crate::read_segment::ReadSegment;
14use crate::segment_type::SegmentType;
15use std::iter::FusedIterator;
16use std::ops::Index;
17
/// Policy for [`SegmentType::Skip`] segments in the iterator returned by
/// [`ReadStructure::extract`].
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub enum SkipHandling {
    /// Yield every segment, including those whose kind is [`SegmentType::Skip`].
    Include,
    /// Leave [`SegmentType::Skip`] segments out of the iterator's output.
    Exclude,
}
27
28/// A read structure made up of one or more [`ReadSegment`]s.
29///
30/// Internally, in addition to the segments themselves, we cache per-segment start
31/// offsets (signed) and enough summary information to resolve the indefinite-length
32/// (`+`) segment's end position at extract time without another pass over the
33/// segments.
34#[derive(Debug, Clone, PartialEq)]
35#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
36#[cfg_attr(feature = "serde", serde(into = "String", try_from = "String"))]
37pub struct ReadStructure {
38    /// The elements that make up the [`ReadStructure`].
39    elements: Vec<ReadSegment>,
40    /// Sum of lengths across all fixed-length (non-`+`) segments.
41    length_of_fixed_segments: usize,
42    /// Index of the indefinite-length (`+`) segment, if any.
43    plus_index: Option<usize>,
44    /// Sum of fixed-length-segment lengths strictly AFTER the `+` segment. Zero when
45    /// there is no `+` or when `+` is terminal. Allows the end of the `+` segment to
46    /// be located as `read_len - post_plus_len` at extract time.
47    post_plus_len: usize,
48    /// Per-element start offset.
49    ///
50    /// Sign carries the frame of reference:
51    /// - `>= 0` — offset from the start of the read. Used for segments before or
52    ///   at the `+`, and for every segment when there is no `+`.
53    /// - `<  0` — distance from the end of the read, stored as a negative number.
54    ///   Used for segments strictly after a non-terminal `+`. The actual start
55    ///   position in a read of length `L` is `L + offset` (i.e., `L - (-offset)`).
56    offsets: Vec<isize>,
57}
58
59impl ReadStructure {
60    /// Builds a new [`ReadStructure`] from a vector of [`ReadSegment`]s.
61    ///
62    /// At most one segment may have an indefinite length (`+`); it may appear in any
63    /// position.
64    ///
65    /// # Errors
66    ///
67    /// - Returns `Err` if no elements exist.
68    /// - Returns `Err` if more than one segment has an indefinite length.
69    #[allow(clippy::missing_panics_doc)]
70    pub fn new(segments: Vec<ReadSegment>) -> Result<Self, ReadStructureError> {
71        if segments.is_empty() {
72            return Err(ReadStructureError::ReadStructureContainsZeroElements);
73        }
74
75        let mut num_indefinite = 0;
76        let mut length_of_fixed_segments = 0;
77        let mut plus_index: Option<usize> = None;
78        for (i, s) in segments.iter().enumerate() {
79            if let Some(len) = s.length {
80                length_of_fixed_segments += len;
81            } else {
82                num_indefinite += 1;
83                if plus_index.is_none() {
84                    plus_index = Some(i);
85                }
86            }
87        }
88
89        if num_indefinite > 1 {
90            return Err(ReadStructureError::ReadStructureMultipleIndefiniteLengthSegments(
91                *segments.iter().find(|s| !s.has_length()).unwrap(),
92            ));
93        }
94
95        // Compute per-element offsets. Two-pass: forward for pre-and-at-`+` (and
96        // for every segment when there is no `+`), then backward for segments
97        // strictly after the `+`.
98        let n = segments.len();
99        let mut offsets = vec![0isize; n];
100
101        // Forward pass up to and including the `+` (or the whole vec if no `+`).
102        let forward_end = plus_index.map_or(n, |p| p + 1);
103        let mut off: usize = 0;
104        for (i, seg) in segments.iter().take(forward_end).enumerate() {
105            offsets[i] = off as isize;
106            off += seg.length.unwrap_or(0);
107        }
108
109        // Backward pass for segments strictly after the `+`, if any.
110        let mut post_plus_len: usize = 0;
111        if let Some(p) = plus_index {
112            // `dist_from_end` is the total number of bases from the *start* of the
113            // current segment to the end of the read. We walk from the end back.
114            let mut dist_from_end: usize = 0;
115            for (i, seg) in segments.iter().enumerate().skip(p + 1).rev() {
116                // Fixed length — invariants: non-`+` segments always have a length.
117                let len = seg.length.expect("post-+ segments must be fixed length");
118                dist_from_end += len;
119                offsets[i] = -(dist_from_end as isize);
120            }
121            post_plus_len = dist_from_end;
122        }
123
124        Ok(ReadStructure {
125            elements: segments,
126            length_of_fixed_segments,
127            plus_index,
128            post_plus_len,
129            offsets,
130        })
131    }
132
133    /// Extracts the bases and quality scores for every segment in this read structure.
134    ///
135    /// Returns a no-alloc iterator over `(&ReadSegment, &[u8] bases, &[u8] quals)` triples.
136    /// All error conditions are checked up front, so the returned iterator is infallible
137    /// and can be cheaply `.collect()`ed into a `Vec` when an owned collection is needed.
138    ///
139    /// `skip_handling` controls whether [`SegmentType::Skip`] segments are emitted
140    /// ([`SkipHandling::Include`]) or silently dropped ([`SkipHandling::Exclude`]).
141    ///
142    /// # Errors
143    ///
144    /// - [`ReadStructureError::MismatchingBasesAndQualsLen`] if `bases.len() != quals.len()`.
145    /// - [`ReadStructureError::ReadTooShort`] if the read cannot accommodate every fixed
146    ///   segment. When the structure also has a `+` segment, the read must be strictly
147    ///   longer than the sum of the fixed-segment lengths (since `+` means at least
148    ///   one base).
149    /// - [`ReadStructureError::ReadTooLong`] if the structure has no `+` segment and
150    ///   `bases.len()` does not exactly match `length_of_fixed_segments`. Silently
151    ///   truncating trailing bases is almost always a bug (wrong structure, stray
152    ///   adapter, etc.), so we require an exact match.
153    pub fn extract<'rs, 'b>(
154        &'rs self,
155        bases: &'b [u8],
156        quals: &'b [u8],
157        skip_handling: SkipHandling,
158    ) -> Result<ExtractedSegments<'rs, 'b>, ReadStructureError> {
159        if bases.len() != quals.len() {
160            return Err(ReadStructureError::MismatchingBasesAndQualsLen {
161                bases_len: bases.len(),
162                quals_len: quals.len(),
163            });
164        }
165
166        let required = if self.plus_index.is_some() {
167            self.length_of_fixed_segments + 1
168        } else {
169            self.length_of_fixed_segments
170        };
171        if bases.len() < required {
172            return Err(ReadStructureError::ReadTooShort { read_len: bases.len(), required });
173        }
174        if self.plus_index.is_none() && bases.len() > self.length_of_fixed_segments {
175            return Err(ReadStructureError::ReadTooLong {
176                read_len: bases.len(),
177                expected: self.length_of_fixed_segments,
178            });
179        }
180
181        Ok(ExtractedSegments {
182            elements: &self.elements,
183            offsets: &self.offsets,
184            plus_index: self.plus_index,
185            post_plus_len: self.post_plus_len,
186            bases,
187            quals,
188            skip_handling,
189            next_index: 0,
190        })
191    }
192
193    /// Returns `true` if the [`ReadStructure`] has a fixed (i.e. non-variable) length,
194    /// `false` if any segment has an indefinite length.
195    pub fn has_fixed_length(&self) -> bool {
196        self.plus_index.is_none()
197    }
198
199    /// Returns the fixed length if there is one.
200    pub fn fixed_length(&self) -> Option<usize> {
201        if self.has_fixed_length() { Some(self.length_of_fixed_segments) } else { None }
202    }
203
204    /// Returns the number of segments in this read structure.
205    pub fn number_of_segments(&self) -> usize {
206        self.elements.len()
207    }
208
209    /// Returns the underlying elements in this read structure.
210    pub fn segments(&self) -> &[ReadSegment] {
211        &self.elements
212    }
213
214    /// Returns an iterator over the read segments.
215    pub fn iter(&self) -> impl Iterator<Item = &ReadSegment> {
216        self.elements.iter()
217    }
218
219    /// Returns the [`ReadSegment`]s in this read structure of the given kind.
220    pub fn segments_by_type(&self, kind: SegmentType) -> impl Iterator<Item = &ReadSegment> {
221        self.elements.iter().filter(move |seg| seg.kind == kind)
222    }
223
224    /// Returns the template [`ReadSegment`]s in this read structure.
225    pub fn templates(&self) -> impl Iterator<Item = &ReadSegment> {
226        self.segments_by_type(SegmentType::Template)
227    }
228
229    /// Returns the sample barcode [`ReadSegment`]s in this read structure.
230    pub fn sample_barcodes(&self) -> impl Iterator<Item = &ReadSegment> {
231        self.segments_by_type(SegmentType::SampleBarcode)
232    }
233
234    /// Returns the molecular barcode [`ReadSegment`]s in this read structure.
235    pub fn molecular_barcodes(&self) -> impl Iterator<Item = &ReadSegment> {
236        self.segments_by_type(SegmentType::MolecularBarcode)
237    }
238
239    /// Returns the skip [`ReadSegment`]s in this read structure.
240    pub fn skips(&self) -> impl Iterator<Item = &ReadSegment> {
241        self.segments_by_type(SegmentType::Skip)
242    }
243
244    /// Returns the cellular barcode [`ReadSegment`]s in this read structure.
245    pub fn cellular_barcodes(&self) -> impl Iterator<Item = &ReadSegment> {
246        self.segments_by_type(SegmentType::CellularBarcode)
247    }
248
249    /// Returns the first [`ReadSegment`] in this read structure.
250    pub fn first(&self) -> Option<&ReadSegment> {
251        self.elements.first()
252    }
253
254    /// Returns the last [`ReadSegment`] in this read structure.
255    pub fn last(&self) -> Option<&ReadSegment> {
256        self.elements.last()
257    }
258}
259
260/// Iterator returned by [`ReadStructure::extract`].
261///
262/// Yields `(&ReadSegment, &[u8] bases, &[u8] quals)` triples — one per segment in
263/// the underlying read structure, in order (with `Skip` segments optionally filtered
264/// per [`SkipHandling`]). The iterator is infallible: all error checks are performed
265/// up front in [`ReadStructure::extract`].
266#[derive(Debug, Clone)]
267pub struct ExtractedSegments<'rs, 'b> {
268    elements: &'rs [ReadSegment],
269    offsets: &'rs [isize],
270    plus_index: Option<usize>,
271    post_plus_len: usize,
272    bases: &'b [u8],
273    quals: &'b [u8],
274    skip_handling: SkipHandling,
275    next_index: usize,
276}
277
278impl<'rs, 'b> Iterator for ExtractedSegments<'rs, 'b> {
279    type Item = (&'rs ReadSegment, &'b [u8], &'b [u8]);
280
281    fn next(&mut self) -> Option<Self::Item> {
282        while self.next_index < self.elements.len() {
283            let i = self.next_index;
284            self.next_index += 1;
285            let seg = &self.elements[i];
286            if self.skip_handling == SkipHandling::Exclude && seg.kind == SegmentType::Skip {
287                continue;
288            }
289            let (start, end) = if Some(i) == self.plus_index {
290                // The indefinite-length segment: runs from its stored start offset
291                // to just before the post-`+` fixed region.
292                (self.offsets[i] as usize, self.bases.len() - self.post_plus_len)
293            } else {
294                let off = self.offsets[i];
295                let start =
296                    if off >= 0 { off as usize } else { self.bases.len() - ((-off) as usize) };
297                // Non-`+` segments always have a fixed length.
298                let len = seg.length.expect("non-`+` segment must have a length");
299                (start, start + len)
300            };
301            return Some((seg, &self.bases[start..end], &self.quals[start..end]));
302        }
303        None
304    }
305}
306
307impl FusedIterator for ExtractedSegments<'_, '_> {}
308
309impl IntoIterator for ReadStructure {
310    type Item = ReadSegment;
311
312    type IntoIter = std::vec::IntoIter<Self::Item>;
313
314    fn into_iter(self) -> Self::IntoIter {
315        self.elements.into_iter()
316    }
317}
318
319impl Index<usize> for ReadStructure {
320    type Output = ReadSegment;
321
322    /// Returns the [`ReadSegment`] at the given index in the read structure.
323    fn index(&self, idx: usize) -> &Self::Output {
324        &self.elements[idx]
325    }
326}
327
328impl std::fmt::Display for ReadStructure {
329    /// Formats this read structure as a string.
330    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
331        for e in &self.elements {
332            write!(f, "{}", e)?;
333        }
334        Ok(())
335    }
336}
337
338impl std::str::FromStr for ReadStructure {
339    type Err = ReadStructureError;
340
341    /// Returns a new read structure from a string, or `Err` if parsing failed.
342    fn from_str(rs: &str) -> Result<Self, Self::Err> {
343        let mut i = 0;
344        let mut segs: Vec<ReadSegment> = Vec::new();
345        let chars: Vec<char> = rs.to_uppercase().chars().filter(|c| !c.is_whitespace()).collect();
346        while i < chars.len() {
347            // Stash the beginning position of our parsing so we can highlight what we're having trouble with
348            let parse_i = i;
349
350            // Parse out the length segment which many be 1 or more digits or the AnyLengthChar
351            let length = if chars[i] as u8 == ANY_LENGTH_BYTE {
352                i += 1;
353                None
354            } else if chars[i].is_ascii_digit() {
355                let mut len: usize = 0;
356                while i < chars.len() && chars[i].is_ascii_digit() {
357                    // Unwrap is safe since we've checked `is_ascii_digit` already
358                    let digit = chars[i].to_digit(10).unwrap() as usize;
359                    len = (len * 10) + digit;
360                    i += 1;
361                }
362                Some(len)
363            } else {
364                return Err(ReadStructureError::ReadStructureMissingLengthInformation(
365                    ErrorMessageParts::new(&chars, parse_i, parse_i + 1),
366                ));
367            };
368
369            // Parse out the operator and make a segment
370            if chars.len() == i {
371                return Err(ReadStructureError::ReadStructureMissingOperator(
372                    ErrorMessageParts::new(&chars, parse_i, i),
373                ));
374            } else if let Ok(kind) = SegmentType::try_from(chars[i]) {
375                if length == Some(0) {
376                    return Err(ReadStructureError::ReadSegmentLengthZero(ErrorMessageParts::new(
377                        &chars, parse_i, i,
378                    )));
379                }
380                i += 1;
381                segs.push(ReadSegment { length, kind });
382            } else {
383                return Err(ReadStructureError::ReadStructureHadUnknownType(
384                    ErrorMessageParts::new(&chars, parse_i, i + 1),
385                ));
386            }
387        }
388
389        ReadStructure::new(segs)
390    }
391}
392
393impl TryFrom<&[ReadSegment]> for ReadStructure {
394    type Error = ReadStructureError;
395    /// Builds a new read structure from a slice of elements.
396    fn try_from(elements: &[ReadSegment]) -> Result<Self, Self::Error> {
397        Self::new(elements.to_vec())
398    }
399}
400
401impl TryFrom<String> for ReadStructure {
402    type Error = ReadStructureError;
403    /// Parses a read structure from an owned string; equivalent to [`FromStr`].
404    fn try_from(s: String) -> Result<Self, Self::Error> {
405        s.parse()
406    }
407}
408
409impl From<ReadStructure> for String {
410    /// Renders the read structure to its canonical string form (e.g. `"8B+M10T"`).
411    fn from(rs: ReadStructure) -> Self {
412        rs.to_string()
413    }
414}
415
416#[cfg(test)]
417mod test {
418    use crate::ReadStructureError;
419    use crate::read_structure::{ReadStructure, SkipHandling};
420    use crate::segment_type::SegmentType;
421    use std::str::FromStr;
422
423    #[test]
424    fn test_read_structure_from_str() {
425        let rss =
426            ["1T", "1B", "1M", "1S", "101T", "5B101T", "123456789T", "10T10B10B10S10M", "5B2C3T"];
427        for rs in &rss {
428            assert_eq!(ReadStructure::from_str(rs).unwrap().to_string(), *rs);
429        }
430    }
431
432    #[test]
433    fn test_read_structure_from_str_with_whitespace() {
434        let rss = ["75T 8B 8B 75T", " 75T 8B 8B\t75T  "];
435        for rs in &rss {
436            assert_eq!(ReadStructure::from_str(rs).unwrap().to_string(), "75T8B8B75T");
437        }
438    }
439
440    #[test]
441    fn test_read_structure_accepts_plus_at_any_position_once() {
442        assert_eq!(ReadStructure::from_str("5M+T").unwrap().to_string(), "5M+T");
443        assert_eq!(ReadStructure::from_str("+M").unwrap().to_string(), "+M");
444    }
445
446    macro_rules! test_read_structure_from_str_err {
447        ($($name:ident: $value:expr_2021,)*) => {
448        $(
449            #[test]
450            fn $name() {
451                 assert!(ReadStructure::from_str($value).is_err());
452            }
453        )*
454        }
455    }
456
457    test_read_structure_from_str_err! {
458        test_read_structure_rejects_multiple_plus_0: "++M",
459        test_read_structure_rejects_multiple_plus_1: "5M++T",
460        test_read_structure_rejects_multiple_plus_2: "5M70+T",
461        test_read_structure_rejects_multiple_plus_3: "+M+T",
462        test_read_structure_rejects_multiple_plus_4: "5M+T+B",
463    }
464
465    macro_rules! test_read_structure_from_str_invalid {
466        ($($name:ident: $value:expr_2021,)*) => {
467        $(
468            #[test]
469            fn $name() {
470                let (input, expected) = $value;
471                let actual = ReadStructure::from_str(input);
472                assert!(actual.unwrap_err().to_string().ends_with(expected));
473            }
474        )*
475        }
476    }
477
478    test_read_structure_from_str_invalid! {
479        test_read_structure_from_str_invalid_0: ("9R", "[9R]"),
480        test_read_structure_from_str_invalid_1: ("T", "[T]"),
481        test_read_structure_from_str_invalid_2: ("23TT", "23T[T]"),
482        test_read_structure_from_str_invalid_3: ("23T2", "23T[2]"),
483        test_read_structure_from_str_invalid_4: ("23T2TT23T", "23T2T[T]23T"),
484    }
485
486    #[test]
487    fn test_read_structure_collect_segments() {
488        let rs = ReadStructure::from_str("10M9T8B7S3C10M9T8B7S2C").unwrap();
489        let templates: String = rs.templates().map(|s| s.to_string()).collect();
490        assert_eq!(templates, "9T9T");
491        let sample_barcodes: String = rs.sample_barcodes().map(|s| s.to_string()).collect();
492        assert_eq!(sample_barcodes, "8B8B");
493        let molecular_barcodes: String = rs.molecular_barcodes().map(|s| s.to_string()).collect();
494        assert_eq!(molecular_barcodes, "10M10M");
495        let skips: String = rs.skips().map(|s| s.to_string()).collect();
496        assert_eq!(skips, "7S7S");
497        let cellular_barcodes: String = rs.cellular_barcodes().map(|s| s.to_string()).collect();
498        assert_eq!(cellular_barcodes, "3C2C");
499    }
500
501    macro_rules! test_read_structure_length {
502        ($($name:ident: $value:expr_2021,)*) => {
503        $(
504            #[test]
505            fn $name() {
506                let (input, expected) = $value;
507                let actual = ReadStructure::from_str(input).unwrap().number_of_segments();
508                assert_eq!(actual, expected);
509            }
510        )*
511        }
512    }
513
514    test_read_structure_length! {
515        test_read_structure_length_0: ("1T", 1),
516        test_read_structure_length_1: ("1B", 1),
517        test_read_structure_length_2: ("1M", 1),
518        test_read_structure_length_3: ("1S", 1),
519        test_read_structure_length_4: ("101T", 1),
520        test_read_structure_length_5: ("5B101T", 2),
521        test_read_structure_length_6: ("123456789T", 1),
522        test_read_structure_length_7: ("10T10B10B10S10M", 5),
523    }
524
525    macro_rules! test_read_structure_index {
526        ($($name:ident: $value:expr_2021,)*) => {
527        $(
528            #[test]
529            fn $name() {
530                let (string, index, exp_string) = $value;
531                let read_structure = ReadStructure::from_str(string).unwrap();
532                let read_segment = read_structure[index];
533                assert_eq!(read_segment.to_string(), exp_string);
534            }
535        )*
536        }
537    }
538
539    test_read_structure_index! {
540        test_read_structure_index_0: ("1T", 0, "1T"),
541        test_read_structure_index_1: ("1B", 0, "1B"),
542        test_read_structure_index_2: ("1M", 0, "1M"),
543        test_read_structure_index_3: ("1S", 0, "1S"),
544        test_read_structure_index_4: ("101T", 0, "101T"),
545        test_read_structure_index_5: ("5B101T", 0, "5B"),
546        test_read_structure_index_6: ("5B101T", 1, "101T"),
547        test_read_structure_index_7: ("123456789T", 0, "123456789T"),
548        test_read_structure_index_8: ("10T10B10B10S10M", 0, "10T"),
549        test_read_structure_index_9: ("10T10B10B10S10M", 1, "10B"),
550        test_read_structure_index_10: ("10T10B10B10S10M", 2, "10B"),
551        test_read_structure_index_11: ("10T10B10B10S10M", 3, "10S"),
552        test_read_structure_index_12: ("10T10B10B10S10M", 4, "10M"),
553        test_read_structure_index_32: ("10T10B10B10S10C10M", 4, "10C"),
554    }
555
556    #[test]
557    #[cfg(feature = "serde")]
558    fn test_serde() {
559        let rs = ReadStructure::from_str("10T10B10B10S10M").unwrap();
560        let rs_json = serde_json::to_string(&rs).unwrap();
561        let rs2 = serde_json::from_str(&rs_json).unwrap();
562        assert_eq!(rs, rs2);
563    }
564
565    #[test]
566    #[cfg(feature = "serde")]
567    fn test_serde_middle_plus_round_trip() {
568        let rs = ReadStructure::from_str("8B+M10T").unwrap();
569        let rs_json = serde_json::to_string(&rs).unwrap();
570        let rs2: ReadStructure = serde_json::from_str(&rs_json).unwrap();
571        assert_eq!(rs, rs2);
572    }
573
574    #[test]
575    #[cfg(feature = "serde")]
576    fn test_serde_wire_format_is_canonical_string() {
577        // Pin the serialized form: a `ReadStructure` encodes as a single JSON
578        // string, not as an object with internal cached fields.
579        let rs = ReadStructure::from_str("8B+M10T").unwrap();
580        let rs_json = serde_json::to_string(&rs).unwrap();
581        assert_eq!(rs_json, "\"8B+M10T\"");
582    }
583
584    #[test]
585    #[cfg(feature = "serde")]
586    fn test_serde_rejects_invalid_string() {
587        let err = serde_json::from_str::<ReadStructure>("\"not a read structure\"").unwrap_err();
588        // Any deserialization error is acceptable here; just ensure it doesn't
589        // silently produce an inconsistent structure.
590        assert!(!err.to_string().is_empty());
591    }
592
593    // ---- non-terminal `+` acceptance (parsing + round-trip) ----
594
595    #[test]
596    fn test_accepts_middle_plus() {
597        let rs = ReadStructure::from_str("8B+M10T").unwrap();
598        assert_eq!(rs.to_string(), "8B+M10T");
599        assert_eq!(rs.number_of_segments(), 3);
600    }
601
602    #[test]
603    fn test_accepts_leading_plus() {
604        let rs = ReadStructure::from_str("+B10T").unwrap();
605        assert_eq!(rs.to_string(), "+B10T");
606        assert_eq!(rs.number_of_segments(), 2);
607    }
608
609    #[test]
610    fn test_accepts_middle_plus_between_fixed_runs() {
611        let rs = ReadStructure::from_str("10T8B+M10T").unwrap();
612        assert_eq!(rs.to_string(), "10T8B+M10T");
613        assert_eq!(rs.number_of_segments(), 4);
614    }
615
616    // ---- has_fixed_length / fixed_length ----
617
618    #[test]
619    fn test_has_fixed_length_strict() {
620        assert!(ReadStructure::from_str("10T8B").unwrap().has_fixed_length());
621        assert!(!ReadStructure::from_str("10T+M").unwrap().has_fixed_length());
622    }
623
624    #[test]
625    fn test_has_fixed_length_middle_plus() {
626        assert!(!ReadStructure::from_str("8B+M10T").unwrap().has_fixed_length());
627    }
628
629    #[test]
630    fn test_fixed_length_none_for_middle_plus() {
631        assert!(ReadStructure::from_str("8B+M10T").unwrap().fixed_length().is_none());
632    }
633
634    // ---- extraction via ReadStructure::extract ----
635
636    #[test]
637    fn test_extract_fixed_length() {
638        let rs = ReadStructure::from_str("10T8B").unwrap();
639        let bases = b"AAAAAAAAAAGGGGGGGG";
640        let quals = b"IIIIIIIIIIJJJJJJJJ";
641        let out: Vec<_> = rs.extract(bases, quals, SkipHandling::Include).unwrap().collect();
642        assert_eq!(out.len(), 2);
643        assert_eq!(out[0].0.kind, SegmentType::Template);
644        assert_eq!(out[0].1, b"AAAAAAAAAA");
645        assert_eq!(out[0].2, b"IIIIIIIIII");
646        assert_eq!(out[1].0.kind, SegmentType::SampleBarcode);
647        assert_eq!(out[1].1, b"GGGGGGGG");
648        assert_eq!(out[1].2, b"JJJJJJJJ");
649    }
650
651    #[test]
652    fn test_extract_trailing_plus() {
653        let rs = ReadStructure::from_str("10T+M").unwrap();
654        let bases = b"AAAAAAAAAAGGGGGGGGGG";
655        let quals = b"IIIIIIIIIIJJJJJJJJJJ";
656        let out: Vec<_> = rs.extract(bases, quals, SkipHandling::Include).unwrap().collect();
657        assert_eq!(out.len(), 2);
658        assert_eq!(out[0].1, b"AAAAAAAAAA");
659        assert_eq!(out[1].1, b"GGGGGGGGGG");
660        assert_eq!(out[1].2, b"JJJJJJJJJJ");
661    }
662
663    #[test]
664    fn test_extract_leading_plus() {
665        let rs = ReadStructure::from_str("+B10T").unwrap();
666        let bases = b"BBBBBTTTTTTTTTT";
667        let quals = b"!!!!!##########";
668        let out: Vec<_> = rs.extract(bases, quals, SkipHandling::Include).unwrap().collect();
669        assert_eq!(out.len(), 2);
670        assert_eq!(out[0].0.kind, SegmentType::SampleBarcode);
671        assert_eq!(out[0].1, b"BBBBB");
672        assert_eq!(out[0].2, b"!!!!!");
673        assert_eq!(out[1].0.kind, SegmentType::Template);
674        assert_eq!(out[1].1, b"TTTTTTTTTT");
675        assert_eq!(out[1].2, b"##########");
676    }
677
678    #[test]
679    fn test_extract_middle_plus() {
680        let rs = ReadStructure::from_str("8B+M10T").unwrap();
681        let bases = b"BBBBBBBBUUUUUUUUUUUUTTTTTTTTTT";
682        let quals = b"!!!!!!!!@@@@@@@@@@@@##########";
683        assert_eq!(bases.len(), 30);
684        let out: Vec<_> = rs.extract(bases, quals, SkipHandling::Include).unwrap().collect();
685        assert_eq!(out.len(), 3);
686        assert_eq!(out[0].1, b"BBBBBBBB");
687        assert_eq!(out[0].2, b"!!!!!!!!");
688        assert_eq!(out[1].0.kind, SegmentType::MolecularBarcode);
689        assert_eq!(out[1].1, b"UUUUUUUUUUUU");
690        assert_eq!(out[1].2, b"@@@@@@@@@@@@");
691        assert_eq!(out[2].1, b"TTTTTTTTTT");
692        assert_eq!(out[2].2, b"##########");
693    }
694
695    #[test]
696    fn test_extract_multiple_pre_plus_and_post_plus() {
697        // Two pre-plus, one middle plus, one post-plus.
698        let rs = ReadStructure::from_str("10T8B+M10T").unwrap();
699        let bases = b"TTTTTTTTTTBBBBBBBBUUUUUUUUUUUUTTTTTTTTTT";
700        let quals = b"IIIIIIIIII!!!!!!!!@@@@@@@@@@@@##########";
701        assert_eq!(bases.len(), 40);
702        let out: Vec<_> = rs.extract(bases, quals, SkipHandling::Include).unwrap().collect();
703        assert_eq!(out.len(), 4);
704        assert_eq!(out[0].1, b"TTTTTTTTTT");
705        assert_eq!(out[1].1, b"BBBBBBBB");
706        assert_eq!(out[2].1, b"UUUUUUUUUUUU");
707        assert_eq!(out[3].1, b"TTTTTTTTTT");
708    }
709
710    #[test]
711    fn test_extract_include_skips_false_drops_skip() {
712        let rs = ReadStructure::from_str("8S+M10T").unwrap();
713        let bases = b"SSSSSSSSUUUUUUUUUUUUTTTTTTTTTT";
714        let quals = b"????????@@@@@@@@@@@@##########";
715        let out: Vec<_> = rs.extract(bases, quals, SkipHandling::Exclude).unwrap().collect();
716        assert_eq!(out.len(), 2);
717        assert_eq!(out[0].0.kind, SegmentType::MolecularBarcode);
718        assert_eq!(out[1].0.kind, SegmentType::Template);
719    }
720
721    #[test]
722    fn test_extract_include_skips_true_keeps_skip() {
723        let rs = ReadStructure::from_str("8S+M10T").unwrap();
724        let bases = b"SSSSSSSSUUUUUUUUUUUUTTTTTTTTTT";
725        let quals = b"????????@@@@@@@@@@@@##########";
726        let out: Vec<_> = rs.extract(bases, quals, SkipHandling::Include).unwrap().collect();
727        assert_eq!(out.len(), 3);
728        assert_eq!(out[0].0.kind, SegmentType::Skip);
729        assert_eq!(out[0].1, b"SSSSSSSS");
730    }
731
732    #[test]
733    fn test_extract_errors_on_bases_quals_length_mismatch() {
734        let rs = ReadStructure::from_str("10T").unwrap();
735        let err = rs.extract(b"AAAAAAAAAA", b"III", SkipHandling::Include).unwrap_err();
736        assert!(matches!(err, ReadStructureError::MismatchingBasesAndQualsLen { .. }));
737    }
738
739    #[test]
740    fn test_extract_errors_when_read_too_short_for_fixed() {
741        let rs = ReadStructure::from_str("10T8B").unwrap();
742        let err = rs.extract(b"AAAA", b"IIII", SkipHandling::Include).unwrap_err();
743        match err {
744            ReadStructureError::ReadTooShort { read_len, required } => {
745                assert_eq!(read_len, 4);
746                assert_eq!(required, 18);
747            }
748            other => panic!("expected ReadTooShort, got {:?}", other),
749        }
750    }
751
#[test]
fn test_extract_errors_when_read_too_long_for_fixed() {
    // A structure with no `+` segment demands a read of exactly its fixed length.
    // Leftover bases usually indicate a mistake (wrong structure, stray adapter),
    // so they are reported instead of being silently discarded.
    let read_bases = [b'X'; 20]; // two more than the fixed length of 18
    let read_quals = [b'#'; 20];
    let structure = ReadStructure::from_str("10T8B").unwrap();

    let error = structure.extract(&read_bases, &read_quals, SkipHandling::Include).unwrap_err();

    if let ReadStructureError::ReadTooLong { read_len, expected } = error {
        assert_eq!(read_len, 20);
        assert_eq!(expected, 18);
    } else {
        panic!("expected ReadTooLong, got {:?}", error);
    }
}
768
#[test]
fn test_extract_allows_extra_bases_when_plus_present() {
    // The `+` segment takes whatever the fixed segments on either side leave over.
    let read_bases = b"BBBBBBBBUUUUUUUUUUUUUUUUUUUUUUUUTTTTTTTTTT";
    let read_quals = b"!!!!!!!!@@@@@@@@@@@@@@@@@@@@@@@@##########";
    assert_eq!(read_bases.len(), 42);

    let structure = ReadStructure::from_str("8B+M10T").unwrap();
    let segments = structure
        .extract(read_bases, read_quals, SkipHandling::Include)
        .unwrap()
        .collect::<Vec<_>>();

    assert_eq!(segments.len(), 3);
    let plus_segment = &segments[1];
    assert_eq!(plus_segment.1.len(), 24); // 42 total - 8 leading - 10 trailing
}
780
#[test]
fn test_extract_errors_when_read_exactly_fixed_len_but_plus_present() {
    // The `+` segment needs at least one base of its own, so a read that only
    // covers the fixed segments (8 + 10 = 18) is one base short.
    let read_bases = [b'X'; 18];
    let read_quals = [b'#'; 18];
    let structure = ReadStructure::from_str("8B+M10T").unwrap();

    let error = structure.extract(&read_bases, &read_quals, SkipHandling::Include).unwrap_err();

    if let ReadStructureError::ReadTooShort { read_len, required } = error {
        assert_eq!(read_len, 18);
        assert_eq!(required, 19);
    } else {
        panic!("expected ReadTooShort, got {:?}", error);
    }
}
796
#[test]
fn test_extract_allows_read_exactly_fixed_len_when_no_plus() {
    // Without a `+` segment, a read of exactly the fixed length (10 + 8) is accepted.
    let read_bases = [b'X'; 18];
    let read_quals = [b'#'; 18];
    let structure = ReadStructure::from_str("10T8B").unwrap();

    let segments = structure
        .extract(&read_bases, &read_quals, SkipHandling::Include)
        .unwrap()
        .collect::<Vec<_>>();

    assert_eq!(segments.len(), 2);
}
805
#[test]
fn test_extract_plus_only_structure() {
    // A lone `+T` covers the whole read: one template segment spanning every base.
    let read_bases = b"AAAAAAAAAA";
    let read_quals = b"IIIIIIIIII";
    let structure = ReadStructure::from_str("+T").unwrap();

    let segments = structure
        .extract(read_bases, read_quals, SkipHandling::Include)
        .unwrap()
        .collect::<Vec<_>>();

    assert_eq!(segments.len(), 1);
    let only = &segments[0];
    assert_eq!(only.0.kind, SegmentType::Template);
    assert_eq!(only.1, read_bases);
    assert_eq!(only.2, read_quals);
}
818
#[test]
fn test_extract_plus_yields_one_base_at_minimum_length() {
    // The shortest read `"8B+M10T"` accepts is 8 + 1 + 10 = 19 bases, which leaves
    // exactly one base (and one quality) for the `+M` segment.
    let read_bases = b"BBBBBBBBMTTTTTTTTTT";
    let read_quals = b"!!!!!!!!@##########";
    assert_eq!(read_bases.len(), 19);

    let structure = ReadStructure::from_str("8B+M10T").unwrap();
    let segments = structure
        .extract(read_bases, read_quals, SkipHandling::Include)
        .unwrap()
        .collect::<Vec<_>>();

    assert_eq!(segments.len(), 3);
    let (leading, plus_segment, trailing) = (&segments[0], &segments[1], &segments[2]);
    assert_eq!(leading.1, b"BBBBBBBB");
    assert_eq!(plus_segment.0.kind, SegmentType::MolecularBarcode);
    assert_eq!(plus_segment.1, b"M");
    assert_eq!(plus_segment.2, b"@");
    assert_eq!(trailing.1, b"TTTTTTTTTT");
}
835
#[test]
fn test_extract_multiple_post_plus_segments() {
    // Two fixed-length segments follow the `+`, so both must be taken from the
    // tail of the read while the `+M` segment receives everything in between.
    let read_bases = b"BBBBBBBBUUUUUUUUUUUUTTTTTSSSSS";
    let read_quals = b"!!!!!!!!@@@@@@@@@@@@#####?????";
    assert_eq!(read_bases.len(), 30);

    let structure = ReadStructure::from_str("8B+M5T5S").unwrap();
    let segments = structure
        .extract(read_bases, read_quals, SkipHandling::Include)
        .unwrap()
        .collect::<Vec<_>>();

    assert_eq!(segments.len(), 4);
    let (leading, plus_segment) = (&segments[0], &segments[1]);
    let (template, skipped) = (&segments[2], &segments[3]);
    assert_eq!(leading.1, b"BBBBBBBB");
    assert_eq!(plus_segment.0.kind, SegmentType::MolecularBarcode);
    assert_eq!(plus_segment.1, b"UUUUUUUUUUUU");
    assert_eq!(template.0.kind, SegmentType::Template);
    assert_eq!(template.1, b"TTTTT");
    assert_eq!(skipped.0.kind, SegmentType::Skip);
    assert_eq!(skipped.1, b"SSSSS");
}
853}