fgumi 0.2.0

High-performance tools for UMI-tagged sequencing data: extraction, grouping, and consensus calling
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
//! FASTQ file parsing and read structure handling.
//!
//! This module provides functionality for reading and parsing FASTQ files with support for
//! complex read structures. It can handle reads that contain multiple segments of different
//! types (template, UMI, sample barcodes, cell barcodes) and provides utilities for
//! extracting and manipulating these segments.
//!
//! # Read Structures
//!
//! Read structures describe the layout of bases within a FASTQ read. For example, a read
//! might be structured as "8M143T" meaning 8 bases of molecular barcode (UMI) followed by
//! 143 bases of template sequence.
//!
//! # Example
//!
//! ```rust,ignore
//! use read_structure::ReadStructure;
//! use fgumi_simd_fastq::SimdFastqReader;
//! use std::fs::File;
//! use std::io::BufReader;
//!
//! let rs = ReadStructure::from_str("8M143T").unwrap();
//! let file = File::open("reads.fq").unwrap();
//! let reader = SimdFastqReader::new(
//!     Box::new(BufReader::new(file)) as Box<dyn std::io::BufRead + Send>
//! );
//! let mut iter = ReadSetIterator::new(rs, reader, vec![]);
//!
//! for read_set in iter {
//!     // Process each read set...
//! }
//! ```

use anyhow::{Result, anyhow};
use fgumi_simd_fastq::SimdFastqReader;
use read_structure::ReadStructure;
use read_structure::SegmentType;
use std::fmt::Display;
use std::io::BufRead;
use std::iter::Filter;
use std::slice::Iter;
use std::str::FromStr;

/// Type alias for segment type iter functions, which iterate over the segments of a `FastqSet`
/// filtering for a specific type.
type SegmentIter<'a> = Filter<Iter<'a, FastqSegment>, fn(&&FastqSegment) -> bool>;

/// A segment of a FASTQ record representing bases and qualities of a specific type.
///
/// FASTQ records can be divided into segments based on their biological function
/// (template, UMI, barcode, etc.). Each segment contains the bases, quality scores,
/// and the segment type.
#[derive(PartialEq, Eq, Debug, Clone)]
pub struct FastqSegment {
    /// The nucleotide bases of this segment (A, C, G, T, N)
    pub seq: Vec<u8>,
    /// Phred-scaled quality scores (typically ASCII-encoded as Phred+33)
    pub quals: Vec<u8>,
    /// The biological/functional type of this segment
    pub segment_type: SegmentType,
}

////////////////////////////////////////////////////////////////////////////////
// FastqSet and its impls
////////////////////////////////////////////////////////////////////////////////

/// Reasons why a read might be skipped during processing.
///
/// When processing FASTQ reads with read structures, some reads may not meet
/// the minimum requirements and need to be skipped. This enum categorizes the
/// reasons for skipping.
#[derive(Eq, Hash, PartialEq, Debug, Clone, Copy)]
pub enum SkipReason {
    /// The read had too few bases for the required segments in the read structure
    TooFewBases,
}

impl Display for SkipReason {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            SkipReason::TooFewBases => write!(f, "Too few bases"),
        }
    }
}

impl FromStr for SkipReason {
    type Err = anyhow::Error;
    fn from_str(string: &str) -> Result<Self, Self::Err> {
        match string {
            "too few bases" | "too-few-bases" | "toofewbases" => Ok(SkipReason::TooFewBases),
            _ => Err(anyhow!(
                "Invalid skip reason: '{string}' (valid values: 'too-few-bases', 'toofewbases')"
            )),
        }
    }
}

/// A FASTQ record parsed into its component segments according to a read structure.
///
/// When reading FASTQ files with complex structures (e.g., containing UMIs, barcodes,
/// and template sequences), this structure holds the parsed segments along with the
/// original header. Each segment is classified by type (template, molecular barcode,
/// sample barcode, etc.).
///
/// # Fields
///
/// * `header` - The original FASTQ header line (without the '@' prefix)
/// * `segments` - Vector of parsed segments in order
/// * `skip_reason` - If Some, indicates why this read should be skipped during processing
#[derive(PartialEq, Debug, Clone)]
pub struct FastqSet {
    /// The FASTQ header line (without '@' prefix)
    pub header: Vec<u8>,
    /// Ordered list of parsed segments from this read
    pub segments: Vec<FastqSegment>,
    /// Reason for skipping this read, or None if the read should be processed
    pub skip_reason: Option<SkipReason>,
}

impl FastqSet {
    /// Returns an iterator over template segments.
    ///
    /// Template segments contain the actual biological sequence to be analyzed
    /// (as opposed to barcodes or UMIs).
    ///
    /// # Returns
    ///
    /// Iterator over references to template segments in this read set
    pub fn template_segments(&self) -> SegmentIter<'_> {
        self.segments.iter().filter(|s| s.segment_type == SegmentType::Template)
    }

    /// Returns an iterator over sample barcode segments.
    ///
    /// Sample barcodes identify which sample/library a read belongs to in
    /// multiplexed sequencing runs.
    ///
    /// # Returns
    ///
    /// Iterator over references to sample barcode segments
    pub fn sample_barcode_segments(&self) -> SegmentIter<'_> {
        self.segments.iter().filter(|s| s.segment_type == SegmentType::SampleBarcode)
    }

    /// Returns an iterator over molecular barcode (UMI) segments.
    ///
    /// Molecular barcodes (UMIs) uniquely tag individual molecules for
    /// duplicate detection and quantification.
    ///
    /// # Returns
    ///
    /// Iterator over references to molecular barcode segments
    pub fn molecular_barcode_segments(&self) -> SegmentIter<'_> {
        self.segments.iter().filter(|s| s.segment_type == SegmentType::MolecularBarcode)
    }

    /// Returns an iterator over cell barcode segments.
    ///
    /// Cell barcodes identify which cell a read originated from in
    /// single-cell sequencing experiments.
    ///
    /// # Returns
    ///
    /// Iterator over references to cell barcode segments
    pub fn cell_barcode_segments(&self) -> SegmentIter<'_> {
        self.segments.iter().filter(|s| s.segment_type == SegmentType::CellularBarcode)
    }

    /// Combines multiple `FastqSet` instances into a single set.
    ///
    /// Merges the segments from multiple read sets, preserving the header from
    /// the first set. Useful for combining segments from multiple FASTQ files
    /// that represent different parts of the same read (e.g., R1, R2, I1, I2).
    ///
    /// # Arguments
    ///
    /// * `readsets` - Vector of read sets to combine
    ///
    /// # Returns
    ///
    /// A new `FastqSet` with all segments combined
    ///
    /// # Panics
    ///
    /// Panics if the input vector is empty or contains no segments
    #[must_use]
    pub fn combine_readsets(readsets: Vec<Self>) -> Self {
        let total_segments: usize = readsets.iter().map(|s| s.segments.len()).sum();
        assert!(total_segments > 0, "Cannot call combine readsets on an empty vec!");

        let mut readset_iter = readsets.into_iter();
        let mut first = readset_iter
            .next()
            .expect("readset_iter is non-empty because total_segments > 0 was checked");
        first.segments.reserve_exact(total_segments - first.segments.len());

        for next_readset in readset_iter {
            first.segments.extend(next_readset.segments);
        }

        first
    }

    /// Creates a `FastqSet` from raw FASTQ record data and a read structure.
    ///
    /// This method applies the read structure to segment the sequence and quality
    /// data, producing a `FastqSet` with appropriately typed segments.
    ///
    /// # Arguments
    ///
    /// * `header` - The FASTQ header/name (without '@' prefix)
    /// * `sequence` - The full sequence bases
    /// * `quality` - The full quality scores
    /// * `read_structure` - The read structure describing segment layout
    /// * `skip_reasons` - List of reasons to skip reads gracefully instead of failing
    ///
    /// # Returns
    ///
    /// A `FastqSet` with segments parsed according to the read structure.
    /// If the read doesn't have enough bases and `TooFewBases` is in `skip_reasons`,
    /// returns a `FastqSet` with `skip_reason` set.
    ///
    /// # Errors
    ///
    /// Returns an error if the read has too few bases and `TooFewBases` is not in
    /// `skip_reasons`, or if base/quality extraction fails for any segment.
    pub fn from_record_with_structure(
        header: &[u8],
        sequence: &[u8],
        quality: &[u8],
        read_structure: &ReadStructure,
        skip_reasons: &[SkipReason],
    ) -> anyhow::Result<Self> {
        let mut segments = Vec::with_capacity(read_structure.number_of_segments());

        // Check minimum length required by the read structure
        let min_len: usize = read_structure.iter().map(|s| s.length().unwrap_or(0)).sum();
        if sequence.len() < min_len {
            if skip_reasons.contains(&SkipReason::TooFewBases) {
                return Ok(Self {
                    header: header.to_vec(),
                    segments: vec![],
                    skip_reason: Some(SkipReason::TooFewBases),
                });
            }
            let read_name_str = String::from_utf8_lossy(header);
            anyhow::bail!(
                "Read {} had too few bases to demux {} vs. {} needed in read structure {}.",
                read_name_str,
                sequence.len(),
                min_len,
                read_structure
            );
        }

        // Extract segments according to the read structure
        for (segment_index, read_segment) in read_structure.iter().enumerate() {
            let (seq, quals) =
                read_segment.extract_bases_and_quals(sequence, quality).map_err(|e| {
                    let read_name_str = String::from_utf8_lossy(header);
                    anyhow::anyhow!(
                        "Error extracting bases (len: {}) or quals (len: {}) for the {}th \
                         segment ({}) in read structure ({}) from record {}; {}",
                        sequence.len(),
                        quality.len(),
                        segment_index,
                        read_segment,
                        read_structure,
                        read_name_str,
                        e
                    )
                })?;

            segments.push(FastqSegment {
                seq: seq.to_vec(),
                quals: quals.to_vec(),
                segment_type: read_segment.kind,
            });
        }

        Ok(Self { header: header.to_vec(), segments, skip_reason: None })
    }
}

////////////////////////////////////////////////////////////////////////////////
// ReadSetIterator and its impls
////////////////////////////////////////////////////////////////////////////////

/// Iterator for parsing FASTQ files according to a read structure.
///
/// This iterator reads FASTQ records and parses them into `FastqSet` objects based on
/// the provided read structure. It handles validation and can optionally skip reads that
/// don't meet requirements (e.g., too few bases) rather than panicking.
///
/// # Example
///
/// ```rust,ignore
/// let read_structure = ReadStructure::from_str("8M143T")?;
/// let file = File::open("reads.fq")?;
/// let reader = SimdFastqReader::new(
///     Box::new(BufReader::new(file)) as Box<dyn std::io::BufRead + Send>
/// );
/// let mut iterator = ReadSetIterator::new(read_structure, reader, vec![]);
///
/// for read_set in iterator {
///     // Process each parsed read set
/// }
/// ```
pub struct ReadSetIterator {
    /// Read structure describing the layout of bases in each read
    read_structure: ReadStructure,
    /// SIMD-accelerated FASTQ file reader
    source: SimdFastqReader<Box<dyn BufRead + Send>>,
    /// Reasons to skip reads instead of panicking (e.g., too few bases)
    skip_reasons: Vec<SkipReason>,
}

impl Iterator for ReadSetIterator {
    type Item = anyhow::Result<FastqSet>;

    fn next(&mut self) -> Option<Self::Item> {
        let rec = self.source.next()?;
        let record = match rec {
            Ok(record) => record,
            Err(e) => {
                return Some(Err(anyhow::Error::from(e).context("Error parsing FASTQ record")));
            }
        };
        Some(FastqSet::from_record_with_structure(
            &record.name,
            &record.sequence,
            &record.quality,
            &self.read_structure,
            &self.skip_reasons,
        ))
    }
}

impl ReadSetIterator {
    /// Creates a new iterator for parsing FASTQ records with a read structure.
    ///
    /// # Arguments
    ///
    /// * `read_structure` - Read structure describing the segment layout
    /// * `source` - FASTQ reader for input
    /// * `skip_reasons` - List of reasons to skip reads gracefully instead of panicking.
    ///   If a read fails for a reason not in this list, the iterator will panic.
    ///
    /// # Returns
    ///
    /// A new `ReadSetIterator` instance
    ///
    /// # Example
    ///
    /// ```rust,ignore
    /// // Skip reads that are too short instead of panicking
    /// let iterator = ReadSetIterator::new(
    ///     read_structure,
    ///     reader,
    ///     vec![SkipReason::TooFewBases]
    /// );
    /// ```
    #[must_use]
    pub fn new(
        read_structure: ReadStructure,
        source: SimdFastqReader<Box<dyn BufRead + Send>>,
        skip_reasons: Vec<SkipReason>,
    ) -> Self {
        Self { read_structure, source, skip_reasons }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    fn read_set(segments: Vec<FastqSegment>) -> FastqSet {
        FastqSet { header: "NOT_IMPORTANT".as_bytes().to_owned(), segments, skip_reason: None }
    }

    fn seg(bases: &[u8], segment_type: SegmentType) -> FastqSegment {
        let quals = vec![b'#'; bases.len()];
        FastqSegment { seq: bases.to_vec(), quals, segment_type }
    }

    #[test]
    fn test_template_segments() {
        let segments = vec![
            seg("AGCT".as_bytes(), SegmentType::SampleBarcode),
            seg("GATA".as_bytes(), SegmentType::Template),
            seg("CAC".as_bytes(), SegmentType::Template),
            seg("GACCCC".as_bytes(), SegmentType::MolecularBarcode),
        ];
        let expected = vec![
            seg("GATA".as_bytes(), SegmentType::Template),
            seg("CAC".as_bytes(), SegmentType::Template),
        ];

        let read_set = read_set(segments);

        assert_eq!(expected, read_set.template_segments().cloned().collect::<Vec<_>>());
    }
    #[test]
    fn test_sample_barcode_segments() {
        let segments = vec![
            seg("AGCT".as_bytes(), SegmentType::Template),
            seg("GATA".as_bytes(), SegmentType::SampleBarcode),
            seg("CAC".as_bytes(), SegmentType::SampleBarcode),
            seg("GACCCC".as_bytes(), SegmentType::MolecularBarcode),
        ];
        let expected = vec![
            seg("GATA".as_bytes(), SegmentType::SampleBarcode),
            seg("CAC".as_bytes(), SegmentType::SampleBarcode),
        ];

        let read_set = read_set(segments);

        assert_eq!(expected, read_set.sample_barcode_segments().cloned().collect::<Vec<_>>());
    }
    #[test]
    fn test_molecular_barcode_segments() {
        let segments = vec![
            seg("AGCT".as_bytes(), SegmentType::Template),
            seg("GATA".as_bytes(), SegmentType::MolecularBarcode),
            seg("CAC".as_bytes(), SegmentType::MolecularBarcode),
            seg("GACCCC".as_bytes(), SegmentType::SampleBarcode),
        ];
        let expected = vec![
            seg("GATA".as_bytes(), SegmentType::MolecularBarcode),
            seg("CAC".as_bytes(), SegmentType::MolecularBarcode),
        ];

        let read_set = read_set(segments);

        assert_eq!(expected, read_set.molecular_barcode_segments().cloned().collect::<Vec<_>>());
    }

    #[test]
    fn test_combine_readsets() {
        let segments1 = vec![
            seg("A".as_bytes(), SegmentType::Template),
            seg("G".as_bytes(), SegmentType::Template),
            seg("C".as_bytes(), SegmentType::MolecularBarcode),
            seg("T".as_bytes(), SegmentType::SampleBarcode),
        ];
        let read_set1 = read_set(segments1.clone());
        let segments2 = vec![
            seg("AA".as_bytes(), SegmentType::Template),
            seg("AG".as_bytes(), SegmentType::Template),
            seg("AC".as_bytes(), SegmentType::MolecularBarcode),
            seg("AT".as_bytes(), SegmentType::SampleBarcode),
        ];
        let read_set2 = read_set(segments2.clone());
        let segments3 = vec![
            seg("AAA".as_bytes(), SegmentType::Template),
            seg("AAG".as_bytes(), SegmentType::Template),
            seg("AAC".as_bytes(), SegmentType::MolecularBarcode),
            seg("AAT".as_bytes(), SegmentType::SampleBarcode),
        ];
        let read_set3 = read_set(segments3.clone());

        let mut expected_segments = Vec::new();
        expected_segments.extend(segments1);
        expected_segments.extend(segments2);
        expected_segments.extend(segments3);
        let expected = read_set(expected_segments);

        let result = FastqSet::combine_readsets(vec![read_set1, read_set2, read_set3]);

        assert_eq!(result, expected);
    }

    #[test]
    #[should_panic(expected = "Cannot call combine readsets on an empty vec!")]
    fn test_combine_readsets_fails_on_empty_vector() {
        let _result = FastqSet::combine_readsets(Vec::new());
    }

    // =====================================================================
    // Cell barcode segment tests
    // =====================================================================

    #[test]
    fn test_cell_barcode_segments() {
        let segments = vec![
            seg("AGCT".as_bytes(), SegmentType::Template),
            seg("GATA".as_bytes(), SegmentType::CellularBarcode),
            seg("CAC".as_bytes(), SegmentType::CellularBarcode),
            seg("GACCCC".as_bytes(), SegmentType::MolecularBarcode),
        ];
        let expected = vec![
            seg("GATA".as_bytes(), SegmentType::CellularBarcode),
            seg("CAC".as_bytes(), SegmentType::CellularBarcode),
        ];

        let read_set = read_set(segments);

        assert_eq!(expected, read_set.cell_barcode_segments().cloned().collect::<Vec<_>>());
    }

    #[test]
    fn test_cell_barcode_segments_empty() {
        let segments = vec![
            seg("AGCT".as_bytes(), SegmentType::Template),
            seg("GATA".as_bytes(), SegmentType::MolecularBarcode),
        ];

        let read_set = read_set(segments);

        assert_eq!(0, read_set.cell_barcode_segments().count());
    }

    // =====================================================================
    // SkipReason Display and FromStr tests
    // =====================================================================

    #[test]
    fn test_skip_reason_display() {
        assert_eq!(SkipReason::TooFewBases.to_string(), "Too few bases");
    }

    #[test]
    fn test_skip_reason_from_str_valid() {
        // Test various valid forms
        assert_eq!(
            SkipReason::from_str("too few bases")
                .expect("parsing \"too few bases\" should succeed"),
            SkipReason::TooFewBases
        );
        assert_eq!(
            SkipReason::from_str("too-few-bases")
                .expect("parsing \"too-few-bases\" should succeed"),
            SkipReason::TooFewBases
        );
        assert_eq!(
            SkipReason::from_str("toofewbases").expect("parsing \"toofewbases\" should succeed"),
            SkipReason::TooFewBases
        );
    }

    #[test]
    fn test_skip_reason_from_str_invalid() {
        let result = SkipReason::from_str("invalid-reason");
        assert!(result.is_err());
        let err = result.unwrap_err();
        assert!(err.to_string().contains("Invalid skip reason"));
        assert!(err.to_string().contains("invalid-reason"));
    }

    // =====================================================================
    // ReadSetIterator tests
    // =====================================================================

    #[test]
    fn test_read_set_iterator_basic() {
        use std::io::Cursor;

        // Create a simple FASTQ record
        let fastq_data = b"@read1\nACGTACGT\n+\nIIIIIIII\n";
        let cursor = Cursor::new(fastq_data.to_vec());
        let reader = SimdFastqReader::new(Box::new(cursor) as Box<dyn BufRead + Send>);

        // Simple read structure: 4M (molecular barcode) + 4T (template)
        let read_structure =
            ReadStructure::from_str("4M4T").expect("parsing \"4M4T\" should succeed");
        let mut iterator = ReadSetIterator::new(read_structure, reader, vec![]);

        let result = iterator.next();
        assert!(result.is_some());

        let fastq_set =
            result.expect("iterator should yield a record").expect("reading record should succeed");
        assert_eq!(fastq_set.header, b"read1");
        assert_eq!(fastq_set.segments.len(), 2);
        assert!(fastq_set.skip_reason.is_none());

        // First segment should be molecular barcode
        assert_eq!(fastq_set.segments[0].seq, b"ACGT");
        assert_eq!(fastq_set.segments[0].segment_type, SegmentType::MolecularBarcode);

        // Second segment should be template
        assert_eq!(fastq_set.segments[1].seq, b"ACGT");
        assert_eq!(fastq_set.segments[1].segment_type, SegmentType::Template);

        // No more records
        assert!(iterator.next().is_none());
    }

    #[test]
    fn test_read_set_iterator_skip_too_few_bases() {
        use std::io::Cursor;

        // Create a FASTQ record that's too short for the read structure
        let fastq_data = b"@read1\nACGT\n+\nIIII\n";
        let cursor = Cursor::new(fastq_data.to_vec());
        let reader = SimdFastqReader::new(Box::new(cursor) as Box<dyn BufRead + Send>);

        // Read structure requires 10 bases total
        let read_structure =
            ReadStructure::from_str("4M6T").expect("parsing \"4M6T\" should succeed");
        let mut iterator =
            ReadSetIterator::new(read_structure, reader, vec![SkipReason::TooFewBases]);

        let result = iterator.next();
        assert!(result.is_some());

        let fastq_set =
            result.expect("iterator should yield a record").expect("reading record should succeed");
        assert_eq!(fastq_set.header, b"read1");
        assert!(fastq_set.segments.is_empty());
        assert_eq!(fastq_set.skip_reason, Some(SkipReason::TooFewBases));
    }

    #[test]
    fn test_read_set_iterator_error_on_too_few_bases_without_skip() {
        use std::io::Cursor;

        // Create a FASTQ record that's too short
        let fastq_data = b"@read1\nACGT\n+\nIIII\n";
        let cursor = Cursor::new(fastq_data.to_vec());
        let reader = SimdFastqReader::new(Box::new(cursor) as Box<dyn BufRead + Send>);

        // Read structure requires 10 bases total
        let read_structure =
            ReadStructure::from_str("4M6T").expect("parsing \"4M6T\" should succeed");
        let mut iterator = ReadSetIterator::new(read_structure, reader, vec![]);

        // This should return an error because TooFewBases is not in skip_reasons
        let result = iterator.next();
        assert!(result.is_some());
        let err = result.unwrap().unwrap_err();
        assert!(err.to_string().contains("too few bases"));
    }

    #[test]
    fn test_read_set_iterator_multiple_records() {
        use std::io::Cursor;

        let fastq_data = b"@read1\nACGTAAAA\n+\nIIIIIIII\n@read2\nTGCATTTT\n+\nIIIIIIII\n";
        let cursor = Cursor::new(fastq_data.to_vec());
        let reader = SimdFastqReader::new(Box::new(cursor) as Box<dyn BufRead + Send>);

        let read_structure =
            ReadStructure::from_str("4M4T").expect("parsing \"4M4T\" should succeed");
        let mut iterator = ReadSetIterator::new(read_structure, reader, vec![]);

        // First record
        let first = iterator
            .next()
            .expect("iterator should yield first record")
            .expect("reading first record should succeed");
        assert_eq!(first.header, b"read1");
        assert_eq!(first.segments[0].seq, b"ACGT");
        assert_eq!(first.segments[1].seq, b"AAAA");

        // Second record
        let second = iterator
            .next()
            .expect("iterator should yield second record")
            .expect("reading second record should succeed");
        assert_eq!(second.header, b"read2");
        assert_eq!(second.segments[0].seq, b"TGCA");
        assert_eq!(second.segments[1].seq, b"TTTT");

        // No more records
        assert!(iterator.next().is_none());
    }

    #[test]
    fn test_read_set_iterator_variable_length_segment() {
        use std::io::Cursor;

        // Test with variable-length template segment (last segment gets remaining bases)
        let fastq_data = b"@read1\nACGTTTTTTTTT\n+\nIIIIIIIIIIII\n";
        let cursor = Cursor::new(fastq_data.to_vec());
        let reader = SimdFastqReader::new(Box::new(cursor) as Box<dyn BufRead + Send>);

        // 4M + variable T (remaining bases go to template)
        let read_structure =
            ReadStructure::from_str("4M+T").expect("parsing \"4M+T\" should succeed");
        let mut iterator = ReadSetIterator::new(read_structure, reader, vec![]);

        let result = iterator
            .next()
            .expect("iterator should yield a record")
            .expect("reading record should succeed");
        assert_eq!(result.segments.len(), 2);

        // Fixed molecular barcode segment
        assert_eq!(result.segments[0].seq, b"ACGT");
        assert_eq!(result.segments[0].segment_type, SegmentType::MolecularBarcode);

        // Variable template gets remaining 8 bases
        assert_eq!(result.segments[1].seq, b"TTTTTTTT");
        assert_eq!(result.segments[1].segment_type, SegmentType::Template);
    }

    #[test]
    fn test_fastq_set_with_skip_reason_only() {
        // Test creating a FastqSet with skip_reason and empty segments
        let fastq_set = FastqSet {
            header: b"skipped_read".to_vec(),
            segments: vec![],
            skip_reason: Some(SkipReason::TooFewBases),
        };

        assert_eq!(fastq_set.skip_reason, Some(SkipReason::TooFewBases));
        assert!(fastq_set.segments.is_empty());
        assert_eq!(fastq_set.template_segments().count(), 0);
        assert_eq!(fastq_set.molecular_barcode_segments().count(), 0);
        assert_eq!(fastq_set.sample_barcode_segments().count(), 0);
        assert_eq!(fastq_set.cell_barcode_segments().count(), 0);
    }
}