1use anyhow::{Result, anyhow};
33use read_structure::ReadStructure;
34use read_structure::SegmentType;
35use seq_io::fastq::Reader as FastqReader;
36use seq_io::fastq::Record;
37use std::fmt::Display;
38use std::io::BufRead;
39use std::iter::Filter;
40use std::slice::Iter;
41use std::str::FromStr;
42
43type SegmentIter<'a> = Filter<Iter<'a, FastqSegment>, fn(&&FastqSegment) -> bool>;
46
47#[derive(PartialEq, Eq, Debug, Clone)]
53pub struct FastqSegment {
54 pub seq: Vec<u8>,
56 pub quals: Vec<u8>,
58 pub segment_type: SegmentType,
60}
61
62#[derive(Eq, Hash, PartialEq, Debug, Clone, Copy)]
72pub enum SkipReason {
73 TooFewBases,
75}
76
77impl Display for SkipReason {
78 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
79 match self {
80 SkipReason::TooFewBases => write!(f, "Too few bases"),
81 }
82 }
83}
84
85impl FromStr for SkipReason {
86 type Err = anyhow::Error;
87 fn from_str(string: &str) -> Result<Self, Self::Err> {
88 match string {
89 "too few bases" | "too-few-bases" | "toofewbases" => Ok(SkipReason::TooFewBases),
90 _ => Err(anyhow!(
91 "Invalid skip reason: '{string}' (valid values: 'too-few-bases', 'toofewbases')"
92 )),
93 }
94 }
95}
96
97#[derive(PartialEq, Debug, Clone)]
110pub struct FastqSet {
111 pub header: Vec<u8>,
113 pub segments: Vec<FastqSegment>,
115 pub skip_reason: Option<SkipReason>,
117}
118
119impl FastqSet {
120 pub fn template_segments(&self) -> SegmentIter<'_> {
129 self.segments.iter().filter(|s| s.segment_type == SegmentType::Template)
130 }
131
132 pub fn sample_barcode_segments(&self) -> SegmentIter<'_> {
141 self.segments.iter().filter(|s| s.segment_type == SegmentType::SampleBarcode)
142 }
143
144 pub fn molecular_barcode_segments(&self) -> SegmentIter<'_> {
153 self.segments.iter().filter(|s| s.segment_type == SegmentType::MolecularBarcode)
154 }
155
156 pub fn cell_barcode_segments(&self) -> SegmentIter<'_> {
165 self.segments.iter().filter(|s| s.segment_type == SegmentType::CellularBarcode)
166 }
167
168 #[must_use]
186 pub fn combine_readsets(readsets: Vec<Self>) -> Self {
187 let total_segments: usize = readsets.iter().map(|s| s.segments.len()).sum();
188 assert!(total_segments > 0, "Cannot call combine readsets on an empty vec!");
189
190 let mut readset_iter = readsets.into_iter();
191 let mut first = readset_iter
192 .next()
193 .expect("readset_iter is non-empty because total_segments > 0 was checked");
194 first.segments.reserve_exact(total_segments - first.segments.len());
195
196 for next_readset in readset_iter {
197 first.segments.extend(next_readset.segments);
198 }
199
200 first
201 }
202
203 #[must_use]
226 pub fn from_record_with_structure(
227 header: &[u8],
228 sequence: &[u8],
229 quality: &[u8],
230 read_structure: &ReadStructure,
231 skip_reasons: &[SkipReason],
232 ) -> Self {
233 let mut segments = Vec::with_capacity(read_structure.number_of_segments());
234
235 let min_len: usize = read_structure.iter().map(|s| s.length().unwrap_or(0)).sum();
237 if sequence.len() < min_len {
238 if skip_reasons.contains(&SkipReason::TooFewBases) {
239 return Self {
240 header: header.to_vec(),
241 segments: vec![],
242 skip_reason: Some(SkipReason::TooFewBases),
243 };
244 }
245 let read_name_str = String::from_utf8_lossy(header);
246 panic!(
247 "Read {} had too few bases to demux {} vs. {} needed in read structure {}.",
248 read_name_str,
249 sequence.len(),
250 min_len,
251 read_structure
252 );
253 }
254
255 for (segment_index, read_segment) in read_structure.iter().enumerate() {
257 let (seq, quals) =
258 read_segment.extract_bases_and_quals(sequence, quality).unwrap_or_else(|e| {
259 let read_name_str = String::from_utf8_lossy(header);
260 panic!(
261 "Error extracting bases (len: {}) or quals (len: {}) for the {}th \
262 segment ({}) in read structure ({}) from record {}; {}",
263 sequence.len(),
264 quality.len(),
265 segment_index,
266 read_segment,
267 read_structure,
268 read_name_str,
269 e
270 );
271 });
272
273 segments.push(FastqSegment {
274 seq: seq.to_vec(),
275 quals: quals.to_vec(),
276 segment_type: read_segment.kind,
277 });
278 }
279
280 Self { header: header.to_vec(), segments, skip_reason: None }
281 }
282}
283
284pub struct ReadSetIterator {
307 read_structure: ReadStructure,
309 source: FastqReader<Box<dyn BufRead + Send>>,
311 skip_reasons: Vec<SkipReason>,
313}
314
315impl Iterator for ReadSetIterator {
316 type Item = FastqSet;
317
318 fn next(&mut self) -> Option<Self::Item> {
319 let rec = self.source.next()?;
320 let record = match rec {
321 Ok(record) => record,
322 Err(e) => {
323 panic!("Error parsing FASTQ record: {e}");
324 }
325 };
326 Some(FastqSet::from_record_with_structure(
327 record.head(),
328 record.seq(),
329 record.qual(),
330 &self.read_structure,
331 &self.skip_reasons,
332 ))
333 }
334}
335
336impl ReadSetIterator {
337 #[must_use]
361 pub fn new(
362 read_structure: ReadStructure,
363 source: FastqReader<Box<dyn BufRead + Send>>,
364 skip_reasons: Vec<SkipReason>,
365 ) -> Self {
366 Self { read_structure, source, skip_reasons }
367 }
368}
369
370#[cfg(test)]
371mod tests {
372 use super::*;
373
374 fn read_set(segments: Vec<FastqSegment>) -> FastqSet {
375 FastqSet { header: "NOT_IMPORTANT".as_bytes().to_owned(), segments, skip_reason: None }
376 }
377
378 fn seg(bases: &[u8], segment_type: SegmentType) -> FastqSegment {
379 let quals = vec![b'#'; bases.len()];
380 FastqSegment { seq: bases.to_vec(), quals, segment_type }
381 }
382
383 #[test]
384 fn test_template_segments() {
385 let segments = vec![
386 seg("AGCT".as_bytes(), SegmentType::SampleBarcode),
387 seg("GATA".as_bytes(), SegmentType::Template),
388 seg("CAC".as_bytes(), SegmentType::Template),
389 seg("GACCCC".as_bytes(), SegmentType::MolecularBarcode),
390 ];
391 let expected = vec![
392 seg("GATA".as_bytes(), SegmentType::Template),
393 seg("CAC".as_bytes(), SegmentType::Template),
394 ];
395
396 let read_set = read_set(segments);
397
398 assert_eq!(expected, read_set.template_segments().cloned().collect::<Vec<_>>());
399 }
400 #[test]
401 fn test_sample_barcode_segments() {
402 let segments = vec![
403 seg("AGCT".as_bytes(), SegmentType::Template),
404 seg("GATA".as_bytes(), SegmentType::SampleBarcode),
405 seg("CAC".as_bytes(), SegmentType::SampleBarcode),
406 seg("GACCCC".as_bytes(), SegmentType::MolecularBarcode),
407 ];
408 let expected = vec![
409 seg("GATA".as_bytes(), SegmentType::SampleBarcode),
410 seg("CAC".as_bytes(), SegmentType::SampleBarcode),
411 ];
412
413 let read_set = read_set(segments);
414
415 assert_eq!(expected, read_set.sample_barcode_segments().cloned().collect::<Vec<_>>());
416 }
417 #[test]
418 fn test_molecular_barcode_segments() {
419 let segments = vec![
420 seg("AGCT".as_bytes(), SegmentType::Template),
421 seg("GATA".as_bytes(), SegmentType::MolecularBarcode),
422 seg("CAC".as_bytes(), SegmentType::MolecularBarcode),
423 seg("GACCCC".as_bytes(), SegmentType::SampleBarcode),
424 ];
425 let expected = vec![
426 seg("GATA".as_bytes(), SegmentType::MolecularBarcode),
427 seg("CAC".as_bytes(), SegmentType::MolecularBarcode),
428 ];
429
430 let read_set = read_set(segments);
431
432 assert_eq!(expected, read_set.molecular_barcode_segments().cloned().collect::<Vec<_>>());
433 }
434
435 #[test]
436 fn test_combine_readsets() {
437 let segments1 = vec![
438 seg("A".as_bytes(), SegmentType::Template),
439 seg("G".as_bytes(), SegmentType::Template),
440 seg("C".as_bytes(), SegmentType::MolecularBarcode),
441 seg("T".as_bytes(), SegmentType::SampleBarcode),
442 ];
443 let read_set1 = read_set(segments1.clone());
444 let segments2 = vec![
445 seg("AA".as_bytes(), SegmentType::Template),
446 seg("AG".as_bytes(), SegmentType::Template),
447 seg("AC".as_bytes(), SegmentType::MolecularBarcode),
448 seg("AT".as_bytes(), SegmentType::SampleBarcode),
449 ];
450 let read_set2 = read_set(segments2.clone());
451 let segments3 = vec![
452 seg("AAA".as_bytes(), SegmentType::Template),
453 seg("AAG".as_bytes(), SegmentType::Template),
454 seg("AAC".as_bytes(), SegmentType::MolecularBarcode),
455 seg("AAT".as_bytes(), SegmentType::SampleBarcode),
456 ];
457 let read_set3 = read_set(segments3.clone());
458
459 let mut expected_segments = Vec::new();
460 expected_segments.extend(segments1);
461 expected_segments.extend(segments2);
462 expected_segments.extend(segments3);
463 let expected = read_set(expected_segments);
464
465 let result = FastqSet::combine_readsets(vec![read_set1, read_set2, read_set3]);
466
467 assert_eq!(result, expected);
468 }
469
470 #[test]
471 #[should_panic(expected = "Cannot call combine readsets on an empty vec!")]
472 fn test_combine_readsets_fails_on_empty_vector() {
473 let _result = FastqSet::combine_readsets(Vec::new());
474 }
475
476 #[test]
481 fn test_cell_barcode_segments() {
482 let segments = vec![
483 seg("AGCT".as_bytes(), SegmentType::Template),
484 seg("GATA".as_bytes(), SegmentType::CellularBarcode),
485 seg("CAC".as_bytes(), SegmentType::CellularBarcode),
486 seg("GACCCC".as_bytes(), SegmentType::MolecularBarcode),
487 ];
488 let expected = vec![
489 seg("GATA".as_bytes(), SegmentType::CellularBarcode),
490 seg("CAC".as_bytes(), SegmentType::CellularBarcode),
491 ];
492
493 let read_set = read_set(segments);
494
495 assert_eq!(expected, read_set.cell_barcode_segments().cloned().collect::<Vec<_>>());
496 }
497
498 #[test]
499 fn test_cell_barcode_segments_empty() {
500 let segments = vec![
501 seg("AGCT".as_bytes(), SegmentType::Template),
502 seg("GATA".as_bytes(), SegmentType::MolecularBarcode),
503 ];
504
505 let read_set = read_set(segments);
506
507 assert_eq!(0, read_set.cell_barcode_segments().count());
508 }
509
510 #[test]
515 fn test_skip_reason_display() {
516 assert_eq!(SkipReason::TooFewBases.to_string(), "Too few bases");
517 }
518
519 #[test]
520 fn test_skip_reason_from_str_valid() {
521 assert_eq!(SkipReason::from_str("too few bases").unwrap(), SkipReason::TooFewBases);
523 assert_eq!(SkipReason::from_str("too-few-bases").unwrap(), SkipReason::TooFewBases);
524 assert_eq!(SkipReason::from_str("toofewbases").unwrap(), SkipReason::TooFewBases);
525 }
526
527 #[test]
528 fn test_skip_reason_from_str_invalid() {
529 let result = SkipReason::from_str("invalid-reason");
530 assert!(result.is_err());
531 let err = result.unwrap_err();
532 assert!(err.to_string().contains("Invalid skip reason"));
533 assert!(err.to_string().contains("invalid-reason"));
534 }
535
536 #[test]
541 fn test_read_set_iterator_basic() {
542 use std::io::Cursor;
543
544 let fastq_data = b"@read1\nACGTACGT\n+\nIIIIIIII\n";
546 let cursor = Cursor::new(fastq_data.to_vec());
547 let reader: FastqReader<Box<dyn BufRead + Send>> = FastqReader::new(Box::new(cursor));
548
549 let read_structure = ReadStructure::from_str("4M4T").unwrap();
551 let mut iterator = ReadSetIterator::new(read_structure, reader, vec![]);
552
553 let result = iterator.next();
554 assert!(result.is_some());
555
556 let fastq_set = result.unwrap();
557 assert_eq!(fastq_set.header, b"read1");
558 assert_eq!(fastq_set.segments.len(), 2);
559 assert!(fastq_set.skip_reason.is_none());
560
561 assert_eq!(fastq_set.segments[0].seq, b"ACGT");
563 assert_eq!(fastq_set.segments[0].segment_type, SegmentType::MolecularBarcode);
564
565 assert_eq!(fastq_set.segments[1].seq, b"ACGT");
567 assert_eq!(fastq_set.segments[1].segment_type, SegmentType::Template);
568
569 assert!(iterator.next().is_none());
571 }
572
573 #[test]
574 fn test_read_set_iterator_skip_too_few_bases() {
575 use std::io::Cursor;
576
577 let fastq_data = b"@read1\nACGT\n+\nIIII\n";
579 let cursor = Cursor::new(fastq_data.to_vec());
580 let reader: FastqReader<Box<dyn BufRead + Send>> = FastqReader::new(Box::new(cursor));
581
582 let read_structure = ReadStructure::from_str("4M6T").unwrap();
584 let mut iterator =
585 ReadSetIterator::new(read_structure, reader, vec![SkipReason::TooFewBases]);
586
587 let result = iterator.next();
588 assert!(result.is_some());
589
590 let fastq_set = result.unwrap();
591 assert_eq!(fastq_set.header, b"read1");
592 assert!(fastq_set.segments.is_empty());
593 assert_eq!(fastq_set.skip_reason, Some(SkipReason::TooFewBases));
594 }
595
596 #[test]
597 #[should_panic(expected = "too few bases")]
598 fn test_read_set_iterator_panic_on_too_few_bases_without_skip() {
599 use std::io::Cursor;
600
601 let fastq_data = b"@read1\nACGT\n+\nIIII\n";
603 let cursor = Cursor::new(fastq_data.to_vec());
604 let reader: FastqReader<Box<dyn BufRead + Send>> = FastqReader::new(Box::new(cursor));
605
606 let read_structure = ReadStructure::from_str("4M6T").unwrap();
608 let mut iterator = ReadSetIterator::new(read_structure, reader, vec![]);
609
610 let _ = iterator.next();
612 }
613
614 #[test]
615 fn test_read_set_iterator_multiple_records() {
616 use std::io::Cursor;
617
618 let fastq_data = b"@read1\nACGTAAAA\n+\nIIIIIIII\n@read2\nTGCATTTT\n+\nIIIIIIII\n";
619 let cursor = Cursor::new(fastq_data.to_vec());
620 let reader: FastqReader<Box<dyn BufRead + Send>> = FastqReader::new(Box::new(cursor));
621
622 let read_structure = ReadStructure::from_str("4M4T").unwrap();
623 let mut iterator = ReadSetIterator::new(read_structure, reader, vec![]);
624
625 let first = iterator.next().unwrap();
627 assert_eq!(first.header, b"read1");
628 assert_eq!(first.segments[0].seq, b"ACGT");
629 assert_eq!(first.segments[1].seq, b"AAAA");
630
631 let second = iterator.next().unwrap();
633 assert_eq!(second.header, b"read2");
634 assert_eq!(second.segments[0].seq, b"TGCA");
635 assert_eq!(second.segments[1].seq, b"TTTT");
636
637 assert!(iterator.next().is_none());
639 }
640
641 #[test]
642 fn test_read_set_iterator_variable_length_segment() {
643 use std::io::Cursor;
644
645 let fastq_data = b"@read1\nACGTTTTTTTTT\n+\nIIIIIIIIIIII\n";
647 let cursor = Cursor::new(fastq_data.to_vec());
648 let reader: FastqReader<Box<dyn BufRead + Send>> = FastqReader::new(Box::new(cursor));
649
650 let read_structure = ReadStructure::from_str("4M+T").unwrap();
652 let mut iterator = ReadSetIterator::new(read_structure, reader, vec![]);
653
654 let result = iterator.next().unwrap();
655 assert_eq!(result.segments.len(), 2);
656
657 assert_eq!(result.segments[0].seq, b"ACGT");
659 assert_eq!(result.segments[0].segment_type, SegmentType::MolecularBarcode);
660
661 assert_eq!(result.segments[1].seq, b"TTTTTTTT");
663 assert_eq!(result.segments[1].segment_type, SegmentType::Template);
664 }
665
666 #[test]
667 fn test_fastq_set_with_skip_reason_only() {
668 let fastq_set = FastqSet {
670 header: b"skipped_read".to_vec(),
671 segments: vec![],
672 skip_reason: Some(SkipReason::TooFewBases),
673 };
674
675 assert_eq!(fastq_set.skip_reason, Some(SkipReason::TooFewBases));
676 assert!(fastq_set.segments.is_empty());
677 assert_eq!(fastq_set.template_segments().count(), 0);
678 assert_eq!(fastq_set.molecular_barcode_segments().count(), 0);
679 assert_eq!(fastq_set.sample_barcode_segments().count(), 0);
680 assert_eq!(fastq_set.cell_barcode_segments().count(), 0);
681 }
682}