use anyhow::{Result, anyhow};
use fgumi_simd_fastq::SimdFastqReader;
use read_structure::ReadStructure;
use read_structure::SegmentType;
use std::fmt::Display;
use std::io::BufRead;
use std::iter::Filter;
use std::slice::Iter;
use std::str::FromStr;
type SegmentIter<'a> = Filter<Iter<'a, FastqSegment>, fn(&&FastqSegment) -> bool>;
#[derive(PartialEq, Eq, Debug, Clone)]
pub struct FastqSegment {
pub seq: Vec<u8>,
pub quals: Vec<u8>,
pub segment_type: SegmentType,
}
#[derive(Eq, Hash, PartialEq, Debug, Clone, Copy)]
pub enum SkipReason {
TooFewBases,
}
impl Display for SkipReason {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
SkipReason::TooFewBases => write!(f, "Too few bases"),
}
}
}
impl FromStr for SkipReason {
type Err = anyhow::Error;
fn from_str(string: &str) -> Result<Self, Self::Err> {
match string {
"too few bases" | "too-few-bases" | "toofewbases" => Ok(SkipReason::TooFewBases),
_ => Err(anyhow!(
"Invalid skip reason: '{string}' (valid values: 'too-few-bases', 'toofewbases')"
)),
}
}
}
#[derive(PartialEq, Debug, Clone)]
pub struct FastqSet {
pub header: Vec<u8>,
pub segments: Vec<FastqSegment>,
pub skip_reason: Option<SkipReason>,
}
impl FastqSet {
pub fn template_segments(&self) -> SegmentIter<'_> {
self.segments.iter().filter(|s| s.segment_type == SegmentType::Template)
}
pub fn sample_barcode_segments(&self) -> SegmentIter<'_> {
self.segments.iter().filter(|s| s.segment_type == SegmentType::SampleBarcode)
}
pub fn molecular_barcode_segments(&self) -> SegmentIter<'_> {
self.segments.iter().filter(|s| s.segment_type == SegmentType::MolecularBarcode)
}
pub fn cell_barcode_segments(&self) -> SegmentIter<'_> {
self.segments.iter().filter(|s| s.segment_type == SegmentType::CellularBarcode)
}
#[must_use]
pub fn combine_readsets(readsets: Vec<Self>) -> Self {
let total_segments: usize = readsets.iter().map(|s| s.segments.len()).sum();
assert!(total_segments > 0, "Cannot call combine readsets on an empty vec!");
let mut readset_iter = readsets.into_iter();
let mut first = readset_iter
.next()
.expect("readset_iter is non-empty because total_segments > 0 was checked");
first.segments.reserve_exact(total_segments - first.segments.len());
for next_readset in readset_iter {
first.segments.extend(next_readset.segments);
}
first
}
pub fn from_record_with_structure(
header: &[u8],
sequence: &[u8],
quality: &[u8],
read_structure: &ReadStructure,
skip_reasons: &[SkipReason],
) -> anyhow::Result<Self> {
let mut segments = Vec::with_capacity(read_structure.number_of_segments());
let min_len: usize = read_structure.iter().map(|s| s.length().unwrap_or(0)).sum();
if sequence.len() < min_len {
if skip_reasons.contains(&SkipReason::TooFewBases) {
return Ok(Self {
header: header.to_vec(),
segments: vec![],
skip_reason: Some(SkipReason::TooFewBases),
});
}
let read_name_str = String::from_utf8_lossy(header);
anyhow::bail!(
"Read {} had too few bases to demux {} vs. {} needed in read structure {}.",
read_name_str,
sequence.len(),
min_len,
read_structure
);
}
for (segment_index, read_segment) in read_structure.iter().enumerate() {
let (seq, quals) =
read_segment.extract_bases_and_quals(sequence, quality).map_err(|e| {
let read_name_str = String::from_utf8_lossy(header);
anyhow::anyhow!(
"Error extracting bases (len: {}) or quals (len: {}) for the {}th \
segment ({}) in read structure ({}) from record {}; {}",
sequence.len(),
quality.len(),
segment_index,
read_segment,
read_structure,
read_name_str,
e
)
})?;
segments.push(FastqSegment {
seq: seq.to_vec(),
quals: quals.to_vec(),
segment_type: read_segment.kind,
});
}
Ok(Self { header: header.to_vec(), segments, skip_reason: None })
}
}
pub struct ReadSetIterator {
read_structure: ReadStructure,
source: SimdFastqReader<Box<dyn BufRead + Send>>,
skip_reasons: Vec<SkipReason>,
}
impl Iterator for ReadSetIterator {
type Item = anyhow::Result<FastqSet>;
fn next(&mut self) -> Option<Self::Item> {
let rec = self.source.next()?;
let record = match rec {
Ok(record) => record,
Err(e) => {
return Some(Err(anyhow::Error::from(e).context("Error parsing FASTQ record")));
}
};
Some(FastqSet::from_record_with_structure(
&record.name,
&record.sequence,
&record.quality,
&self.read_structure,
&self.skip_reasons,
))
}
}
impl ReadSetIterator {
#[must_use]
pub fn new(
read_structure: ReadStructure,
source: SimdFastqReader<Box<dyn BufRead + Send>>,
skip_reasons: Vec<SkipReason>,
) -> Self {
Self { read_structure, source, skip_reasons }
}
}
#[cfg(test)]
mod tests {
use super::*;
fn read_set(segments: Vec<FastqSegment>) -> FastqSet {
FastqSet { header: "NOT_IMPORTANT".as_bytes().to_owned(), segments, skip_reason: None }
}
fn seg(bases: &[u8], segment_type: SegmentType) -> FastqSegment {
let quals = vec![b'#'; bases.len()];
FastqSegment { seq: bases.to_vec(), quals, segment_type }
}
#[test]
fn test_template_segments() {
let segments = vec![
seg("AGCT".as_bytes(), SegmentType::SampleBarcode),
seg("GATA".as_bytes(), SegmentType::Template),
seg("CAC".as_bytes(), SegmentType::Template),
seg("GACCCC".as_bytes(), SegmentType::MolecularBarcode),
];
let expected = vec![
seg("GATA".as_bytes(), SegmentType::Template),
seg("CAC".as_bytes(), SegmentType::Template),
];
let read_set = read_set(segments);
assert_eq!(expected, read_set.template_segments().cloned().collect::<Vec<_>>());
}
#[test]
fn test_sample_barcode_segments() {
let segments = vec![
seg("AGCT".as_bytes(), SegmentType::Template),
seg("GATA".as_bytes(), SegmentType::SampleBarcode),
seg("CAC".as_bytes(), SegmentType::SampleBarcode),
seg("GACCCC".as_bytes(), SegmentType::MolecularBarcode),
];
let expected = vec![
seg("GATA".as_bytes(), SegmentType::SampleBarcode),
seg("CAC".as_bytes(), SegmentType::SampleBarcode),
];
let read_set = read_set(segments);
assert_eq!(expected, read_set.sample_barcode_segments().cloned().collect::<Vec<_>>());
}
#[test]
fn test_molecular_barcode_segments() {
let segments = vec![
seg("AGCT".as_bytes(), SegmentType::Template),
seg("GATA".as_bytes(), SegmentType::MolecularBarcode),
seg("CAC".as_bytes(), SegmentType::MolecularBarcode),
seg("GACCCC".as_bytes(), SegmentType::SampleBarcode),
];
let expected = vec![
seg("GATA".as_bytes(), SegmentType::MolecularBarcode),
seg("CAC".as_bytes(), SegmentType::MolecularBarcode),
];
let read_set = read_set(segments);
assert_eq!(expected, read_set.molecular_barcode_segments().cloned().collect::<Vec<_>>());
}
#[test]
fn test_combine_readsets() {
let segments1 = vec![
seg("A".as_bytes(), SegmentType::Template),
seg("G".as_bytes(), SegmentType::Template),
seg("C".as_bytes(), SegmentType::MolecularBarcode),
seg("T".as_bytes(), SegmentType::SampleBarcode),
];
let read_set1 = read_set(segments1.clone());
let segments2 = vec![
seg("AA".as_bytes(), SegmentType::Template),
seg("AG".as_bytes(), SegmentType::Template),
seg("AC".as_bytes(), SegmentType::MolecularBarcode),
seg("AT".as_bytes(), SegmentType::SampleBarcode),
];
let read_set2 = read_set(segments2.clone());
let segments3 = vec![
seg("AAA".as_bytes(), SegmentType::Template),
seg("AAG".as_bytes(), SegmentType::Template),
seg("AAC".as_bytes(), SegmentType::MolecularBarcode),
seg("AAT".as_bytes(), SegmentType::SampleBarcode),
];
let read_set3 = read_set(segments3.clone());
let mut expected_segments = Vec::new();
expected_segments.extend(segments1);
expected_segments.extend(segments2);
expected_segments.extend(segments3);
let expected = read_set(expected_segments);
let result = FastqSet::combine_readsets(vec![read_set1, read_set2, read_set3]);
assert_eq!(result, expected);
}
#[test]
#[should_panic(expected = "Cannot call combine readsets on an empty vec!")]
fn test_combine_readsets_fails_on_empty_vector() {
let _result = FastqSet::combine_readsets(Vec::new());
}
#[test]
fn test_cell_barcode_segments() {
let segments = vec![
seg("AGCT".as_bytes(), SegmentType::Template),
seg("GATA".as_bytes(), SegmentType::CellularBarcode),
seg("CAC".as_bytes(), SegmentType::CellularBarcode),
seg("GACCCC".as_bytes(), SegmentType::MolecularBarcode),
];
let expected = vec![
seg("GATA".as_bytes(), SegmentType::CellularBarcode),
seg("CAC".as_bytes(), SegmentType::CellularBarcode),
];
let read_set = read_set(segments);
assert_eq!(expected, read_set.cell_barcode_segments().cloned().collect::<Vec<_>>());
}
#[test]
fn test_cell_barcode_segments_empty() {
let segments = vec![
seg("AGCT".as_bytes(), SegmentType::Template),
seg("GATA".as_bytes(), SegmentType::MolecularBarcode),
];
let read_set = read_set(segments);
assert_eq!(0, read_set.cell_barcode_segments().count());
}
#[test]
fn test_skip_reason_display() {
assert_eq!(SkipReason::TooFewBases.to_string(), "Too few bases");
}
#[test]
fn test_skip_reason_from_str_valid() {
assert_eq!(
SkipReason::from_str("too few bases")
.expect("parsing \"too few bases\" should succeed"),
SkipReason::TooFewBases
);
assert_eq!(
SkipReason::from_str("too-few-bases")
.expect("parsing \"too-few-bases\" should succeed"),
SkipReason::TooFewBases
);
assert_eq!(
SkipReason::from_str("toofewbases").expect("parsing \"toofewbases\" should succeed"),
SkipReason::TooFewBases
);
}
#[test]
fn test_skip_reason_from_str_invalid() {
let result = SkipReason::from_str("invalid-reason");
assert!(result.is_err());
let err = result.unwrap_err();
assert!(err.to_string().contains("Invalid skip reason"));
assert!(err.to_string().contains("invalid-reason"));
}
#[test]
fn test_read_set_iterator_basic() {
use std::io::Cursor;
let fastq_data = b"@read1\nACGTACGT\n+\nIIIIIIII\n";
let cursor = Cursor::new(fastq_data.to_vec());
let reader = SimdFastqReader::new(Box::new(cursor) as Box<dyn BufRead + Send>);
let read_structure =
ReadStructure::from_str("4M4T").expect("parsing \"4M4T\" should succeed");
let mut iterator = ReadSetIterator::new(read_structure, reader, vec![]);
let result = iterator.next();
assert!(result.is_some());
let fastq_set =
result.expect("iterator should yield a record").expect("reading record should succeed");
assert_eq!(fastq_set.header, b"read1");
assert_eq!(fastq_set.segments.len(), 2);
assert!(fastq_set.skip_reason.is_none());
assert_eq!(fastq_set.segments[0].seq, b"ACGT");
assert_eq!(fastq_set.segments[0].segment_type, SegmentType::MolecularBarcode);
assert_eq!(fastq_set.segments[1].seq, b"ACGT");
assert_eq!(fastq_set.segments[1].segment_type, SegmentType::Template);
assert!(iterator.next().is_none());
}
#[test]
fn test_read_set_iterator_skip_too_few_bases() {
use std::io::Cursor;
let fastq_data = b"@read1\nACGT\n+\nIIII\n";
let cursor = Cursor::new(fastq_data.to_vec());
let reader = SimdFastqReader::new(Box::new(cursor) as Box<dyn BufRead + Send>);
let read_structure =
ReadStructure::from_str("4M6T").expect("parsing \"4M6T\" should succeed");
let mut iterator =
ReadSetIterator::new(read_structure, reader, vec![SkipReason::TooFewBases]);
let result = iterator.next();
assert!(result.is_some());
let fastq_set =
result.expect("iterator should yield a record").expect("reading record should succeed");
assert_eq!(fastq_set.header, b"read1");
assert!(fastq_set.segments.is_empty());
assert_eq!(fastq_set.skip_reason, Some(SkipReason::TooFewBases));
}
#[test]
fn test_read_set_iterator_error_on_too_few_bases_without_skip() {
use std::io::Cursor;
let fastq_data = b"@read1\nACGT\n+\nIIII\n";
let cursor = Cursor::new(fastq_data.to_vec());
let reader = SimdFastqReader::new(Box::new(cursor) as Box<dyn BufRead + Send>);
let read_structure =
ReadStructure::from_str("4M6T").expect("parsing \"4M6T\" should succeed");
let mut iterator = ReadSetIterator::new(read_structure, reader, vec![]);
let result = iterator.next();
assert!(result.is_some());
let err = result.unwrap().unwrap_err();
assert!(err.to_string().contains("too few bases"));
}
#[test]
fn test_read_set_iterator_multiple_records() {
use std::io::Cursor;
let fastq_data = b"@read1\nACGTAAAA\n+\nIIIIIIII\n@read2\nTGCATTTT\n+\nIIIIIIII\n";
let cursor = Cursor::new(fastq_data.to_vec());
let reader = SimdFastqReader::new(Box::new(cursor) as Box<dyn BufRead + Send>);
let read_structure =
ReadStructure::from_str("4M4T").expect("parsing \"4M4T\" should succeed");
let mut iterator = ReadSetIterator::new(read_structure, reader, vec![]);
let first = iterator
.next()
.expect("iterator should yield first record")
.expect("reading first record should succeed");
assert_eq!(first.header, b"read1");
assert_eq!(first.segments[0].seq, b"ACGT");
assert_eq!(first.segments[1].seq, b"AAAA");
let second = iterator
.next()
.expect("iterator should yield second record")
.expect("reading second record should succeed");
assert_eq!(second.header, b"read2");
assert_eq!(second.segments[0].seq, b"TGCA");
assert_eq!(second.segments[1].seq, b"TTTT");
assert!(iterator.next().is_none());
}
#[test]
fn test_read_set_iterator_variable_length_segment() {
use std::io::Cursor;
let fastq_data = b"@read1\nACGTTTTTTTTT\n+\nIIIIIIIIIIII\n";
let cursor = Cursor::new(fastq_data.to_vec());
let reader = SimdFastqReader::new(Box::new(cursor) as Box<dyn BufRead + Send>);
let read_structure =
ReadStructure::from_str("4M+T").expect("parsing \"4M+T\" should succeed");
let mut iterator = ReadSetIterator::new(read_structure, reader, vec![]);
let result = iterator
.next()
.expect("iterator should yield a record")
.expect("reading record should succeed");
assert_eq!(result.segments.len(), 2);
assert_eq!(result.segments[0].seq, b"ACGT");
assert_eq!(result.segments[0].segment_type, SegmentType::MolecularBarcode);
assert_eq!(result.segments[1].seq, b"TTTTTTTT");
assert_eq!(result.segments[1].segment_type, SegmentType::Template);
}
#[test]
fn test_fastq_set_with_skip_reason_only() {
let fastq_set = FastqSet {
header: b"skipped_read".to_vec(),
segments: vec![],
skip_reason: Some(SkipReason::TooFewBases),
};
assert_eq!(fastq_set.skip_reason, Some(SkipReason::TooFewBases));
assert!(fastq_set.segments.is_empty());
assert_eq!(fastq_set.template_segments().count(), 0);
assert_eq!(fastq_set.molecular_barcode_segments().count(), 0);
assert_eq!(fastq_set.sample_barcode_segments().count(), 0);
assert_eq!(fastq_set.cell_barcode_segments().count(), 0);
}
}