use crate::commands::command::Command;
use anyhow::{anyhow, ensure, Result};
use bstr::ByteSlice;
use clap::Parser;
use fgoxide::io::{DelimFile, Io};
use fgoxide::iter::IntoChunkedReadAheadIterator;
use fqtk_lib::barcode_matching::BarcodeMatcher;
use fqtk_lib::samples::SampleGroup;
use itertools::Itertools;
use log::info;
use pooled_writer::{bgzf::BgzfCompressor, Pool, PoolBuilder, PooledWriter};
use proglog::{CountFormatterKind, ProgLogBuilder};
use read_structure::ReadStructure;
use read_structure::ReadStructureError;
use read_structure::SegmentType;
use seq_io::fastq::Reader as FastqReader;
use seq_io::fastq::Record;
use serde::{Deserialize, Serialize};
use std::collections::{HashMap, HashSet};
use std::fmt::Display;
use std::fs::File;
use std::io::{BufRead, BufWriter, Write};
use std::iter::Filter;
use std::slice::Iter;
use std::str::FromStr;
use std::{
fs,
path::{Path, PathBuf},
};
type VecOfReaders = Vec<Box<dyn BufRead + Send>>;
type SegmentIter<'a> = Filter<Iter<'a, FastqSegment>, fn(&&FastqSegment) -> bool>;
const BUFFER_SIZE: usize = 1024 * 1024;
#[derive(PartialEq, Eq, Debug, Clone)]
struct FastqSegment {
seq: Vec<u8>,
quals: Vec<u8>,
segment_type: SegmentType,
}
#[derive(Eq, Hash, PartialEq, Debug, Clone, Copy)]
enum SkipReason {
TooFewBases,
}
impl Display for SkipReason {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
SkipReason::TooFewBases => write!(f, "Too few bases"),
}
}
}
impl FromStr for SkipReason {
type Err = anyhow::Error;
fn from_str(string: &str) -> Result<Self, Self::Err> {
match string {
"too few bases" | "too-few-bases" | "toofewbases" => Ok(SkipReason::TooFewBases),
_ => Err(anyhow!("Invalid skip reason: {}", string)),
}
}
}
#[derive(PartialEq, Debug, Clone)]
struct ReadSet {
header: Vec<u8>,
segments: Vec<FastqSegment>,
skip_reason: Option<SkipReason>,
}
impl ReadSet {
const PREFIX: u8 = b'@';
const SPACE: u8 = b' ';
const COLON: u8 = b':';
const PLUS: u8 = b'+';
fn template_segments(&self) -> SegmentIter {
self.segments.iter().filter(|s| s.segment_type == SegmentType::Template)
}
fn sample_barcode_segments(&self) -> SegmentIter {
self.segments.iter().filter(|s| s.segment_type == SegmentType::SampleBarcode)
}
fn molecular_barcode_segments(&self) -> SegmentIter {
self.segments.iter().filter(|s| s.segment_type == SegmentType::MolecularBarcode)
}
fn sample_barcode_sequence(&self) -> Vec<u8> {
self.sample_barcode_segments().flat_map(|s| &s.seq).copied().collect()
}
fn combine_readsets(readsets: Vec<Self>) -> Self {
let total_segments: usize = readsets.iter().map(|s| s.segments.len()).sum();
assert!(total_segments > 0, "Cannot call combine readsets on an empty vec!");
let mut readset_iter = readsets.into_iter();
let mut first = readset_iter.next().expect("Cannot call combine readsets on an empty vec!");
first.segments.reserve_exact(total_segments - first.segments.len());
for next_readset in readset_iter {
first.segments.extend(next_readset.segments);
}
first
}
fn write_header<W: Write>(&self, writer: &mut W, read_num: usize) -> Result<()> {
Self::write_header_internal(
writer,
read_num,
self.header.as_slice(),
self.sample_barcode_segments(),
self.molecular_barcode_segments(),
)
}
fn write_header_internal<W: Write>(
writer: &mut W,
read_num: usize,
header: &[u8],
sample_barcode_segments: SegmentIter,
mut molecular_barcode_segments: SegmentIter,
) -> Result<()> {
let (name, comment) = match header.find_byte(Self::SPACE) {
Some(x) => (&header[0..x], Some(&header[x + 1..])),
None => (header, None),
};
writer.write_all(&[Self::PREFIX])?;
if let Some(first_seg) = molecular_barcode_segments.next() {
let sep_count = name.iter().filter(|c| **c == Self::COLON).count();
ensure!(
sep_count <= 7,
"Can't handle read name with more than 8 segments: {}",
String::from_utf8(header.to_vec())?
);
writer.write_all(name)?;
if sep_count == 7 {
writer.write_all(&[Self::PLUS])?;
} else {
writer.write_all(&[Self::COLON])?;
}
writer.write_all(first_seg.seq.as_slice())?;
for seg in molecular_barcode_segments {
writer.write_all(&[Self::PLUS])?;
writer.write_all(seg.seq.as_slice())?;
}
} else {
writer.write_all(name)?;
}
writer.write_all(&[Self::SPACE])?;
match comment {
None => {
write!(writer, "{}:N:0:", read_num)?;
}
Some(chars) => {
let sep_count = chars.iter().filter(|c| **c == Self::COLON).count();
if sep_count < 3 {
writer.write_all(chars)?;
if *chars.last().unwrap() != Self::COLON {
writer.write_all(&[Self::COLON])?;
}
} else {
ensure!(
sep_count == 3,
"Comment in did not have 4 segments: {}",
String::from_utf8(header.to_vec())?
);
let first_colon_idx = chars.iter().position(|ch| *ch == Self::COLON).unwrap();
let remainder = if chars.last().unwrap().is_ascii_digit() {
&chars[first_colon_idx + 1..chars.len() - 1]
} else {
&chars[first_colon_idx + 1..chars.len()]
};
write!(writer, "{}:", read_num)?;
writer.write_all(remainder)?;
if *remainder.last().unwrap() != Self::COLON {
writer.write_all(&[Self::PLUS])?;
}
}
}
}
for (idx, seg) in sample_barcode_segments.enumerate() {
if idx > 0 {
writer.write_all(&[Self::PLUS])?;
}
writer.write_all(seg.seq.as_slice())?;
}
Ok(())
}
}
struct ReadSetIterator {
read_structure: ReadStructure,
source: FastqReader<Box<dyn BufRead + Send>>,
skip_reasons: Vec<SkipReason>,
}
impl Iterator for ReadSetIterator {
type Item = ReadSet;
fn next(&mut self) -> Option<Self::Item> {
if let Some(rec) = self.source.next() {
let mut segments = Vec::with_capacity(self.read_structure.number_of_segments());
let next_fq_rec = rec.expect("Unexpected error parsing FASTQs");
let read_name = next_fq_rec.head();
let bases = next_fq_rec.seq();
let quals = next_fq_rec.qual();
let min_len = self.read_structure.iter().map(|s| s.length().unwrap_or(1)).sum();
if bases.len() < min_len {
if self.skip_reasons.contains(&SkipReason::TooFewBases) {
return Some(ReadSet {
header: read_name.to_vec(),
segments: vec![],
skip_reason: Some(SkipReason::TooFewBases),
});
}
panic!(
"Read {} had too few bases to demux {} vs. {} needed in read structure {}.",
String::from_utf8(read_name.to_vec()).unwrap(),
bases.len(),
min_len,
self.read_structure
);
}
for (read_segment_index, read_segment) in self.read_structure.iter().enumerate() {
let (seq, quals) =
read_segment.extract_bases_and_quals(bases, quals).unwrap_or_else(|e| {
panic!(
"Error extracting bases (len: {}) or quals (len: {}) for the {}th read segment ({}) in read structure ({}) from FASTQ record with name {}; {}",
bases.len(),
quals.len(),
read_segment_index,
read_segment,
self.read_structure,
String::from_utf8(read_name.to_vec()).unwrap(),
e
)
});
segments.push(FastqSegment {
seq: seq.to_vec(),
quals: quals.to_vec(),
segment_type: read_segment.kind,
});
}
Some(ReadSet { header: read_name.to_vec(), segments, skip_reason: None })
} else {
None
}
}
}
impl ReadSetIterator {
pub fn new(
read_structure: ReadStructure,
source: FastqReader<Box<dyn BufRead + Send>>,
skip_reasons: Vec<SkipReason>,
) -> Self {
Self { read_structure, source, skip_reasons }
}
}
struct SampleWriters<W: Write> {
name: String,
template_writers: Option<Vec<W>>,
sample_barcode_writers: Option<Vec<W>>,
molecular_barcode_writers: Option<Vec<W>>,
}
impl<W: Write> SampleWriters<W> {
#[allow(clippy::type_complexity)]
fn into_parts(self) -> (String, Option<Vec<W>>, Option<Vec<W>>, Option<Vec<W>>) {
(
self.name,
self.template_writers,
self.sample_barcode_writers,
self.molecular_barcode_writers,
)
}
fn write(&mut self, read_set: &ReadSet) -> Result<()> {
for (writers_opt, segments) in [
(&mut self.template_writers, &mut read_set.template_segments()),
(&mut self.sample_barcode_writers, &mut read_set.sample_barcode_segments()),
(&mut self.molecular_barcode_writers, &mut read_set.molecular_barcode_segments()),
] {
if let Some(writers) = writers_opt {
for (read_idx, (writer, segment)) in writers.iter_mut().zip(segments).enumerate() {
read_set.write_header(writer, read_idx + 1)?;
writer.write_all(b"\n")?;
writer.write_all(segment.seq.as_slice())?;
writer.write_all(b"\n+\n")?;
writer.write_all(segment.quals.as_slice())?;
writer.write_all(b"\n")?;
}
}
}
Ok(())
}
}
impl SampleWriters<PooledWriter> {
fn close(self) -> Result<()> {
for writers in
[self.template_writers, self.sample_barcode_writers, self.molecular_barcode_writers]
.into_iter()
.flatten()
{
writers.into_iter().try_for_each(PooledWriter::close)?;
}
Ok(())
}
}
#[allow(clippy::module_name_repetitions)]
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct DemuxMetric {
sample_id: String,
barcode: String,
templates: usize,
frac_templates: f64,
ratio_to_mean: f64,
ratio_to_best: f64,
}
impl DemuxMetric {
fn new(sample: &str, barcode: &str) -> DemuxMetric {
DemuxMetric {
sample_id: sample.to_string(),
barcode: barcode.to_string(),
templates: 0,
frac_templates: 0.0,
ratio_to_mean: 0.0,
ratio_to_best: 0.0,
}
}
fn update(samples: &mut [DemuxMetric], unmatched: &mut DemuxMetric) {
let sample_total: f64 = samples.iter().map(|s| s.templates as f64).sum();
let total = sample_total + unmatched.templates as f64;
let mean = sample_total / samples.len() as f64;
let best = samples.iter().map(|s| s.templates).max().unwrap_or(0) as f64;
for sample in samples {
sample.frac_templates = sample.templates as f64 / total;
sample.ratio_to_mean = sample.templates as f64 / mean;
sample.ratio_to_best = sample.templates as f64 / best;
}
unmatched.frac_templates = unmatched.templates as f64 / total;
unmatched.ratio_to_mean = unmatched.templates as f64 / mean;
unmatched.ratio_to_best = unmatched.templates as f64 / best;
}
}
#[derive(Parser, Debug)]
#[command(version)]
pub(crate) struct Demux {
#[clap(long, short = 'i', required = true, num_args = 1..)]
inputs: Vec<PathBuf>,
#[clap(long, short = 'r' , required = true, num_args = 1..)]
read_structures: Vec<ReadStructure>,
#[clap(long, short='b', default_value="T", num_args = 1.. )]
output_types: Vec<char>,
#[clap(long, short = 's', required = true)]
sample_metadata: PathBuf,
#[clap(long, short = 'o', required = true)]
output: PathBuf,
#[clap(long, short = 'u', default_value = "unmatched")]
unmatched_prefix: String,
#[clap(long, default_value = "1")]
max_mismatches: usize,
#[clap(long, short = 'd', default_value = "2")]
min_mismatch_delta: usize,
#[clap(long, short = 't', default_value = "8")]
threads: usize,
#[clap(long, short = 'c', default_value = "5")]
compression_level: usize,
#[clap(long, short = 'S')]
skip_reasons: Vec<SkipReason>,
}
impl Demux {
fn create_sample_writers(
read_structures: &[ReadStructure],
prefix: &str,
output_types: &HashSet<SegmentType>,
output_dir: &Path,
) -> Result<SampleWriters<BufWriter<File>>> {
let mut template_writers = None;
let mut sample_barcode_writers = None;
let mut molecular_barcode_writers = None;
for output_type in output_types {
let mut output_type_writers = Vec::new();
let file_type_code = match output_type {
SegmentType::Template => 'R',
SegmentType::SampleBarcode => 'I',
SegmentType::MolecularBarcode => 'U',
_ => 'S',
};
let segment_count: usize =
read_structures.iter().map(|s| s.segments_by_type(*output_type).count()).sum();
for idx in 1..=segment_count {
output_type_writers.push(BufWriter::new(File::create(
output_dir.join(format!("{}.{}{}.fq.gz", prefix, file_type_code, idx)),
)?));
}
match output_type {
SegmentType::Template => template_writers = Some(output_type_writers),
SegmentType::SampleBarcode => {
sample_barcode_writers = Some(output_type_writers);
}
SegmentType::MolecularBarcode => {
molecular_barcode_writers = Some(output_type_writers);
}
_ => {}
}
}
Ok(SampleWriters {
name: prefix.to_string(),
template_writers,
sample_barcode_writers,
molecular_barcode_writers,
})
}
fn create_writers(
read_structures: &[ReadStructure],
sample_group: &SampleGroup,
output_types: &HashSet<SegmentType>,
unmatched_prefix: &str,
output_dir: &Path,
) -> Result<Vec<SampleWriters<BufWriter<File>>>> {
let mut samples_writers = Vec::with_capacity(sample_group.samples.len());
for sample in &sample_group.samples {
samples_writers.push(Self::create_sample_writers(
read_structures,
&sample.sample_id,
output_types,
output_dir,
)?);
}
samples_writers.push(Self::create_sample_writers(
read_structures,
unmatched_prefix,
output_types,
output_dir,
)?);
Ok(samples_writers)
}
fn build_writer_pool(
sample_writers: Vec<SampleWriters<BufWriter<File>>>,
compression_level: usize,
threads: usize,
) -> Result<(Pool, Vec<SampleWriters<PooledWriter>>)> {
let mut new_sample_writers = Vec::with_capacity(sample_writers.len());
let mut pool_builder = PoolBuilder::<_, BgzfCompressor>::new()
.threads(threads)
.queue_size(threads * 50)
.compression_level(u8::try_from(compression_level)?)?;
for sample in sample_writers {
let (name, template_writers, barcode_writers, mol_writers) = sample.into_parts();
let mut new_template_writers = None;
let mut new_sample_barcode_writers = None;
let mut new_molecular_barcode_writers = None;
for (optional_ws, target) in vec![
(template_writers, &mut new_template_writers),
(barcode_writers, &mut new_sample_barcode_writers),
(mol_writers, &mut new_molecular_barcode_writers),
] {
if let Some(ws) = optional_ws {
let mut new_writers = Vec::with_capacity(ws.len());
for writer in ws {
new_writers.push(pool_builder.exchange(writer));
}
_ = target.insert(new_writers);
}
}
new_sample_writers.push(SampleWriters {
name,
template_writers: new_template_writers,
sample_barcode_writers: new_sample_barcode_writers,
molecular_barcode_writers: new_molecular_barcode_writers,
});
}
let pool = pool_builder.build()?;
Ok((pool, new_sample_writers))
}
fn validate_and_prepare_inputs(&self) -> Result<(VecOfReaders, HashSet<SegmentType>)> {
let mut constraint_errors = vec![];
if self.inputs.len() != self.read_structures.len() {
let preamble = "The same number of read structures should be given as FASTQs";
let specifics = format!(
"{} read-structures provided for {} FASTQs",
self.read_structures.len(),
self.inputs.len()
);
constraint_errors.push(format!("{preamble} {specifics}"));
}
if !self.output.exists() {
info!("Output directory {:#?} didn't exist, creating it.", self.output);
fs::create_dir_all(&self.output)?;
}
if self.output.metadata()?.permissions().readonly() {
constraint_errors
.push(format!("Ouput directory {:#?} cannot be read-only", self.output));
}
let output_segment_types_result = self
.output_types
.iter()
.map(|&c| SegmentType::try_from(c))
.collect::<Result<HashSet<_>, ReadStructureError>>();
if let Err(e) = &output_segment_types_result {
constraint_errors.push(format!("Error parsing segment types to report: {}", e));
}
for input in &self.inputs {
if !input.exists() {
constraint_errors.push(format!("Provided input file {:#?} doesn't exist", input));
}
}
let fgio = Io::new(5, BUFFER_SIZE);
let fq_readers_result = self
.inputs
.iter()
.map(|p| fgio.new_reader(p))
.collect::<Result<VecOfReaders, fgoxide::FgError>>();
if let Err(e) = &fq_readers_result {
constraint_errors.push(format!("Error opening input files for reading: {}", e));
}
if self.threads < 5 {
constraint_errors
.push(format!("Threads provided {} was too low! Must be 5 or more.", self.threads));
}
if constraint_errors.is_empty() {
let output_segment_types = output_segment_types_result?;
if output_segment_types.is_empty() {
constraint_errors.push(
"No output types requested, must request at least one output segment type."
.to_owned(),
);
} else {
return Ok((fq_readers_result?, output_segment_types));
}
}
let mut details = "Inputs failed validation!\n".to_owned();
for error_reason in constraint_errors {
details.push_str(&format!(" - {}\n", error_reason));
}
Err(anyhow!("The following errors with the input(s) were detected:\n{}", details))
}
}
impl Command for Demux {
#[allow(clippy::too_many_lines)]
fn execute(&self) -> Result<()> {
let (fq_readers, output_segment_types) = self.validate_and_prepare_inputs()?;
let sample_group = SampleGroup::from_file(&self.sample_metadata)?;
info!(
"{} samples loaded from file {:?}",
sample_group.samples.len(),
&self.sample_metadata
);
let fq_sources =
fq_readers.into_iter().map(|fq| FastqReader::with_capacity(fq, BUFFER_SIZE));
let reader_threads = if self.threads <= 6 { 1 } else { 2 };
let main_thread = 1;
let writer_threads = self.threads - main_thread - reader_threads;
let (mut pool, mut sample_writers) = Self::build_writer_pool(
Self::create_writers(
&self.read_structures,
&sample_group,
&output_segment_types,
&self.unmatched_prefix,
&self.output,
)?,
self.compression_level,
writer_threads,
)?;
let unmatched_index = sample_writers.len() - 1;
info!("Created sample and {} writers.", self.unmatched_prefix);
let mut sample_metrics = sample_group
.samples
.iter()
.map(|s| DemuxMetric::new(s.sample_id.as_str(), s.barcode.as_str()))
.collect_vec();
let mut unmatched_metric = DemuxMetric::new(self.unmatched_prefix.as_str(), ".");
let mut barcode_matcher = BarcodeMatcher::new(
&sample_group.samples.iter().map(|s| s.barcode.as_str()).collect::<Vec<_>>(),
u8::try_from(self.max_mismatches)?,
u8::try_from(self.min_mismatch_delta)?,
true,
);
let mut fq_iterators = fq_sources
.zip(self.read_structures.clone().into_iter())
.map(|(source, read_structure)| {
ReadSetIterator::new(read_structure, source, self.skip_reasons.clone())
.read_ahead(1000, 1000)
})
.collect::<Vec<_>>();
let logger = ProgLogBuilder::new()
.name("fqtk")
.noun("records")
.verb("demultiplexed")
.unit(1_000_000)
.count_formatter(CountFormatterKind::Comma)
.level(log::Level::Info)
.build();
let mut skip_reasons: HashMap<SkipReason, usize> = HashMap::new();
loop {
let mut next_read_sets = Vec::with_capacity(fq_iterators.len());
for iter in &mut fq_iterators {
if let Some(rec) = iter.next() {
next_read_sets.push(rec);
}
}
if let Some(reason) = next_read_sets.iter().find_map(|read_set| read_set.skip_reason) {
let previous = skip_reasons.get(&reason).unwrap_or(&0);
skip_reasons.insert(reason, 1 + previous);
continue;
} else if next_read_sets.is_empty() {
break;
}
assert_eq!(
next_read_sets.len(),
fq_iterators.len(),
"FASTQ sources out of sync at records: {:?}",
next_read_sets
);
let read_set = ReadSet::combine_readsets(next_read_sets);
if let Some(barcode_match) = barcode_matcher.assign(&read_set.sample_barcode_sequence())
{
sample_writers[barcode_match.best_match].write(&read_set)?;
sample_metrics[barcode_match.best_match].templates += 1;
} else {
sample_writers[unmatched_index].write(&read_set)?;
unmatched_metric.templates += 1;
}
logger.record();
}
info!("Finished reading input FASTQs.");
sample_writers.into_iter().map(SampleWriters::close).collect::<Result<Vec<_>>>()?;
pool.stop_pool()?;
info!("Output FASTQ writing complete.");
if skip_reasons.is_empty() {
info!("No records were skipped.");
} else {
for (reason, count) in skip_reasons.iter().sorted_by_key(|(_, c)| *c) {
info!("{} records were skipped due to {}", count, reason);
}
}
let metrics_path = self.output.join("demux-metrics.txt");
DemuxMetric::update(sample_metrics.as_mut_slice(), &mut unmatched_metric);
sample_metrics.push(unmatched_metric);
DelimFile::default().write_tsv(&metrics_path, sample_metrics)?;
Ok(())
}
}
#[cfg(test)]
mod tests {
use super::*;
use fqtk_lib::samples::Sample;
use rstest::rstest;
use seq_io::fastq::OwnedRecord;
use std::str;
use std::str::FromStr;
use tempfile::TempDir;
const SAMPLE1_BARCODE: &str = "GATTGGG";
fn fq_lines_from_bases(prefix: &str, records_bases: &[&str]) -> Vec<String> {
let mut result = Vec::with_capacity(records_bases.len() * 4);
for (i, &bases) in records_bases.iter().enumerate() {
result.push(format!("@{}_{}", prefix, i));
result.push(bases.to_owned());
result.push("+".to_owned());
result.push(";".repeat(bases.len()));
}
result
}
fn fastq_file(
tmpdir: &TempDir,
filename_prefix: &str,
read_prefix: &str,
records_bases: &[&str],
) -> PathBuf {
let io = Io::default();
let path = tmpdir.path().join(format!("{filename_prefix}.fastq"));
let fastq_lines = fq_lines_from_bases(read_prefix, records_bases);
io.write_lines(&path, fastq_lines).unwrap();
path
}
fn metadata_lines_from_barcodes(barcodes: &[&str]) -> Vec<String> {
let mut result = Vec::with_capacity(barcodes.len() + 1);
result.push(Sample::deserialize_header_line());
for (i, &barcode) in barcodes.iter().enumerate() {
result.push(format!("Sample{:04}\t{}", i, barcode));
}
result
}
fn metadata_file(tmpdir: &TempDir, barcodes: &[&str]) -> PathBuf {
let io = Io::default();
let path = tmpdir.path().join("metadata.tsv");
let metadata_lines = metadata_lines_from_barcodes(barcodes);
io.write_lines(&path, metadata_lines).unwrap();
path
}
fn metadata(tmpdir: &TempDir) -> PathBuf {
metadata_file(tmpdir, &[SAMPLE1_BARCODE])
}
fn read_fastq(file_path: &PathBuf) -> Vec<OwnedRecord> {
let fg_io = Io::default();
FastqReader::new(fg_io.new_reader(file_path).unwrap())
.into_records()
.collect::<Result<Vec<_>, seq_io::fastq::Error>>()
.unwrap()
}
fn assert_equal(actual: &impl seq_io::fastq::Record, expected: &impl seq_io::fastq::Record) {
assert_eq!(
String::from_utf8(actual.head().to_vec()).unwrap(),
String::from_utf8(expected.head().to_vec()).unwrap()
);
assert_eq!(
String::from_utf8(actual.seq().to_vec()).unwrap(),
String::from_utf8(expected.seq().to_vec()).unwrap()
);
assert_eq!(
String::from_utf8(actual.qual().to_vec()).unwrap(),
String::from_utf8(expected.qual().to_vec()).unwrap()
);
}
fn read_set(segments: Vec<FastqSegment>) -> ReadSet {
ReadSet { header: "NOT_IMPORTANT".as_bytes().to_owned(), segments, skip_reason: None }
}
#[test]
fn validate_inputs_can_succeed() {
let tmp = TempDir::new().unwrap();
let input_files = vec![
fastq_file(&tmp, "read1", "ex", &["GATTACA"]),
fastq_file(&tmp, "read2", "ex", &["TAGGATTA"]),
fastq_file(&tmp, "index1", "ex", &[&SAMPLE1_BARCODE[0..3]]),
fastq_file(&tmp, "index2", "ex", &[&SAMPLE1_BARCODE[3..]]),
];
let sample_metadata = metadata(&tmp);
let read_structures = vec![
ReadStructure::from_str("+T").unwrap(),
ReadStructure::from_str("+T").unwrap(),
ReadStructure::from_str("+B").unwrap(),
ReadStructure::from_str("+B").unwrap(),
];
let demux_inputs = Demux {
inputs: input_files,
read_structures,
sample_metadata,
output_types: vec!['T'],
output: tmp.path().to_path_buf(),
unmatched_prefix: "unmatched".to_owned(),
max_mismatches: 1,
min_mismatch_delta: 2,
threads: 5,
compression_level: 5,
skip_reasons: vec![],
};
demux_inputs.execute().unwrap();
}
#[rstest]
#[should_panic(expected = "The same number of read structures should be given as FASTQs")]
#[case(vec![
ReadStructure::from_str("+T").unwrap(),
ReadStructure::from_str("+T").unwrap(),
ReadStructure::from_str("+B").unwrap(),
])]
#[should_panic(expected = "The same number of read structures should be given as FASTQs")]
#[case(vec![
ReadStructure::from_str("+T").unwrap(),
ReadStructure::from_str("+T").unwrap(),
ReadStructure::from_str("+B").unwrap(),
ReadStructure::from_str("+B").unwrap(),
ReadStructure::from_str("+B").unwrap(),
])]
fn test_different_number_of_read_structs_and_inputs_fails(
#[case] read_structures: Vec<ReadStructure>,
) {
let tmp = TempDir::new().unwrap();
let input_files = vec![
fastq_file(&tmp, "read1", "ex", &["GATTACA"]),
fastq_file(&tmp, "read2", "ex", &["TAGGATTA"]),
fastq_file(&tmp, "index1", "ex", &[&SAMPLE1_BARCODE[0..3]]),
fastq_file(&tmp, "index2", "ex", &[&SAMPLE1_BARCODE[3..]]),
];
let sample_metadata = metadata(&tmp);
let demux_inputs = Demux {
inputs: input_files,
read_structures,
sample_metadata,
output_types: vec!['T'],
output: tmp.path().to_path_buf(),
unmatched_prefix: "unmatched".to_owned(),
max_mismatches: 1,
min_mismatch_delta: 2,
threads: 5,
compression_level: 5,
skip_reasons: vec![],
};
demux_inputs.execute().unwrap();
}
#[test]
#[should_panic(expected = "cannot be read-only")]
fn test_read_only_output_dir_fails() {
let tmp = TempDir::new().unwrap();
let read_structures = vec![
ReadStructure::from_str("+T").unwrap(),
ReadStructure::from_str("+T").unwrap(),
ReadStructure::from_str("+B").unwrap(),
ReadStructure::from_str("+B").unwrap(),
];
let input_files = vec![
fastq_file(&tmp, "read1", "ex", &["GATTACA"]),
fastq_file(&tmp, "read2", "ex", &["TAGGATTA"]),
fastq_file(&tmp, "index1", "ex", &[&SAMPLE1_BARCODE[0..3]]),
fastq_file(&tmp, "index2", "ex", &[&SAMPLE1_BARCODE[3..]]),
];
let sample_metadata = metadata(&tmp);
let mut permissions = tmp.path().metadata().unwrap().permissions();
permissions.set_readonly(true);
fs::set_permissions(tmp.path(), permissions.clone()).unwrap();
let demux_inputs = Demux {
inputs: input_files,
read_structures,
sample_metadata,
output_types: vec!['T'],
output: tmp.path().to_path_buf(),
unmatched_prefix: "unmatched".to_owned(),
max_mismatches: 1,
min_mismatch_delta: 2,
threads: 5,
compression_level: 5,
skip_reasons: vec![],
};
let demux_result = demux_inputs.execute();
permissions.set_readonly(false);
fs::set_permissions(tmp.path(), permissions).unwrap();
demux_result.unwrap();
}
#[test]
#[should_panic(expected = "doesn't exist")]
fn test_inputs_doesnt_exist_fails() {
let tmp = TempDir::new().unwrap();
let read_structures = vec![
ReadStructure::from_str("+T").unwrap(),
ReadStructure::from_str("+T").unwrap(),
ReadStructure::from_str("+B").unwrap(),
ReadStructure::from_str("+B").unwrap(),
];
let input_files = vec![
tmp.path().join("this_file_does_not_exist.fq"),
fastq_file(&tmp, "read2", "ex", &["TAGGATTA"]),
fastq_file(&tmp, "index1", "ex", &[&SAMPLE1_BARCODE[0..3]]),
fastq_file(&tmp, "index2", "ex", &[&SAMPLE1_BARCODE[3..]]),
];
let sample_metadata = metadata(&tmp);
let demux_inputs = Demux {
inputs: input_files,
read_structures,
sample_metadata,
output_types: vec!['T'],
output: tmp.path().to_path_buf(),
unmatched_prefix: "unmatched".to_owned(),
max_mismatches: 1,
min_mismatch_delta: 2,
threads: 2,
compression_level: 5,
skip_reasons: vec![],
};
demux_inputs.execute().unwrap();
}
#[test]
#[should_panic(expected = "Threads provided 2 was too low!")]
fn test_too_few_threads_fails() {
let tmp = TempDir::new().unwrap();
let read_structures = vec![
ReadStructure::from_str("+T").unwrap(),
ReadStructure::from_str("+T").unwrap(),
ReadStructure::from_str("+B").unwrap(),
ReadStructure::from_str("+B").unwrap(),
];
let input_files = vec![
fastq_file(&tmp, "read1", "ex", &["GATTACA"]),
fastq_file(&tmp, "read2", "ex", &["TAGGATTA"]),
fastq_file(&tmp, "index1", "ex", &[&SAMPLE1_BARCODE[0..3]]),
fastq_file(&tmp, "index2", "ex", &[&SAMPLE1_BARCODE[3..]]),
];
let sample_metadata = metadata(&tmp);
let demux_inputs = Demux {
inputs: input_files,
read_structures,
sample_metadata,
output_types: vec!['T'],
output: tmp.path().to_path_buf(),
unmatched_prefix: "unmatched".to_owned(),
max_mismatches: 1,
min_mismatch_delta: 2,
threads: 2,
compression_level: 5,
skip_reasons: vec![],
};
demux_inputs.execute().unwrap();
}
#[test]
fn test_demux_fragment_reads() {
let tmp = TempDir::new().unwrap();
let read_structures = vec![ReadStructure::from_str("17B100T").unwrap()];
let s1_barcode = "AAAAAAAAGATTACAGA";
let sample_metadata = metadata_file(
&tmp,
&[s1_barcode, "CCCCCCCCGATTACAGA", "GGGGGGGGGATTACAGA", "GGGGGGTTGATTACAGA"],
);
let input_files =
vec![fastq_file(&tmp, "ex", "ex", &[&(s1_barcode.to_owned() + &"A".repeat(100))])];
let output_dir = tmp.path().to_path_buf().join("output");
let demux_inputs = Demux {
inputs: input_files,
read_structures,
sample_metadata,
output_types: vec!['T'],
output: output_dir.clone(),
unmatched_prefix: "unmatched".to_owned(),
max_mismatches: 1,
min_mismatch_delta: 2,
threads: 5,
compression_level: 5,
skip_reasons: vec![],
};
demux_inputs.execute().unwrap();
let output_path = output_dir.join("Sample0000.R1.fq.gz");
let fq_reads = read_fastq(&output_path);
assert_eq!(fq_reads.len(), 1);
assert_equal(
&fq_reads[0],
&OwnedRecord {
head: b"ex_0 1:N:0:AAAAAAAAGATTACAGA".to_vec(),
seq: "A".repeat(100).as_bytes().to_vec(),
qual: ";".repeat(100).as_bytes().to_vec(),
},
);
}
#[test]
fn test_output_type_reads() {
let tmp = TempDir::new().unwrap();
let read_structures = vec![ReadStructure::from_str("10M8B100T").unwrap()];
let s1_barcode = "AAAAAAAA";
let s1_umi = "ATCGATCGAT";
let sample_metadata =
metadata_file(&tmp, &[s1_barcode, "CCCCCCCC", "GGGGGGGG", "TTTTTTTT"]);
let input_files = vec![fastq_file(
&tmp,
"ex",
"ex",
&[&(s1_umi.to_owned() + &*s1_barcode.to_owned() + &"A".repeat(100))],
)];
let output_dir = tmp.path().to_path_buf().join("output");
let demux_inputs = Demux {
inputs: input_files,
read_structures,
sample_metadata,
output_types: vec!['T', 'B', 'M'],
output: output_dir.clone(),
unmatched_prefix: "unmatched".to_owned(),
max_mismatches: 1,
min_mismatch_delta: 2,
threads: 5,
compression_level: 5,
skip_reasons: vec![],
};
demux_inputs.execute().unwrap();
let output_path = output_dir.join("Sample0000.R1.fq.gz");
let barcode_output_path = output_dir.join("Sample0000.I1.fq.gz");
let umi_output_path = output_dir.join("Sample0000.U1.fq.gz");
let fq_reads = read_fastq(&output_path);
let barcode_fq_reads = read_fastq(&barcode_output_path);
let umi_fq_reads = read_fastq(&umi_output_path);
assert_eq!(fq_reads.len(), 1);
assert_equal(
&fq_reads[0],
&OwnedRecord {
head: b"ex_0:ATCGATCGAT 1:N:0:AAAAAAAA".to_vec(),
seq: "A".repeat(100).as_bytes().to_vec(),
qual: ";".repeat(100).as_bytes().to_vec(),
},
);
assert_eq!(barcode_fq_reads.len(), 1);
assert_equal(
&barcode_fq_reads[0],
&OwnedRecord {
head: b"ex_0:ATCGATCGAT 1:N:0:AAAAAAAA".to_vec(),
seq: b"AAAAAAAA".to_vec(),
qual: ";".repeat(8).as_bytes().to_vec(),
},
);
assert_eq!(barcode_fq_reads.len(), 1);
assert_equal(
&barcode_fq_reads[0],
&OwnedRecord {
head: b"ex_0:ATCGATCGAT 1:N:0:AAAAAAAA".to_vec(),
seq: b"AAAAAAAA".to_vec(),
qual: ";".repeat(8).as_bytes().to_vec(),
},
);
assert_eq!(umi_fq_reads.len(), 1);
assert_equal(
&umi_fq_reads[0],
&OwnedRecord {
head: b"ex_0:ATCGATCGAT 1:N:0:AAAAAAAA".to_vec(),
seq: b"ATCGATCGAT".to_vec(),
qual: ";".repeat(10).as_bytes().to_vec(),
},
);
}
#[test]
fn test_demux_with_catchall_barcode() {
let tmp = TempDir::new().unwrap();
let read_structures = vec![ReadStructure::from_str("7B+T").unwrap()];
let s1_barcode = "NNNNNNN";
let sample_metadata = metadata_file(&tmp, &[s1_barcode]);
let input_files =
vec![fastq_file(&tmp, "ex", "ex", &[&(s1_barcode.to_owned() + &"A".repeat(100))])];
let output_dir = tmp.path().to_path_buf().join("output");
let demux_inputs = Demux {
inputs: input_files,
read_structures,
sample_metadata,
output_types: vec!['T'],
output: output_dir.clone(),
unmatched_prefix: "unmatched".to_owned(),
max_mismatches: 0,
min_mismatch_delta: 2,
threads: 5,
compression_level: 5,
skip_reasons: vec![],
};
demux_inputs.execute().unwrap();
let unmatched_path = output_dir.join("unmatched.R1.fq.gz");
let unmatched_reads = read_fastq(&unmatched_path);
assert_eq!(unmatched_reads.len(), 0);
let output_path = output_dir.join("Sample0000.R1.fq.gz");
let fq_reads = read_fastq(&output_path);
assert_eq!(fq_reads.len(), 1);
assert_equal(
&fq_reads[0],
&OwnedRecord {
head: b"ex_0 1:N:0:NNNNNNN".to_vec(),
seq: "A".repeat(100).as_bytes().to_vec(),
qual: ";".repeat(100).as_bytes().to_vec(),
},
);
}
#[test]
fn test_demux_with_ns_in_barcode() {
let tmp = TempDir::new().unwrap();
let read_structures = vec![ReadStructure::from_str("7B+T").unwrap()];
let s1_barcode = "NNAAAAA";
let s2_barcode = "NNCCCCC";
let sample_metadata = metadata_file(&tmp, &[s1_barcode, s2_barcode]);
let input_files = vec![fastq_file(
&tmp,
"ex",
"ex",
&[
&("ANAAAAA".to_owned() + &"A".repeat(5)),
&("ANCCCCC".to_owned() + &"C".repeat(5)),
&("NNNAAAA".to_owned() + &"T".repeat(5)),
],
)];
let output_dir = tmp.path().to_path_buf().join("output");
let demux_inputs = Demux {
inputs: input_files,
read_structures,
sample_metadata,
output_types: vec!['T'],
output: output_dir.clone(),
unmatched_prefix: "unmatched".to_owned(),
max_mismatches: 0,
min_mismatch_delta: 0,
threads: 5,
compression_level: 5,
skip_reasons: vec![],
};
demux_inputs.execute().unwrap();
let output_path = output_dir.join("Sample0000.R1.fq.gz");
let fq_reads = read_fastq(&output_path);
assert_eq!(fq_reads.len(), 1);
assert_equal(
&fq_reads[0],
&OwnedRecord {
head: b"ex_0 1:N:0:ANAAAAA".to_vec(),
seq: "A".repeat(5).as_bytes().to_vec(),
qual: ";".repeat(5).as_bytes().to_vec(),
},
);
let output_path = output_dir.join("Sample0001.R1.fq.gz");
let fq_reads = read_fastq(&output_path);
assert_eq!(fq_reads.len(), 1);
assert_equal(
&fq_reads[0],
&OwnedRecord {
head: b"ex_1 1:N:0:ANCCCCC".to_vec(),
seq: "C".repeat(5).as_bytes().to_vec(),
qual: ";".repeat(5).as_bytes().to_vec(),
},
);
let unmatched_path = output_dir.join("unmatched.R1.fq.gz");
let unmatched_reads = read_fastq(&unmatched_path);
assert_eq!(unmatched_reads.len(), 1);
assert_equal(
&unmatched_reads[0],
&OwnedRecord {
head: b"ex_2 1:N:0:NNNAAAA".to_vec(),
seq: "T".repeat(5).as_bytes().to_vec(),
qual: ";".repeat(5).as_bytes().to_vec(),
},
);
}
#[test]
fn test_demux_paired_reads_with_in_line_sample_barcodes() {
let tmp = TempDir::new().unwrap();
let read_structures = vec![
ReadStructure::from_str("8B100T").unwrap(),
ReadStructure::from_str("9B100T").unwrap(),
];
let s1_barcode = "AAAAAAAAGATTACAGA";
let sample_metadata = metadata_file(
&tmp,
&[s1_barcode, "CCCCCCCCGATTACAGA", "GGGGGGGGGATTACAGA", "GGGGGGTTGATTACAGA"],
);
let input_files = vec![
fastq_file(&tmp, "ex_R1", "ex", &[&(s1_barcode[..8].to_owned() + &"A".repeat(100))]),
fastq_file(&tmp, "ex_R2", "ex", &[&(s1_barcode[8..].to_owned() + &"T".repeat(100))]),
];
let output_dir = tmp.path().to_path_buf().join("output");
let demux_inputs = Demux {
inputs: input_files,
read_structures,
sample_metadata,
output_types: vec!['T'],
output: output_dir.clone(),
unmatched_prefix: "unmatched".to_owned(),
max_mismatches: 1,
min_mismatch_delta: 2,
threads: 5,
compression_level: 5,
skip_reasons: vec![],
};
demux_inputs.execute().unwrap();
let r1_path = output_dir.join("Sample0000.R1.fq.gz");
let r1_reads = read_fastq(&r1_path);
assert_eq!(r1_reads.len(), 1);
assert_equal(
&r1_reads[0],
&OwnedRecord {
head: b"ex_0 1:N:0:AAAAAAAA+GATTACAGA".to_vec(),
seq: "A".repeat(100).as_bytes().to_vec(),
qual: ";".repeat(100).as_bytes().to_vec(),
},
);
let r2_path = output_dir.join("Sample0000.R2.fq.gz");
let r2_reads = read_fastq(&r2_path);
assert_eq!(r2_reads.len(), 1);
assert_equal(
&r2_reads[0],
&OwnedRecord {
head: b"ex_0 2:N:0:AAAAAAAA+GATTACAGA".to_vec(),
seq: "T".repeat(100).as_bytes().to_vec(),
qual: ";".repeat(100).as_bytes().to_vec(),
},
);
}
#[test]
fn test_demux_dual_indexed_paired_end_reads() {
let tmp = TempDir::new().unwrap();
let read_structures = vec![
ReadStructure::from_str("8B").unwrap(),
ReadStructure::from_str("100T").unwrap(),
ReadStructure::from_str("100T").unwrap(),
ReadStructure::from_str("9B").unwrap(),
];
let s1_barcode = "AAAAAAAAGATTACAGA";
let sample_metadata = metadata_file(
&tmp,
&[s1_barcode, "CCCCCCCCGATTACAGA", "GGGGGGGGGATTACAGA", "GGGGGGTTGATTACAGA"],
);
let input_files = vec![
fastq_file(&tmp, "ex_I1", "ex", &[&s1_barcode[..8]]),
fastq_file(&tmp, "ex_R1", "ex", &[&"A".repeat(100)]),
fastq_file(&tmp, "ex_R2", "ex", &[&"T".repeat(100)]),
fastq_file(&tmp, "ex_I2", "ex", &[&s1_barcode[8..]]),
];
let output_dir: PathBuf = tmp.path().to_path_buf().join("output");
let demux = Demux {
inputs: input_files,
read_structures,
sample_metadata,
output_types: vec!['T'],
output: output_dir.clone(),
unmatched_prefix: "unmatched".to_owned(),
max_mismatches: 1,
min_mismatch_delta: 2,
threads: 5,
compression_level: 5,
skip_reasons: vec![],
};
demux.execute().unwrap();
let r1_path = output_dir.join("Sample0000.R1.fq.gz");
let r1_reads = read_fastq(&r1_path);
assert_eq!(r1_reads.len(), 1);
assert_equal(
&r1_reads[0],
&OwnedRecord {
head: b"ex_0 1:N:0:AAAAAAAA+GATTACAGA".to_vec(),
seq: "A".repeat(100).as_bytes().to_vec(),
qual: ";".repeat(100).as_bytes().to_vec(),
},
);
let r2_path = output_dir.join("Sample0000.R2.fq.gz");
let r2_reads = read_fastq(&r2_path);
assert_eq!(r2_reads.len(), 1);
assert_equal(
&r2_reads[0],
&OwnedRecord {
head: b"ex_0 2:N:0:AAAAAAAA+GATTACAGA".to_vec(),
seq: "T".repeat(100).as_bytes().to_vec(),
qual: ";".repeat(100).as_bytes().to_vec(),
},
);
}
#[test]
fn test_demux_a_wierd_set_of_reads() {
let tmp = TempDir::new().unwrap();
let read_structures = vec![
ReadStructure::from_str("4B4M8S").unwrap(),
ReadStructure::from_str("4B100T").unwrap(),
ReadStructure::from_str("100S3B").unwrap(),
ReadStructure::from_str("6B1S1M1T").unwrap(),
];
let sample1_barcode = "AAAAAAAAGATTACAGA";
let sample_metadata = metadata_file(
&tmp,
&[sample1_barcode, "CCCCCCCCGATTACAGA", "GGGGGGGGGATTACAGA", "GGGGGGTTGATTACAGA"],
);
let input_files = vec![
fastq_file(&tmp, "example_1", "ex", &["AAAACCCCGGGGTTTT"]),
fastq_file(&tmp, "example_2", "ex", &[&"A".repeat(104)]),
fastq_file(&tmp, "example_3", "ex", &[&("T".repeat(100) + "GAT")]),
fastq_file(&tmp, "example_4", "ex", &["TACAGAAAT"]),
];
let output_dir = tmp.path().to_path_buf().join("output");
let demux = Demux {
inputs: input_files,
read_structures,
sample_metadata,
output_types: vec!['T'],
output: output_dir.clone(),
unmatched_prefix: "unmatched".to_owned(),
max_mismatches: 1,
min_mismatch_delta: 2,
threads: 5,
compression_level: 5,
skip_reasons: vec![],
};
demux.execute().unwrap();
let r1_path = output_dir.join("Sample0000.R1.fq.gz");
let r1_reads = read_fastq(&r1_path);
assert_eq!(r1_reads.len(), 1);
assert_equal(
&r1_reads[0],
&OwnedRecord {
head: b"ex_0:CCCC+A 1:N:0:AAAA+AAAA+GAT+TACAGA".to_vec(),
seq: "A".repeat(100).as_bytes().to_vec(),
qual: ";".repeat(100).as_bytes().to_vec(),
},
);
let r2_path = output_dir.join("Sample0000.R2.fq.gz");
let r2_reads = read_fastq(&r2_path);
assert_eq!(r2_reads.len(), 1);
assert_equal(
&r2_reads[0],
&OwnedRecord {
head: b"ex_0:CCCC+A 2:N:0:AAAA+AAAA+GAT+TACAGA".to_vec(),
seq: "T".as_bytes().to_vec(),
qual: ";".as_bytes().to_vec(),
},
);
}
#[test]
fn test_demux_a_read_structure_with_multiple_templates_in_one_read() {
let tmp = TempDir::new().unwrap();
let read_structures = vec![ReadStructure::from_str("17B20T20S20T20S20T").unwrap()];
let s1_barcode = "AAAAAAAAGATTACAGA";
let sample_metadata = metadata_file(
&tmp,
&[s1_barcode, "CCCCCCCCGATTACAGA", "GGGGGGGGGATTACAGA", "GGGGGGTTGATTACAGA"],
);
let input_files = vec![fastq_file(
&tmp,
"ex",
"ex",
&[&(s1_barcode.to_owned()
+ &"A".repeat(20)
+ &"C".repeat(20)
+ &"T".repeat(20)
+ &"C".repeat(20)
+ &"G".repeat(20))],
)];
let output_dir = tmp.path().to_path_buf().join("output");
let demux_inputs = Demux {
inputs: input_files,
read_structures,
sample_metadata,
output_types: vec!['T'],
output: output_dir.clone(),
unmatched_prefix: "unmatched".to_owned(),
max_mismatches: 1,
min_mismatch_delta: 2,
threads: 5,
compression_level: 5,
skip_reasons: vec![],
};
demux_inputs.execute().unwrap();
let r1_path = output_dir.join("Sample0000.R1.fq.gz");
let r1_reads = read_fastq(&r1_path);
assert_eq!(r1_reads.len(), 1);
assert_equal(
&r1_reads[0],
&OwnedRecord {
head: b"ex_0 1:N:0:AAAAAAAAGATTACAGA".to_vec(),
seq: "A".repeat(20).as_bytes().to_vec(),
qual: ";".repeat(20).as_bytes().to_vec(),
},
);
let r2_path = output_dir.join("Sample0000.R2.fq.gz");
let r2_reads = read_fastq(&r2_path);
assert_eq!(r2_reads.len(), 1);
assert_equal(
&r2_reads[0],
&OwnedRecord {
head: b"ex_0 2:N:0:AAAAAAAAGATTACAGA".to_vec(),
seq: "T".repeat(20).as_bytes().to_vec(),
qual: ";".repeat(20).as_bytes().to_vec(),
},
);
let r3_path = output_dir.join("Sample0000.R3.fq.gz");
let r3_reads = read_fastq(&r3_path);
assert_eq!(r3_reads.len(), 1);
assert_equal(
&r3_reads[0],
&OwnedRecord {
head: b"ex_0 3:N:0:AAAAAAAAGATTACAGA".to_vec(),
seq: "G".repeat(20).as_bytes().to_vec(),
qual: ";".repeat(20).as_bytes().to_vec(),
},
);
}
#[test]
#[should_panic(
expected = "No output types requested, must request at least one output segment type."
)]
fn test_fails_if_zero_read_structures_have_template_bases() {
let tmp = TempDir::new().unwrap();
let input_files = vec![
fastq_file(&tmp, "read1", "ex", &["GATTACA"]),
fastq_file(&tmp, "read2", "ex", &["TAGGATTA"]),
fastq_file(&tmp, "index1", "ex", &[&SAMPLE1_BARCODE[0..3]]),
fastq_file(&tmp, "index2", "ex", &[&SAMPLE1_BARCODE[3..]]),
];
let sample_metadata = metadata(&tmp);
let read_structures = vec![
ReadStructure::from_str("+M").unwrap(),
ReadStructure::from_str("+M").unwrap(),
ReadStructure::from_str("+B").unwrap(),
ReadStructure::from_str("+B").unwrap(),
];
let demux_inputs = Demux {
inputs: input_files,
read_structures,
sample_metadata,
output_types: vec![],
output: tmp.path().to_path_buf(),
unmatched_prefix: "unmatched".to_owned(),
max_mismatches: 1,
min_mismatch_delta: 2,
threads: 5,
compression_level: 5,
skip_reasons: vec![],
};
demux_inputs.execute().unwrap();
}
#[test]
#[should_panic(expected = "The same number of read structures should be given as FASTQs")]
fn test_fails_if_not_enough_fastq_records_are_passed() {
let tmp = TempDir::new().unwrap();
let input_files = vec![
fastq_file(&tmp, "read1", "ex", &["GATTACA"]),
fastq_file(&tmp, "index1", "ex", &[&SAMPLE1_BARCODE[0..3]]),
fastq_file(&tmp, "index2", "ex", &[&SAMPLE1_BARCODE[3..]]),
];
let sample_metadata = metadata(&tmp);
let read_structures = vec![
ReadStructure::from_str("+T").unwrap(),
ReadStructure::from_str("+B").unwrap(),
ReadStructure::from_str("+B").unwrap(),
ReadStructure::from_str("+T").unwrap(),
];
let demux_inputs = Demux {
inputs: input_files,
read_structures,
sample_metadata,
output_types: vec!['T'],
output: tmp.path().to_path_buf(),
unmatched_prefix: "unmatched".to_owned(),
max_mismatches: 1,
min_mismatch_delta: 2,
threads: 5,
compression_level: 5,
skip_reasons: vec![],
};
demux_inputs.execute().unwrap();
}
#[test]
#[should_panic(expected = "The same number of read structures should be given as FASTQs")]
fn test_fails_if_too_many_fastq_records_are_passed() {
let tmp = TempDir::new().unwrap();
let input_files = vec![
fastq_file(&tmp, "read1", "ex", &["GATTACA"]),
fastq_file(&tmp, "index1", "ex", &[&SAMPLE1_BARCODE[0..3]]),
fastq_file(&tmp, "index2", "ex", &[&SAMPLE1_BARCODE[3..]]),
fastq_file(&tmp, "read2", "ex", &["TAGGATTA"]),
];
let sample_metadata = metadata(&tmp);
let read_structures = vec![
ReadStructure::from_str("+T").unwrap(),
ReadStructure::from_str("+B").unwrap(),
ReadStructure::from_str("+B").unwrap(),
];
let demux_inputs = Demux {
inputs: input_files,
read_structures,
sample_metadata,
output_types: vec!['T'],
output: tmp.path().to_path_buf(),
unmatched_prefix: "unmatched".to_owned(),
max_mismatches: 1,
min_mismatch_delta: 2,
threads: 5,
compression_level: 5,
skip_reasons: vec![],
};
demux_inputs.execute().unwrap();
}
#[test]
#[should_panic(
expected = "Read ex_2 had too few bases to demux 0 vs. 1 needed in read structure +T."
)]
fn test_fails_if_reads_too_short() {
let tmp = TempDir::new().unwrap();
let read_structures =
vec![ReadStructure::from_str("+T").unwrap(), ReadStructure::from_str("7B").unwrap()];
let records = vec![
vec!["AAAAAAA", &SAMPLE1_BARCODE[0..7]], vec!["CCCCCCC", SAMPLE1_BARCODE], vec!["", SAMPLE1_BARCODE], vec!["G", SAMPLE1_BARCODE], ];
let input_files = vec![
fastq_file(&tmp, "read1", "ex", &[records[0][0], records[1][0], records[2][0]]),
fastq_file(&tmp, "index1", "ex", &[records[0][1], records[1][1], records[2][1]]),
];
let sample_metadata = metadata(&tmp);
let output_dir: PathBuf = tmp.path().to_path_buf().join("output");
let demux_inputs = Demux {
inputs: input_files,
read_structures,
sample_metadata,
output_types: vec!['T', 'B'],
output: output_dir,
unmatched_prefix: "unmatched".to_owned(),
max_mismatches: 1,
min_mismatch_delta: 2,
threads: 5,
compression_level: 5,
skip_reasons: vec![],
};
demux_inputs.execute().unwrap();
}
#[test]
fn test_skip_reads_too_short() {
let tmp = TempDir::new().unwrap();
let read_structures =
vec![ReadStructure::from_str("+T").unwrap(), ReadStructure::from_str("7B").unwrap()];
let records = vec![
vec!["AAAAAAA", &SAMPLE1_BARCODE[0..7]], vec!["CCCCCCC", SAMPLE1_BARCODE], vec!["", SAMPLE1_BARCODE], vec!["G", SAMPLE1_BARCODE], ];
let input_files = vec![
fastq_file(&tmp, "read1", "ex", &[records[0][0], records[1][0], records[2][0]]),
fastq_file(&tmp, "index1", "ex", &[records[0][1], records[1][1], records[2][1]]),
];
let sample_metadata = metadata(&tmp);
let output_dir: PathBuf = tmp.path().to_path_buf().join("output");
let demux_inputs = Demux {
inputs: input_files,
read_structures,
sample_metadata,
output_types: vec!['T', 'B'],
output: output_dir.clone(),
unmatched_prefix: "unmatched".to_owned(),
max_mismatches: 1,
min_mismatch_delta: 2,
threads: 5,
compression_level: 5,
skip_reasons: vec![SkipReason::TooFewBases],
};
demux_inputs.execute().unwrap();
let metrics_path = output_dir.join("demux-metrics.txt");
let metrics: Vec<DemuxMetric> = DelimFile::default().read_tsv(&metrics_path).unwrap();
let demuxed_reads: usize = metrics.iter().map(|m| m.templates).sum();
let sample0000_reads =
metrics.iter().find(|m| m.sample_id == "Sample0000").unwrap().templates;
assert_eq!(demuxed_reads, 2);
assert_eq!(sample0000_reads, 2);
let r1_path = output_dir.join("Sample0000.R1.fq.gz");
let r1_reads = read_fastq(&r1_path);
assert_eq!(r1_reads.len(), 2);
let r2_path = output_dir.join("Sample0000.I1.fq.gz");
let r2_reads = read_fastq(&r2_path);
assert_eq!(r2_reads.len(), 2);
}
fn seg(bases: &[u8], segment_type: SegmentType) -> FastqSegment {
let quals = vec![b'#'; bases.len()];
FastqSegment { seq: bases.to_vec(), quals, segment_type }
}
#[test]
fn test_write_header_standard_no_umi() {
let mut out = Vec::new();
let header = b"inst:123:ABCDE:1:204:1022:2108 1:N:0:0";
let barcode_segs =
[seg(b"ACGT", SegmentType::SampleBarcode), seg(b"GGTT", SegmentType::SampleBarcode)];
let umi_segs = [];
let expected = "@inst:123:ABCDE:1:204:1022:2108 1:N:0:ACGT+GGTT".to_string();
ReadSet::write_header_internal(
&mut out,
1,
header,
barcode_segs.iter().filter(|_| true),
umi_segs.iter().filter(|_| true),
)
.unwrap();
assert_eq!(String::from_utf8(out).unwrap(), expected);
}
#[test]
fn test_write_header_standard_with_umi() {
let mut out = Vec::new();
let header = b"inst:123:ABCDE:1:204:1022:2108 1:Y:0:0";
let barcode_segs =
[seg(b"ACGT", SegmentType::SampleBarcode), seg(b"GGTT", SegmentType::SampleBarcode)];
let umi_segs = [seg(b"AACCGGTT", SegmentType::MolecularBarcode)];
let expected = "@inst:123:ABCDE:1:204:1022:2108:AACCGGTT 2:Y:0:ACGT+GGTT".to_string();
ReadSet::write_header_internal(
&mut out,
2,
header,
barcode_segs.iter().filter(|_| true),
umi_segs.iter().filter(|_| true),
)
.unwrap();
assert_eq!(String::from_utf8(out).unwrap(), expected);
}
#[test]
fn test_write_header_append_barcode_and_umi() {
let mut out = Vec::new();
let header = b"inst:123:ABCDE:1:204:1022:2108:AAAA 1:Y:0:TTTT";
let barcode_segs =
[seg(b"ACGT", SegmentType::SampleBarcode), seg(b"GGTT", SegmentType::SampleBarcode)];
let umi_segs = [seg(b"AACCGGTT", SegmentType::MolecularBarcode)];
let expected =
"@inst:123:ABCDE:1:204:1022:2108:AAAA+AACCGGTT 2:Y:0:TTTT+ACGT+GGTT".to_string();
ReadSet::write_header_internal(
&mut out,
2,
header,
barcode_segs.iter().filter(|_| true),
umi_segs.iter().filter(|_| true),
)
.unwrap();
assert_eq!(String::from_utf8(out).unwrap(), expected);
}
#[test]
fn test_write_header_short_name_no_comment() {
let mut out = Vec::new();
let header = b"q1";
let barcode_segs =
[seg(b"ACGT", SegmentType::SampleBarcode), seg(b"GGTT", SegmentType::SampleBarcode)];
let umi_segs = [seg(b"AACCGGTT", SegmentType::MolecularBarcode)];
let expected = "@q1:AACCGGTT 1:N:0:ACGT+GGTT".to_string();
ReadSet::write_header_internal(
&mut out,
1,
header,
barcode_segs.iter().filter(|_| true),
umi_segs.iter().filter(|_| true),
)
.unwrap();
assert_eq!(String::from_utf8(out).unwrap(), expected);
}
#[test]
#[should_panic(expected = "8 segments")]
fn test_write_header_name_too_many_parts() {
let mut out = Vec::new();
let header = b"q1:1:2:3:4:5:6:7:8:9:10";
let barcode_segs =
[seg(b"ACGT", SegmentType::SampleBarcode), seg(b"GGTT", SegmentType::SampleBarcode)];
let umi_segs = [seg(b"AACCGGTT", SegmentType::MolecularBarcode)];
ReadSet::write_header_internal(
&mut out,
1,
header,
barcode_segs.iter().filter(|_| true),
umi_segs.iter().filter(|_| true),
)
.unwrap();
}
#[test]
fn test_write_header_comment_too_few_parts() {
let mut out = Vec::new();
let header = b"q1 0:0";
let barcode_segs =
[seg(b"ACGT", SegmentType::SampleBarcode), seg(b"GGTT", SegmentType::SampleBarcode)];
let umi_segs = [seg(b"AACCGGTT", SegmentType::MolecularBarcode)];
let expected = "@q1:AACCGGTT 0:0:ACGT+GGTT".to_string();
ReadSet::write_header_internal(
&mut out,
1,
header,
barcode_segs.iter().filter(|_| true),
umi_segs.iter().filter(|_| true),
)
.unwrap();
assert_eq!(String::from_utf8(out).unwrap(), expected);
}
#[test]
fn test_sample_barcode_sequence() {
let segments = vec![
seg("AGCT".as_bytes(), SegmentType::Template),
seg("GATA".as_bytes(), SegmentType::SampleBarcode),
seg("CAC".as_bytes(), SegmentType::SampleBarcode),
seg("GACCCC".as_bytes(), SegmentType::MolecularBarcode),
];
let read_set = read_set(segments);
assert_eq!(read_set.sample_barcode_sequence(), "GATACAC".as_bytes().to_owned());
}
#[test]
fn test_template_segments() {
let segments = vec![
seg("AGCT".as_bytes(), SegmentType::SampleBarcode),
seg("GATA".as_bytes(), SegmentType::Template),
seg("CAC".as_bytes(), SegmentType::Template),
seg("GACCCC".as_bytes(), SegmentType::MolecularBarcode),
];
let expected = vec![
seg("GATA".as_bytes(), SegmentType::Template),
seg("CAC".as_bytes(), SegmentType::Template),
];
let read_set = read_set(segments);
assert_eq!(expected, read_set.template_segments().cloned().collect::<Vec<_>>());
}
#[test]
fn test_sample_barcode_segments() {
let segments = vec![
seg("AGCT".as_bytes(), SegmentType::Template),
seg("GATA".as_bytes(), SegmentType::SampleBarcode),
seg("CAC".as_bytes(), SegmentType::SampleBarcode),
seg("GACCCC".as_bytes(), SegmentType::MolecularBarcode),
];
let expected = vec![
seg("GATA".as_bytes(), SegmentType::SampleBarcode),
seg("CAC".as_bytes(), SegmentType::SampleBarcode),
];
let read_set = read_set(segments);
assert_eq!(expected, read_set.sample_barcode_segments().cloned().collect::<Vec<_>>());
}
#[test]
fn test_molecular_barcode_segments() {
let segments = vec![
seg("AGCT".as_bytes(), SegmentType::Template),
seg("GATA".as_bytes(), SegmentType::MolecularBarcode),
seg("CAC".as_bytes(), SegmentType::MolecularBarcode),
seg("GACCCC".as_bytes(), SegmentType::SampleBarcode),
];
let expected = vec![
seg("GATA".as_bytes(), SegmentType::MolecularBarcode),
seg("CAC".as_bytes(), SegmentType::MolecularBarcode),
];
let read_set = read_set(segments);
assert_eq!(expected, read_set.molecular_barcode_segments().cloned().collect::<Vec<_>>());
}
#[test]
fn test_combine_readsets() {
let segments1 = vec![
seg("A".as_bytes(), SegmentType::Template),
seg("G".as_bytes(), SegmentType::Template),
seg("C".as_bytes(), SegmentType::MolecularBarcode),
seg("T".as_bytes(), SegmentType::SampleBarcode),
];
let read_set1 = read_set(segments1.clone());
let segments2 = vec![
seg("AA".as_bytes(), SegmentType::Template),
seg("AG".as_bytes(), SegmentType::Template),
seg("AC".as_bytes(), SegmentType::MolecularBarcode),
seg("AT".as_bytes(), SegmentType::SampleBarcode),
];
let read_set2 = read_set(segments2.clone());
let segments3 = vec![
seg("AAA".as_bytes(), SegmentType::Template),
seg("AAG".as_bytes(), SegmentType::Template),
seg("AAC".as_bytes(), SegmentType::MolecularBarcode),
seg("AAT".as_bytes(), SegmentType::SampleBarcode),
];
let read_set3 = read_set(segments3.clone());
let mut expected_segments = Vec::new();
expected_segments.extend(segments1);
expected_segments.extend(segments2);
expected_segments.extend(segments3);
let expected = read_set(expected_segments);
let result = ReadSet::combine_readsets(vec![read_set1, read_set2, read_set3]);
assert_eq!(result, expected);
}
#[test]
#[should_panic(expected = "Cannot call combine readsets on an empty vec!")]
fn test_combine_readsets_fails_on_empty_vector() {
let _result = ReadSet::combine_readsets(Vec::new());
}
}