use crate::commands::command::Command;
use anyhow::{Result, anyhow, ensure};
use bstr::ByteSlice;
use clap::Parser;
use fgoxide::io::{DelimFile, Io};
use fgoxide::iter::IntoChunkedReadAheadIterator;
use fqtk_lib::barcode_matching::BarcodeMatcher;
use fqtk_lib::samples::SampleGroup;
use itertools::Itertools;
use log::info;
use pooled_writer::{Pool, PoolBuilder, PooledWriter, bgzf::BgzfCompressor};
use proglog::{CountFormatterKind, ProgLogBuilder};
use read_structure::ReadStructure;
use read_structure::ReadStructureError;
use read_structure::SegmentType;
use seq_io::fastq::Reader as FastqReader;
use seq_io::fastq::Record;
use serde::{Deserialize, Serialize};
use std::collections::{HashMap, HashSet};
use std::fmt::Display;
use std::fs::File;
use std::io::{BufRead, BufWriter, Write};
use std::iter::Filter;
use std::slice::Iter;
use std::str::FromStr;
use std::{
fs,
path::{Path, PathBuf},
};
type VecOfReaders = Vec<Box<dyn BufRead + Send>>;
type SegmentIter<'a> = Filter<Iter<'a, FastqSegment>, fn(&&FastqSegment) -> bool>;
const BUFFER_SIZE: usize = 1024 * 1024;
#[derive(PartialEq, Eq, Debug, Clone)]
struct FastqSegment {
seq: Vec<u8>,
quals: Vec<u8>,
segment_type: SegmentType,
source_index: usize,
}
#[derive(Eq, Hash, PartialEq, Debug, Clone, Copy)]
enum SkipReason {
TooFewBases,
}
impl Display for SkipReason {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
SkipReason::TooFewBases => write!(f, "Too few bases"),
}
}
}
impl FromStr for SkipReason {
type Err = anyhow::Error;
fn from_str(string: &str) -> Result<Self, Self::Err> {
match string {
"too few bases" | "too-few-bases" | "toofewbases" => Ok(SkipReason::TooFewBases),
_ => Err(anyhow!("Invalid skip reason: {}", string)),
}
}
}
#[derive(PartialEq, Eq, Debug, Clone)]
struct RawInput {
bases: Vec<u8>,
quals: Vec<u8>,
}
#[derive(Debug)]
enum ResegmentError {
TooShort(anyhow::Error),
Other(anyhow::Error),
}
const TOO_FEW_BASES_HINT: &str =
"pass `--skip-reasons too-few-bases` to skip reads that are too short instead of aborting";
#[derive(PartialEq, Debug, Clone)]
struct ReadSet {
header: Vec<u8>,
segments: Vec<FastqSegment>,
raw_per_input: Vec<RawInput>,
skip_reason: Option<SkipReason>,
}
impl ReadSet {
const PREFIX: u8 = b'@';
const SPACE: u8 = b' ';
const COLON: u8 = b':';
const PLUS: u8 = b'+';
fn template_segments(&self) -> SegmentIter {
self.segments.iter().filter(|s| s.segment_type == SegmentType::Template)
}
fn sample_barcode_segments(&self) -> SegmentIter {
self.segments.iter().filter(|s| s.segment_type == SegmentType::SampleBarcode)
}
fn molecular_barcode_segments(&self) -> SegmentIter {
self.segments.iter().filter(|s| s.segment_type == SegmentType::MolecularBarcode)
}
fn cellular_barcode_segments(&self) -> SegmentIter {
self.segments.iter().filter(|s| s.segment_type == SegmentType::CellularBarcode)
}
fn sample_barcode_sequence(&self) -> Vec<u8> {
self.sample_barcode_segments().flat_map(|s| &s.seq).copied().collect()
}
fn matching_window_bases(&self, prefix_lens: &[usize]) -> Vec<u8> {
let total: usize = prefix_lens.iter().sum();
let mut out = Vec::with_capacity(total);
for (raw, &plen) in self.raw_per_input.iter().zip(prefix_lens) {
debug_assert!(
raw.bases.len() >= plen,
"raw bases length {} < matching prefix length {}; ReadSetIterator should have \
gated this read",
raw.bases.len(),
plen,
);
out.extend_from_slice(&raw.bases[..plen]);
}
out
}
fn resegment(self, read_structures: &[ReadStructure]) -> Result<Self, ResegmentError> {
assert_eq!(
read_structures.len(),
self.raw_per_input.len(),
"resegment requires one read structure per input FASTQ ({} vs {})",
read_structures.len(),
self.raw_per_input.len(),
);
let total_segments: usize = read_structures.iter().map(|rs| rs.number_of_segments()).sum();
let mut segments = Vec::with_capacity(total_segments);
for (source_index, (raw, rs)) in
self.raw_per_input.iter().zip(read_structures.iter()).enumerate()
{
for seg in rs.iter() {
let (s, q) = match seg.extract_bases_and_quals(&raw.bases, &raw.quals) {
Ok(ok) => ok,
Err(e) => {
let is_length_error = matches!(
e,
ReadStructureError::ReadEndsBeforeSegment(_)
| ReadStructureError::ReadEndsAfterSegment(_)
);
let wrapped = anyhow!(
"Error extracting bases (len: {}) for read structure ({}) from \
FASTQ record {}: {}",
raw.bases.len(),
rs,
String::from_utf8_lossy(&self.header),
e,
);
return Err(if is_length_error {
ResegmentError::TooShort(wrapped)
} else {
ResegmentError::Other(wrapped)
});
}
};
if seg.kind == SegmentType::Template && s.is_empty() {
return Err(ResegmentError::TooShort(anyhow!(
"Read structure ({}) produced a zero-length template for FASTQ record {} \
(len: {})",
rs,
String::from_utf8_lossy(&self.header),
raw.bases.len(),
)));
}
segments.push(FastqSegment {
seq: s.to_vec(),
quals: q.to_vec(),
segment_type: seg.kind,
source_index,
});
}
}
Ok(Self {
header: self.header,
segments,
raw_per_input: Vec::new(),
skip_reason: self.skip_reason,
})
}
fn combine_readsets(readsets: Vec<Self>) -> Self {
assert!(!readsets.is_empty(), "Cannot call combine readsets on an empty vec!");
let total_segments: usize = readsets.iter().map(|s| s.segments.len()).sum();
let mut readset_iter = readsets.into_iter();
let mut first = readset_iter.next().expect("Cannot call combine readsets on an empty vec!");
first.segments.reserve_exact(total_segments - first.segments.len());
first.raw_per_input.reserve_exact(readset_iter.len());
for next_readset in readset_iter {
first.segments.extend(next_readset.segments);
first.raw_per_input.extend(next_readset.raw_per_input);
}
first
}
fn segments_for_source(
&self,
source_index: usize,
template_types: &HashSet<SegmentType>,
) -> (Vec<u8>, Vec<u8>) {
let mut seq = Vec::new();
let mut quals = Vec::new();
for segment in &self.segments {
if segment.source_index == source_index
&& template_types.contains(&segment.segment_type)
{
seq.extend_from_slice(&segment.seq);
quals.extend_from_slice(&segment.quals);
}
}
(seq, quals)
}
fn write_header<W: Write>(
&self,
writer: &mut W,
read_num: usize,
include_umi: bool,
) -> Result<()> {
Self::write_header_internal(
writer,
read_num,
self.header.as_slice(),
self.sample_barcode_segments(),
self.molecular_barcode_segments(),
include_umi,
)
}
fn write_header_internal<W: Write>(
writer: &mut W,
read_num: usize,
header: &[u8],
sample_barcode_segments: SegmentIter,
mut molecular_barcode_segments: SegmentIter,
include_umi: bool,
) -> Result<()> {
let (name, comment) = match header.find_byte(Self::SPACE) {
Some(x) => (&header[0..x], Some(&header[x + 1..])),
None => (header, None),
};
writer.write_all(&[Self::PREFIX])?;
let first_umi_segment = if include_umi { molecular_barcode_segments.next() } else { None };
if let Some(first_seg) = first_umi_segment {
let sep_count = name.iter().filter(|c| **c == Self::COLON).count();
ensure!(
sep_count <= 7,
"Can't handle read name with more than 8 segments: {}",
String::from_utf8(header.to_vec())?
);
writer.write_all(name)?;
if sep_count == 7 {
writer.write_all(&[Self::PLUS])?;
} else {
writer.write_all(&[Self::COLON])?;
}
writer.write_all(first_seg.seq.as_slice())?;
for seg in molecular_barcode_segments {
writer.write_all(&[Self::PLUS])?;
writer.write_all(seg.seq.as_slice())?;
}
} else {
writer.write_all(name)?;
}
writer.write_all(&[Self::SPACE])?;
match comment {
None => {
write!(writer, "{}:N:0:", read_num)?;
}
Some(chars) => {
let sep_count = chars.iter().filter(|c| **c == Self::COLON).count();
if sep_count < 3 {
writer.write_all(chars)?;
if *chars.last().unwrap() != Self::COLON {
writer.write_all(&[Self::COLON])?;
}
} else {
ensure!(
sep_count == 3,
"Comment in did not have 4 segments: {}",
String::from_utf8(header.to_vec())?
);
let first_colon_idx = chars.iter().position(|ch| *ch == Self::COLON).unwrap();
let remainder = if chars.last().unwrap().is_ascii_digit() {
&chars[first_colon_idx + 1..chars.len() - 1]
} else {
&chars[first_colon_idx + 1..chars.len()]
};
write!(writer, "{}:", read_num)?;
writer.write_all(remainder)?;
if *remainder.last().unwrap() != Self::COLON {
writer.write_all(&[Self::PLUS])?;
}
}
}
}
for (idx, seg) in sample_barcode_segments.enumerate() {
if idx > 0 {
writer.write_all(&[Self::PLUS])?;
}
writer.write_all(seg.seq.as_slice())?;
}
Ok(())
}
}
struct ReadSetIterator {
read_structure: ReadStructure,
source: FastqReader<Box<dyn BufRead + Send>>,
skip_reasons: Vec<SkipReason>,
source_index: usize,
keep_raw: bool,
min_len: usize,
}
impl Iterator for ReadSetIterator {
type Item = ReadSet;
fn next(&mut self) -> Option<Self::Item> {
if let Some(rec) = self.source.next() {
let next_fq_rec = rec.expect("Unexpected error parsing FASTQs");
let read_name = next_fq_rec.head();
let bases = next_fq_rec.seq();
let quals = next_fq_rec.qual();
if bases.len() < self.min_len {
if self.skip_reasons.contains(&SkipReason::TooFewBases) {
return Some(ReadSet {
header: read_name.to_vec(),
segments: vec![],
raw_per_input: vec![],
skip_reason: Some(SkipReason::TooFewBases),
});
}
panic!(
"Read {} had too few bases to demux {} vs. {} needed in read structure {}.",
String::from_utf8(read_name.to_vec()).unwrap(),
bases.len(),
self.min_len,
self.read_structure
);
}
if self.keep_raw {
return Some(ReadSet {
header: read_name.to_vec(),
segments: vec![],
raw_per_input: vec![RawInput { bases: bases.to_vec(), quals: quals.to_vec() }],
skip_reason: None,
});
}
let mut segments = Vec::with_capacity(self.read_structure.number_of_segments());
for (read_segment_index, read_segment) in self.read_structure.iter().enumerate() {
let (seq, quals) =
read_segment.extract_bases_and_quals(bases, quals).unwrap_or_else(|e| {
panic!(
"Error extracting bases (len: {}) or quals (len: {}) for the {}th read segment ({}) in read structure ({}) from FASTQ record with name {}; {}",
bases.len(),
quals.len(),
read_segment_index,
read_segment,
self.read_structure,
String::from_utf8(read_name.to_vec()).unwrap(),
e
)
});
segments.push(FastqSegment {
seq: seq.to_vec(),
quals: quals.to_vec(),
segment_type: read_segment.kind,
source_index: self.source_index,
});
}
Some(ReadSet {
header: read_name.to_vec(),
segments,
raw_per_input: vec![],
skip_reason: None,
})
} else {
None
}
}
}
impl ReadSetIterator {
pub fn new(
read_structure: ReadStructure,
source: FastqReader<Box<dyn BufRead + Send>>,
skip_reasons: Vec<SkipReason>,
source_index: usize,
keep_raw: bool,
min_len: usize,
) -> Self {
Self { read_structure, source, skip_reasons, source_index, keep_raw, min_len }
}
}
struct TemplateOutput {
types: HashSet<SegmentType>,
include_other_segments: bool,
umi_in_header: bool,
}
impl TemplateOutput {
fn from_types(types: HashSet<SegmentType>) -> Self {
let include_other_segments = types.iter().any(|t| *t != SegmentType::Template);
let umi_in_header = !types.contains(&SegmentType::MolecularBarcode);
Self { types, include_other_segments, umi_in_header }
}
}
struct SampleWriters<W: Write> {
name: String,
template_writers: Option<Vec<W>>,
sample_barcode_writers: Option<Vec<W>>,
molecular_barcode_writers: Option<Vec<W>>,
cellular_barcode_writers: Option<Vec<W>>,
}
impl<W: Write> SampleWriters<W> {
#[allow(clippy::type_complexity)]
fn into_parts(
self,
) -> (String, Option<Vec<W>>, Option<Vec<W>>, Option<Vec<W>>, Option<Vec<W>>) {
(
self.name,
self.template_writers,
self.sample_barcode_writers,
self.molecular_barcode_writers,
self.cellular_barcode_writers,
)
}
fn write(&mut self, read_set: &ReadSet, template: &TemplateOutput) -> Result<()> {
if let Some(writers) = &mut self.template_writers {
for (read_idx, (writer, segment)) in
writers.iter_mut().zip(read_set.template_segments()).enumerate()
{
read_set.write_header(writer, read_idx + 1, template.umi_in_header)?;
writer.write_all(b"\n")?;
if template.include_other_segments {
let (seq, quals) =
read_set.segments_for_source(segment.source_index, &template.types);
writer.write_all(&seq)?;
writer.write_all(b"\n+\n")?;
writer.write_all(&quals)?;
} else {
writer.write_all(segment.seq.as_slice())?;
writer.write_all(b"\n+\n")?;
writer.write_all(segment.quals.as_slice())?;
}
writer.write_all(b"\n")?;
}
}
for (writers_opt, segments) in [
(&mut self.sample_barcode_writers, &mut read_set.sample_barcode_segments()),
(&mut self.molecular_barcode_writers, &mut read_set.molecular_barcode_segments()),
(&mut self.cellular_barcode_writers, &mut read_set.cellular_barcode_segments()),
] {
if let Some(writers) = writers_opt {
for (read_idx, (writer, segment)) in writers.iter_mut().zip(segments).enumerate() {
read_set.write_header(writer, read_idx + 1, template.umi_in_header)?;
writer.write_all(b"\n")?;
writer.write_all(segment.seq.as_slice())?;
writer.write_all(b"\n+\n")?;
writer.write_all(segment.quals.as_slice())?;
writer.write_all(b"\n")?;
}
}
}
Ok(())
}
}
impl SampleWriters<PooledWriter> {
fn close(self) -> Result<()> {
for writers in [
self.template_writers,
self.sample_barcode_writers,
self.molecular_barcode_writers,
self.cellular_barcode_writers,
]
.into_iter()
.flatten()
{
writers.into_iter().try_for_each(PooledWriter::close)?;
}
Ok(())
}
}
#[allow(clippy::module_name_repetitions)]
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct DemuxMetric {
sample_id: String,
barcode: String,
templates: usize,
frac_templates: f64,
ratio_to_mean: f64,
ratio_to_best: f64,
}
impl DemuxMetric {
fn new(sample: &str, barcode: &str) -> DemuxMetric {
DemuxMetric {
sample_id: sample.to_string(),
barcode: barcode.to_string(),
templates: 0,
frac_templates: 0.0,
ratio_to_mean: 0.0,
ratio_to_best: 0.0,
}
}
fn update(samples: &mut [DemuxMetric], unmatched: &mut DemuxMetric) {
let sample_total: f64 = samples.iter().map(|s| s.templates as f64).sum();
let total = sample_total + unmatched.templates as f64;
let mean = sample_total / samples.len() as f64;
let best = samples.iter().map(|s| s.templates).max().unwrap_or(0) as f64;
for sample in samples {
sample.frac_templates = sample.templates as f64 / total;
sample.ratio_to_mean = sample.templates as f64 / mean;
sample.ratio_to_best = sample.templates as f64 / best;
}
unmatched.frac_templates = unmatched.templates as f64 / total;
unmatched.ratio_to_mean = unmatched.templates as f64 / mean;
unmatched.ratio_to_best = unmatched.templates as f64 / best;
}
}
#[derive(Parser, Debug)]
#[command(version)]
#[clap(verbatim_doc_comment)]
pub(crate) struct Demux {
#[clap(long, short = 'i', required = true, num_args = 1..)]
inputs: Vec<PathBuf>,
#[clap(long, short = 'r' , required = true, num_args = 1..)]
read_structures: Vec<ReadStructure>,
#[clap(long, short='b', default_value="T", num_args = 1.. )]
output_types: Vec<char>,
#[clap(long, short = 's', required = true)]
sample_metadata: PathBuf,
#[clap(long, short = 'o', required = true)]
output: PathBuf,
#[clap(long, short = 'u', default_value = "unmatched")]
unmatched_prefix: String,
#[clap(long, default_value = "1")]
max_mismatches: usize,
#[clap(long, short = 'd', default_value = "2")]
min_mismatch_delta: usize,
#[clap(long, short = 't', default_value = "8")]
threads: usize,
#[clap(long, short = 'c', default_value = "5")]
compression_level: usize,
#[clap(long, short = 'S')]
skip_reasons: Vec<SkipReason>,
#[clap(long, default_value = "T", num_args = 1..)]
template_types: Vec<char>,
}
impl Demux {
fn create_sample_writers(
read_structures: &[ReadStructure],
prefix: &str,
output_types: &HashSet<SegmentType>,
output_dir: &Path,
) -> Result<SampleWriters<BufWriter<File>>> {
let mut template_writers = None;
let mut sample_barcode_writers = None;
let mut molecular_barcode_writers = None;
let mut cellular_barcode_writers = None;
for output_type in output_types {
let mut output_type_writers = Vec::new();
let file_type_code = match output_type {
SegmentType::Template => 'R',
SegmentType::SampleBarcode => 'I',
SegmentType::MolecularBarcode => 'U',
SegmentType::CellularBarcode => 'C',
_ => 'S',
};
let segment_count: usize =
read_structures.iter().map(|s| s.segments_by_type(*output_type).count()).sum();
for idx in 1..=segment_count {
output_type_writers.push(BufWriter::new(File::create(
output_dir.join(format!("{}.{}{}.fq.gz", prefix, file_type_code, idx)),
)?));
}
match output_type {
SegmentType::Template => template_writers = Some(output_type_writers),
SegmentType::SampleBarcode => {
sample_barcode_writers = Some(output_type_writers);
}
SegmentType::MolecularBarcode => {
molecular_barcode_writers = Some(output_type_writers);
}
SegmentType::CellularBarcode => {
cellular_barcode_writers = Some(output_type_writers);
}
_ => {}
}
}
Ok(SampleWriters {
name: prefix.to_string(),
template_writers,
sample_barcode_writers,
molecular_barcode_writers,
cellular_barcode_writers,
})
}
fn create_writers(
read_structures: &[ReadStructure],
sample_group: &SampleGroup,
output_types: &HashSet<SegmentType>,
unmatched_prefix: &str,
output_dir: &Path,
) -> Result<Vec<SampleWriters<BufWriter<File>>>> {
let mut samples_writers = Vec::with_capacity(sample_group.samples.len());
for sample in &sample_group.samples {
let rs = sample.read_structures.as_deref().unwrap_or(read_structures);
samples_writers.push(Self::create_sample_writers(
rs,
&sample.sample_id,
output_types,
output_dir,
)?);
}
samples_writers.push(Self::create_sample_writers(
read_structures,
unmatched_prefix,
output_types,
output_dir,
)?);
Ok(samples_writers)
}
fn build_writer_pool(
sample_writers: Vec<SampleWriters<BufWriter<File>>>,
compression_level: usize,
threads: usize,
) -> Result<(Pool, Vec<SampleWriters<PooledWriter>>)> {
let mut new_sample_writers = Vec::with_capacity(sample_writers.len());
let mut pool_builder = PoolBuilder::<_, BgzfCompressor>::new()
.threads(threads)
.queue_size(threads * 50)
.compression_level(u8::try_from(compression_level)?)?;
for sample in sample_writers {
let (name, template_writers, barcode_writers, mol_writers, cb_writers) =
sample.into_parts();
let mut new_template_writers = None;
let mut new_sample_barcode_writers = None;
let mut new_molecular_barcode_writers = None;
let mut new_cellular_barcode_writers = None;
for (optional_ws, target) in [
(template_writers, &mut new_template_writers),
(barcode_writers, &mut new_sample_barcode_writers),
(mol_writers, &mut new_molecular_barcode_writers),
(cb_writers, &mut new_cellular_barcode_writers),
] {
if let Some(ws) = optional_ws {
let mut new_writers = Vec::with_capacity(ws.len());
for writer in ws {
new_writers.push(pool_builder.exchange(writer));
}
_ = target.insert(new_writers);
}
}
new_sample_writers.push(SampleWriters {
name,
template_writers: new_template_writers,
sample_barcode_writers: new_sample_barcode_writers,
molecular_barcode_writers: new_molecular_barcode_writers,
cellular_barcode_writers: new_cellular_barcode_writers,
});
}
let pool = pool_builder.build()?;
Ok((pool, new_sample_writers))
}
fn parse_segment_types(
chars: &[char],
error_prefix: &str,
errors: &mut Vec<String>,
) -> Option<HashSet<SegmentType>> {
match chars.iter().map(|&c| SegmentType::try_from(c)).collect::<Result<HashSet<_>, _>>() {
Ok(set) => Some(set),
Err(e) => {
errors.push(format!("{error_prefix}: {e}"));
None
}
}
}
fn validate_and_prepare_inputs(
&self,
) -> Result<(VecOfReaders, HashSet<SegmentType>, HashSet<SegmentType>)> {
let mut constraint_errors = vec![];
if self.inputs.len() != self.read_structures.len() {
let preamble = "The same number of read structures should be given as FASTQs";
let specifics = format!(
"{} read-structures provided for {} FASTQs",
self.read_structures.len(),
self.inputs.len()
);
constraint_errors.push(format!("{preamble} {specifics}"));
}
if !self.output.exists() {
info!("Output directory {:#?} didn't exist, creating it.", self.output);
fs::create_dir_all(&self.output)?;
}
if self.output.metadata()?.permissions().readonly() {
constraint_errors
.push(format!("Ouput directory {:#?} cannot be read-only", self.output));
}
let output_segment_types_result = Self::parse_segment_types(
&self.output_types,
"Error parsing segment types to report",
&mut constraint_errors,
);
let template_segment_types_result = Self::parse_segment_types(
&self.template_types,
"Error parsing template segment types",
&mut constraint_errors,
);
if let (Some(output_types), Some(template_types)) =
(&output_segment_types_result, &template_segment_types_result)
{
if !template_types.contains(&SegmentType::Template) {
constraint_errors
.push("--template-types must include T (template segment)".to_owned());
}
let has_non_template = template_types.iter().any(|t| *t != SegmentType::Template);
if has_non_template && !output_types.contains(&SegmentType::Template) {
constraint_errors.push(
"--template-types with non-T segments requires T in --output-types".to_owned(),
);
}
if has_non_template {
for (idx, rs) in self.read_structures.iter().enumerate() {
let template_count = rs.segments_by_type(SegmentType::Template).count();
if template_count > 1 {
constraint_errors.push(format!(
"--template-types with non-T segments is not supported when a read structure has multiple template segments (read structure #{} has {})",
idx + 1, template_count
));
}
}
for segment_type in template_types.iter().filter(|t| **t != SegmentType::Template) {
let type_char = segment_type.value();
let present_anywhere = self
.read_structures
.iter()
.any(|rs| rs.segments_by_type(*segment_type).count() > 0);
if !present_anywhere {
constraint_errors.push(format!(
"--template-types includes {type_char} but no read structure contains that segment type"
));
continue;
}
for (idx, rs) in self.read_structures.iter().enumerate() {
let has_type = rs.segments_by_type(*segment_type).count() > 0;
let has_template = rs.segments_by_type(SegmentType::Template).count() > 0;
if has_type && !has_template {
constraint_errors.push(format!(
"--template-types includes {type_char} but read structure #{} contains {type_char} with no template (T) segment to merge it into (segments are only merged within the same read); add {type_char} to --output-types, or remove it from --template-types so it is written to the read header instead",
idx + 1
));
}
}
}
}
let cellular_present = self
.read_structures
.iter()
.any(|rs| rs.segments_by_type(SegmentType::CellularBarcode).count() > 0);
if cellular_present
&& !output_types.contains(&SegmentType::CellularBarcode)
&& !template_types.contains(&SegmentType::CellularBarcode)
{
constraint_errors.push(
"Cellular barcode (C) segments are present but assigned to neither --output-types nor --template-types, and have no read-header field; add C to one of them".to_owned(),
);
}
}
for input in &self.inputs {
if !input.exists() {
constraint_errors.push(format!("Provided input file {:#?} doesn't exist", input));
}
}
let fgio = Io::new(5, BUFFER_SIZE);
let fq_readers_result = self
.inputs
.iter()
.map(|p| fgio.new_reader(p))
.collect::<Result<VecOfReaders, fgoxide::FgError>>();
if let Err(e) = &fq_readers_result {
constraint_errors.push(format!("Error opening input files for reading: {}", e));
}
if self.threads < 5 {
constraint_errors
.push(format!("Threads provided {} was too low! Must be 5 or more.", self.threads));
}
if constraint_errors.is_empty() {
let output_segment_types =
output_segment_types_result.expect("output segment types parsed without errors");
let template_segment_types = template_segment_types_result
.expect("template segment types parsed without errors");
if output_segment_types.is_empty() {
constraint_errors.push(
"No output types requested, must request at least one output segment type."
.to_owned(),
);
} else {
return Ok((fq_readers_result?, output_segment_types, template_segment_types));
}
}
let mut details = "Inputs failed validation!\n".to_owned();
for error_reason in constraint_errors {
details.push_str(&format!(" - {}\n", error_reason));
}
Err(anyhow!("The following errors with the input(s) were detected:\n{}", details))
}
}
impl Command for Demux {
#[allow(clippy::too_many_lines)]
fn execute(&self) -> Result<()> {
let (fq_readers, output_segment_types, template_segment_types) =
self.validate_and_prepare_inputs()?;
let template_output = TemplateOutput::from_types(template_segment_types);
let sample_group = SampleGroup::from_file(&self.sample_metadata, &self.read_structures)?;
info!(
"{} samples loaded from file {:?}",
sample_group.samples.len(),
&self.sample_metadata
);
let fq_sources =
fq_readers.into_iter().map(|fq| FastqReader::with_capacity(fq, BUFFER_SIZE));
let reader_threads = if self.threads <= 6 { 1 } else { 2 };
let main_thread = 1;
let writer_threads = self.threads - main_thread - reader_threads;
let (mut pool, mut sample_writers) = Self::build_writer_pool(
Self::create_writers(
&self.read_structures,
&sample_group,
&output_segment_types,
&self.unmatched_prefix,
&self.output,
)?,
self.compression_level,
writer_threads,
)?;
let unmatched_index = sample_writers.len() - 1;
info!("Created sample and {} writers.", self.unmatched_prefix);
let mut sample_metrics = sample_group
.samples
.iter()
.map(|s| DemuxMetric::new(s.sample_id.as_str(), s.barcode.as_str()))
.collect_vec();
let mut unmatched_metric = DemuxMetric::new(self.unmatched_prefix.as_str(), ".");
let per_sample_rs = sample_group.has_per_sample_read_structures();
let prefix_lens = if per_sample_rs {
sample_group.matching_prefix_lens(&self.read_structures)?
} else {
Vec::new()
};
let matching_patterns = if per_sample_rs {
sample_group.build_matching_patterns(&self.read_structures, &prefix_lens)?
} else {
Vec::new()
};
let mut barcode_matcher = if per_sample_rs {
BarcodeMatcher::with_patterns(
&sample_group.samples,
matching_patterns,
u8::try_from(self.max_mismatches)?,
u8::try_from(self.min_mismatch_delta)?,
true,
)
} else {
BarcodeMatcher::new(
&sample_group.samples,
u8::try_from(self.max_mismatches)?,
u8::try_from(self.min_mismatch_delta)?,
true,
)
};
let mut fq_iterators = fq_sources
.zip(self.read_structures.clone())
.enumerate()
.map(|(source_index, (source, read_structure))| {
let min_len = if per_sample_rs {
prefix_lens[source_index]
} else {
read_structure.iter().map(|s| s.length().unwrap_or(1)).sum()
};
ReadSetIterator::new(
read_structure,
source,
self.skip_reasons.clone(),
source_index,
per_sample_rs,
min_len,
)
.read_ahead(1000, 1000)
})
.collect::<Vec<_>>();
let logger = ProgLogBuilder::new()
.name("fqtk")
.noun("records")
.verb("demultiplexed")
.unit(1_000_000)
.count_formatter(CountFormatterKind::Comma)
.level(log::Level::Info)
.build();
let allow_too_few = self.skip_reasons.contains(&SkipReason::TooFewBases);
let mut skip_reasons: HashMap<SkipReason, usize> = HashMap::new();
loop {
let mut next_read_sets = Vec::with_capacity(fq_iterators.len());
for iter in &mut fq_iterators {
if let Some(rec) = iter.next() {
next_read_sets.push(rec);
}
}
if let Some(reason) = next_read_sets.iter().find_map(|read_set| read_set.skip_reason) {
let previous = skip_reasons.get(&reason).unwrap_or(&0);
skip_reasons.insert(reason, 1 + previous);
continue;
} else if next_read_sets.is_empty() {
break;
}
assert_eq!(
next_read_sets.len(),
fq_iterators.len(),
"FASTQ sources out of sync at records: {:?}",
next_read_sets
);
let read_set = ReadSet::combine_readsets(next_read_sets);
let observed: Vec<u8> = if per_sample_rs {
read_set.matching_window_bases(&prefix_lens)
} else {
read_set.sample_barcode_sequence()
};
if let Some(barcode_match) = barcode_matcher.assign(&observed) {
let matched_idx = barcode_match.best_match;
if per_sample_rs {
let rs_for_sample = sample_group.samples[matched_idx]
.read_structures
.as_deref()
.unwrap_or(&self.read_structures);
match read_set.resegment(rs_for_sample) {
Ok(resegmented) => {
sample_writers[matched_idx].write(&resegmented, &template_output)?;
sample_metrics[matched_idx].templates += 1;
}
Err(ResegmentError::TooShort(_)) if allow_too_few => {
*skip_reasons.entry(SkipReason::TooFewBases).or_insert(0) += 1;
}
Err(ResegmentError::TooShort(e)) => {
return Err(e.context(TOO_FEW_BASES_HINT));
}
Err(ResegmentError::Other(e)) => return Err(e),
}
} else {
sample_writers[matched_idx].write(&read_set, &template_output)?;
sample_metrics[matched_idx].templates += 1;
}
} else if per_sample_rs {
match read_set.resegment(&self.read_structures) {
Ok(resegmented) => {
sample_writers[unmatched_index].write(&resegmented, &template_output)?;
unmatched_metric.templates += 1;
}
Err(ResegmentError::TooShort(_)) if allow_too_few => {
*skip_reasons.entry(SkipReason::TooFewBases).or_insert(0) += 1;
}
Err(ResegmentError::TooShort(e)) => return Err(e.context(TOO_FEW_BASES_HINT)),
Err(ResegmentError::Other(e)) => return Err(e),
}
} else {
sample_writers[unmatched_index].write(&read_set, &template_output)?;
unmatched_metric.templates += 1;
}
logger.record();
}
info!("Finished reading input FASTQs.");
sample_writers.into_iter().map(SampleWriters::close).collect::<Result<Vec<_>>>()?;
pool.stop_pool()?;
info!("Output FASTQ writing complete.");
if skip_reasons.is_empty() {
info!("No records were skipped.");
} else {
for (reason, count) in skip_reasons.iter().sorted_by_key(|(_, c)| *c) {
info!("{} records were skipped due to {}", count, reason);
}
}
let metrics_path = self.output.join("demux-metrics.txt");
DemuxMetric::update(sample_metrics.as_mut_slice(), &mut unmatched_metric);
sample_metrics.push(unmatched_metric);
DelimFile::default().write_tsv(&metrics_path, sample_metrics)?;
Ok(())
}
}
#[cfg(test)]
mod tests {
use super::*;
use fqtk_lib::samples::Sample;
use rstest::rstest;
use seq_io::fastq::OwnedRecord;
use std::str;
use std::str::FromStr;
use tempfile::TempDir;
const SAMPLE1_BARCODE: &str = "GATTGGG";
fn fq_lines_from_bases(prefix: &str, records_bases: &[&str]) -> Vec<String> {
let mut result = Vec::with_capacity(records_bases.len() * 4);
for (i, &bases) in records_bases.iter().enumerate() {
result.push(format!("@{}_{}", prefix, i));
result.push(bases.to_owned());
result.push("+".to_owned());
result.push(";".repeat(bases.len()));
}
result
}
fn fastq_file(
tmpdir: &TempDir,
filename_prefix: &str,
read_prefix: &str,
records_bases: &[&str],
) -> PathBuf {
let io = Io::default();
let path = tmpdir.path().join(format!("{filename_prefix}.fastq"));
let fastq_lines = fq_lines_from_bases(read_prefix, records_bases);
io.write_lines(&path, fastq_lines).unwrap();
path
}
fn metadata_lines_from_barcodes(barcodes: &[&str]) -> Vec<String> {
let mut result = Vec::with_capacity(barcodes.len() + 1);
result.push(Sample::deserialize_header_line());
for (i, &barcode) in barcodes.iter().enumerate() {
result.push(format!("Sample{:04}\t{}", i, barcode));
}
result
}
fn metadata_file(tmpdir: &TempDir, barcodes: &[&str]) -> PathBuf {
let io = Io::default();
let path = tmpdir.path().join("metadata.tsv");
let metadata_lines = metadata_lines_from_barcodes(barcodes);
io.write_lines(&path, metadata_lines).unwrap();
path
}
fn metadata(tmpdir: &TempDir) -> PathBuf {
metadata_file(tmpdir, &[SAMPLE1_BARCODE])
}
fn read_fastq(file_path: &PathBuf) -> Vec<OwnedRecord> {
let fg_io = Io::default();
FastqReader::new(fg_io.new_reader(file_path).unwrap())
.into_records()
.collect::<Result<Vec<_>, seq_io::fastq::Error>>()
.unwrap()
}
fn assert_equal(actual: &impl seq_io::fastq::Record, expected: &impl seq_io::fastq::Record) {
assert_eq!(
String::from_utf8(actual.head().to_vec()).unwrap(),
String::from_utf8(expected.head().to_vec()).unwrap()
);
assert_eq!(
String::from_utf8(actual.seq().to_vec()).unwrap(),
String::from_utf8(expected.seq().to_vec()).unwrap()
);
assert_eq!(
String::from_utf8(actual.qual().to_vec()).unwrap(),
String::from_utf8(expected.qual().to_vec()).unwrap()
);
}
fn read_set(segments: Vec<FastqSegment>) -> ReadSet {
ReadSet {
header: "NOT_IMPORTANT".as_bytes().to_owned(),
segments,
raw_per_input: vec![],
skip_reason: None,
}
}
#[test]
fn validate_inputs_can_succeed() {
let tmp = TempDir::new().unwrap();
let input_files = vec![
fastq_file(&tmp, "read1", "ex", &["GATTACA"]),
fastq_file(&tmp, "read2", "ex", &["TAGGATTA"]),
fastq_file(&tmp, "index1", "ex", &[&SAMPLE1_BARCODE[0..3]]),
fastq_file(&tmp, "index2", "ex", &[&SAMPLE1_BARCODE[3..]]),
];
let sample_metadata = metadata(&tmp);
let read_structures = vec![
ReadStructure::from_str("+T").unwrap(),
ReadStructure::from_str("+T").unwrap(),
ReadStructure::from_str("+B").unwrap(),
ReadStructure::from_str("+B").unwrap(),
];
let demux_inputs = Demux {
inputs: input_files,
read_structures,
sample_metadata,
output_types: vec!['T'],
output: tmp.path().to_path_buf(),
unmatched_prefix: "unmatched".to_owned(),
max_mismatches: 1,
min_mismatch_delta: 2,
threads: 5,
compression_level: 5,
skip_reasons: vec![],
template_types: vec!['T'],
};
demux_inputs.execute().unwrap();
}
#[rstest]
#[should_panic(expected = "The same number of read structures should be given as FASTQs")]
#[case(vec![
ReadStructure::from_str("+T").unwrap(),
ReadStructure::from_str("+T").unwrap(),
ReadStructure::from_str("+B").unwrap(),
])]
#[should_panic(expected = "The same number of read structures should be given as FASTQs")]
#[case(vec![
ReadStructure::from_str("+T").unwrap(),
ReadStructure::from_str("+T").unwrap(),
ReadStructure::from_str("+B").unwrap(),
ReadStructure::from_str("+B").unwrap(),
ReadStructure::from_str("+B").unwrap(),
])]
fn test_different_number_of_read_structs_and_inputs_fails(
#[case] read_structures: Vec<ReadStructure>,
) {
let tmp = TempDir::new().unwrap();
let input_files = vec![
fastq_file(&tmp, "read1", "ex", &["GATTACA"]),
fastq_file(&tmp, "read2", "ex", &["TAGGATTA"]),
fastq_file(&tmp, "index1", "ex", &[&SAMPLE1_BARCODE[0..3]]),
fastq_file(&tmp, "index2", "ex", &[&SAMPLE1_BARCODE[3..]]),
];
let sample_metadata = metadata(&tmp);
let demux_inputs = Demux {
inputs: input_files,
read_structures,
sample_metadata,
output_types: vec!['T'],
output: tmp.path().to_path_buf(),
unmatched_prefix: "unmatched".to_owned(),
max_mismatches: 1,
min_mismatch_delta: 2,
threads: 5,
compression_level: 5,
skip_reasons: vec![],
template_types: vec!['T'],
};
demux_inputs.execute().unwrap();
}
#[test]
#[should_panic(expected = "cannot be read-only")]
#[allow(clippy::permissions_set_readonly_false)]
fn test_read_only_output_dir_fails() {
let tmp = TempDir::new().unwrap();
let read_structures = vec![
ReadStructure::from_str("+T").unwrap(),
ReadStructure::from_str("+T").unwrap(),
ReadStructure::from_str("+B").unwrap(),
ReadStructure::from_str("+B").unwrap(),
];
let input_files = vec![
fastq_file(&tmp, "read1", "ex", &["GATTACA"]),
fastq_file(&tmp, "read2", "ex", &["TAGGATTA"]),
fastq_file(&tmp, "index1", "ex", &[&SAMPLE1_BARCODE[0..3]]),
fastq_file(&tmp, "index2", "ex", &[&SAMPLE1_BARCODE[3..]]),
];
let sample_metadata = metadata(&tmp);
let mut permissions = tmp.path().metadata().unwrap().permissions();
permissions.set_readonly(true);
fs::set_permissions(tmp.path(), permissions.clone()).unwrap();
let demux_inputs = Demux {
inputs: input_files,
read_structures,
sample_metadata,
output_types: vec!['T'],
output: tmp.path().to_path_buf(),
unmatched_prefix: "unmatched".to_owned(),
max_mismatches: 1,
min_mismatch_delta: 2,
threads: 5,
compression_level: 5,
skip_reasons: vec![],
template_types: vec!['T'],
};
let demux_result = demux_inputs.execute();
permissions.set_readonly(false);
fs::set_permissions(tmp.path(), permissions).unwrap();
demux_result.unwrap();
}
#[test]
#[should_panic(expected = "doesn't exist")]
fn test_inputs_doesnt_exist_fails() {
let tmp = TempDir::new().unwrap();
let read_structures = vec![
ReadStructure::from_str("+T").unwrap(),
ReadStructure::from_str("+T").unwrap(),
ReadStructure::from_str("+B").unwrap(),
ReadStructure::from_str("+B").unwrap(),
];
let input_files = vec![
tmp.path().join("this_file_does_not_exist.fq"),
fastq_file(&tmp, "read2", "ex", &["TAGGATTA"]),
fastq_file(&tmp, "index1", "ex", &[&SAMPLE1_BARCODE[0..3]]),
fastq_file(&tmp, "index2", "ex", &[&SAMPLE1_BARCODE[3..]]),
];
let sample_metadata = metadata(&tmp);
let demux_inputs = Demux {
inputs: input_files,
read_structures,
sample_metadata,
output_types: vec!['T'],
output: tmp.path().to_path_buf(),
unmatched_prefix: "unmatched".to_owned(),
max_mismatches: 1,
min_mismatch_delta: 2,
threads: 2,
compression_level: 5,
skip_reasons: vec![],
template_types: vec!['T'],
};
demux_inputs.execute().unwrap();
}
#[test]
#[should_panic(expected = "Threads provided 2 was too low!")]
fn test_too_few_threads_fails() {
let tmp = TempDir::new().unwrap();
let read_structures = vec![
ReadStructure::from_str("+T").unwrap(),
ReadStructure::from_str("+T").unwrap(),
ReadStructure::from_str("+B").unwrap(),
ReadStructure::from_str("+B").unwrap(),
];
let input_files = vec![
fastq_file(&tmp, "read1", "ex", &["GATTACA"]),
fastq_file(&tmp, "read2", "ex", &["TAGGATTA"]),
fastq_file(&tmp, "index1", "ex", &[&SAMPLE1_BARCODE[0..3]]),
fastq_file(&tmp, "index2", "ex", &[&SAMPLE1_BARCODE[3..]]),
];
let sample_metadata = metadata(&tmp);
let demux_inputs = Demux {
inputs: input_files,
read_structures,
sample_metadata,
output_types: vec!['T'],
output: tmp.path().to_path_buf(),
unmatched_prefix: "unmatched".to_owned(),
max_mismatches: 1,
min_mismatch_delta: 2,
threads: 2,
compression_level: 5,
skip_reasons: vec![],
template_types: vec!['T'],
};
demux_inputs.execute().unwrap();
}
#[test]
fn test_demux_fragment_reads() {
let tmp = TempDir::new().unwrap();
let read_structures = vec![ReadStructure::from_str("17B100T").unwrap()];
let s1_barcode = "AAAAAAAAGATTACAGA";
let sample_metadata = metadata_file(
&tmp,
&[s1_barcode, "CCCCCCCCGATTACAGA", "GGGGGGGGGATTACAGA", "GGGGGGTTGATTACAGA"],
);
let input_files =
vec![fastq_file(&tmp, "ex", "ex", &[&(s1_barcode.to_owned() + &"A".repeat(100))])];
let output_dir = tmp.path().to_path_buf().join("output");
let demux_inputs = Demux {
inputs: input_files,
read_structures,
sample_metadata,
output_types: vec!['T'],
output: output_dir.clone(),
unmatched_prefix: "unmatched".to_owned(),
max_mismatches: 1,
min_mismatch_delta: 2,
threads: 5,
compression_level: 5,
skip_reasons: vec![],
template_types: vec!['T'],
};
demux_inputs.execute().unwrap();
let output_path = output_dir.join("Sample0000.R1.fq.gz");
let fq_reads = read_fastq(&output_path);
assert_eq!(fq_reads.len(), 1);
assert_equal(
&fq_reads[0],
&OwnedRecord {
head: b"ex_0 1:N:0:AAAAAAAAGATTACAGA".to_vec(),
seq: "A".repeat(100).as_bytes().to_vec(),
qual: ";".repeat(100).as_bytes().to_vec(),
},
);
}
#[test]
fn test_template_types_includes_barcode() {
let tmp = TempDir::new().unwrap();
let read_structures = vec![ReadStructure::from_str("17B100T").unwrap()];
let s1_barcode = "AAAAAAAAGATTACAGA";
let template_seq = "T".repeat(100);
let full_read = s1_barcode.to_owned() + &template_seq;
let sample_metadata = metadata_file(&tmp, &[s1_barcode]);
let input_files = vec![fastq_file(&tmp, "ex", "ex", &[&full_read])];
let output_dir = tmp.path().to_path_buf().join("output");
let demux_inputs = Demux {
inputs: input_files,
read_structures,
sample_metadata,
output_types: vec!['T'],
output: output_dir.clone(),
unmatched_prefix: "unmatched".to_owned(),
max_mismatches: 1,
min_mismatch_delta: 2,
threads: 5,
compression_level: 5,
skip_reasons: vec![],
template_types: vec!['B', 'T'],
};
demux_inputs.execute().unwrap();
let output_path = output_dir.join("Sample0000.R1.fq.gz");
let fq_reads = read_fastq(&output_path);
assert_eq!(fq_reads.len(), 1);
assert_equal(
&fq_reads[0],
&OwnedRecord {
head: b"ex_0 1:N:0:AAAAAAAAGATTACAGA".to_vec(),
seq: full_read.as_bytes().to_vec(),
qual: ";".repeat(117).as_bytes().to_vec(),
},
);
}
#[test]
fn test_template_types_with_paired_reads() {
let tmp = TempDir::new().unwrap();
let read_structures = vec![
ReadStructure::from_str("8B92T").unwrap(),
ReadStructure::from_str("100T").unwrap(),
];
let s1_barcode = "AAAAAAAA";
let r1_full = s1_barcode.to_owned() + &"A".repeat(92);
let r2_full = "T".repeat(100);
let sample_metadata = metadata_file(&tmp, &[s1_barcode]);
let input_files = vec![
fastq_file(&tmp, "ex_R1", "ex", &[&r1_full]),
fastq_file(&tmp, "ex_R2", "ex", &[&r2_full]),
];
let output_dir = tmp.path().to_path_buf().join("output");
let demux_inputs = Demux {
inputs: input_files,
read_structures,
sample_metadata,
output_types: vec!['T'],
output: output_dir.clone(),
unmatched_prefix: "unmatched".to_owned(),
max_mismatches: 1,
min_mismatch_delta: 2,
threads: 5,
compression_level: 5,
skip_reasons: vec![],
template_types: vec!['B', 'T'],
};
demux_inputs.execute().unwrap();
let r1_path = output_dir.join("Sample0000.R1.fq.gz");
let r1_reads = read_fastq(&r1_path);
assert_eq!(r1_reads.len(), 1);
assert_equal(
&r1_reads[0],
&OwnedRecord {
head: b"ex_0 1:N:0:AAAAAAAA".to_vec(),
seq: r1_full.as_bytes().to_vec(),
qual: ";".repeat(100).as_bytes().to_vec(),
},
);
let r2_path = output_dir.join("Sample0000.R2.fq.gz");
let r2_reads = read_fastq(&r2_path);
assert_eq!(r2_reads.len(), 1);
assert_equal(
&r2_reads[0],
&OwnedRecord {
head: b"ex_0 2:N:0:AAAAAAAA".to_vec(),
seq: r2_full.as_bytes().to_vec(),
qual: ";".repeat(100).as_bytes().to_vec(),
},
);
}
#[test]
fn test_template_types_with_umi() {
let tmp = TempDir::new().unwrap();
let read_structures = vec![ReadStructure::from_str("8B8M84T").unwrap()];
let s1_barcode = "AAAAAAAA";
let umi = "GGGGGGGG";
let template = "T".repeat(84);
let full_read = s1_barcode.to_owned() + umi + &template;
let sample_metadata = metadata_file(&tmp, &[s1_barcode]);
let input_files = vec![fastq_file(&tmp, "ex", "ex", &[&full_read])];
let output_dir = tmp.path().to_path_buf().join("output");
let demux_inputs = Demux {
inputs: input_files,
read_structures,
sample_metadata,
output_types: vec!['T'],
output: output_dir.clone(),
unmatched_prefix: "unmatched".to_owned(),
max_mismatches: 1,
min_mismatch_delta: 2,
threads: 5,
compression_level: 5,
skip_reasons: vec![],
template_types: vec!['M', 'T'],
};
demux_inputs.execute().unwrap();
let output_path = output_dir.join("Sample0000.R1.fq.gz");
let fq_reads = read_fastq(&output_path);
assert_eq!(fq_reads.len(), 1);
let expected_seq = umi.to_owned() + &template;
assert_equal(
&fq_reads[0],
&OwnedRecord {
head: b"ex_0 1:N:0:AAAAAAAA".to_vec(),
seq: expected_seq.as_bytes().to_vec(),
qual: ";".repeat(92).as_bytes().to_vec(),
},
);
}
#[test]
#[should_panic(expected = "--template-types with non-T segments requires T in --output-types")]
fn test_template_types_requires_template_in_output_types() {
let tmp = TempDir::new().unwrap();
let read_structures = vec![ReadStructure::from_str("8B92T").unwrap()];
let s1_barcode = "AAAAAAAA";
let sample_metadata = metadata_file(&tmp, &[s1_barcode]);
let input_files = vec![fastq_file(&tmp, "ex", "ex", &["A".repeat(100).as_str()])];
let output_dir = tmp.path().to_path_buf().join("output");
let demux_inputs = Demux {
inputs: input_files,
read_structures,
sample_metadata,
output_types: vec!['B'],
output: output_dir.clone(),
unmatched_prefix: "unmatched".to_owned(),
max_mismatches: 1,
min_mismatch_delta: 2,
threads: 5,
compression_level: 5,
skip_reasons: vec![],
template_types: vec!['B', 'T'],
};
demux_inputs.execute().unwrap();
}
#[test]
#[should_panic(expected = "--template-types must include T (template segment)")]
fn test_template_types_must_include_template() {
let tmp = TempDir::new().unwrap();
let read_structures = vec![ReadStructure::from_str("8B8M84T").unwrap()];
let s1_barcode = "AAAAAAAA";
let sample_metadata = metadata_file(&tmp, &[s1_barcode]);
let input_files = vec![fastq_file(&tmp, "ex", "ex", &["A".repeat(100).as_str()])];
let output_dir = tmp.path().to_path_buf().join("output");
let demux_inputs = Demux {
inputs: input_files,
read_structures,
sample_metadata,
output_types: vec!['T'],
output: output_dir.clone(),
unmatched_prefix: "unmatched".to_owned(),
max_mismatches: 1,
min_mismatch_delta: 2,
threads: 5,
compression_level: 5,
skip_reasons: vec![],
template_types: vec!['B', 'M'],
};
demux_inputs.execute().unwrap();
}
#[test]
#[should_panic(
expected = "--template-types includes C but no read structure contains that segment type"
)]
fn test_template_types_absent_type_rejected() {
let tmp = TempDir::new().unwrap();
let read_structures = vec![ReadStructure::from_str("8B92T").unwrap()];
let s1_barcode = "AAAAAAAA";
let sample_metadata = metadata_file(&tmp, &[s1_barcode]);
let input_files = vec![fastq_file(&tmp, "ex", "ex", &["A".repeat(100).as_str()])];
let output_dir = tmp.path().to_path_buf().join("output");
let demux_inputs = Demux {
inputs: input_files,
read_structures,
sample_metadata,
output_types: vec!['T'],
output: output_dir.clone(),
unmatched_prefix: "unmatched".to_owned(),
max_mismatches: 1,
min_mismatch_delta: 2,
threads: 5,
compression_level: 5,
skip_reasons: vec![],
template_types: vec!['C', 'T'],
};
demux_inputs.execute().unwrap();
}
#[test]
#[should_panic(expected = "Error parsing template segment types")]
fn test_template_types_invalid_segment_type() {
let tmp = TempDir::new().unwrap();
let read_structures = vec![ReadStructure::from_str("8B92T").unwrap()];
let s1_barcode = "AAAAAAAA";
let sample_metadata = metadata_file(&tmp, &[s1_barcode]);
let input_files = vec![fastq_file(&tmp, "ex", "ex", &["A".repeat(100).as_str()])];
let output_dir = tmp.path().to_path_buf().join("output");
let demux_inputs = Demux {
inputs: input_files,
read_structures,
sample_metadata,
output_types: vec!['T'],
output: output_dir.clone(),
unmatched_prefix: "unmatched".to_owned(),
max_mismatches: 1,
min_mismatch_delta: 2,
threads: 5,
compression_level: 5,
skip_reasons: vec![],
template_types: vec!['X', 'T'],
};
demux_inputs.execute().unwrap();
}
#[test]
#[should_panic(
expected = "--template-types with non-T segments is not supported when a read structure has multiple template segments (read structure #1 has 2)"
)]
fn test_template_types_non_t_rejects_multi_template_read_structure() {
let tmp = TempDir::new().unwrap();
let read_structures = vec![ReadStructure::from_str("8B20T20S20T").unwrap()];
let s1_barcode = "AAAAAAAA";
let sample_metadata = metadata_file(&tmp, &[s1_barcode]);
let input_files = vec![fastq_file(
&tmp,
"ex",
"ex",
&[&(s1_barcode.to_owned() + &"A".repeat(20) + &"C".repeat(20) + &"T".repeat(20))],
)];
let output_dir = tmp.path().to_path_buf().join("output");
let demux_inputs = Demux {
inputs: input_files,
read_structures,
sample_metadata,
output_types: vec!['T'],
output: output_dir.clone(),
unmatched_prefix: "unmatched".to_owned(),
max_mismatches: 1,
min_mismatch_delta: 2,
threads: 5,
compression_level: 5,
skip_reasons: vec![],
template_types: vec!['B', 'T'],
};
demux_inputs.execute().unwrap();
}
#[test]
#[should_panic(
expected = "--template-types includes M but read structure #1 contains M with no template (T) segment to merge it into"
)]
fn test_template_types_non_colocated_segment_rejected() {
let tmp = TempDir::new().unwrap();
let read_structures =
vec![ReadStructure::from_str("8M").unwrap(), ReadStructure::from_str("100T").unwrap()];
let s1_barcode = "AAAAAAAA";
let sample_metadata = metadata_file(&tmp, &[s1_barcode]);
let input_files = vec![
fastq_file(&tmp, "r1", "r1", &["GGGGGGGG"]),
fastq_file(&tmp, "r2", "r2", &["A".repeat(100).as_str()]),
];
let output_dir = tmp.path().to_path_buf().join("output");
let demux_inputs = Demux {
inputs: input_files,
read_structures,
sample_metadata,
output_types: vec!['T'],
output: output_dir.clone(),
unmatched_prefix: "unmatched".to_owned(),
max_mismatches: 1,
min_mismatch_delta: 2,
threads: 5,
compression_level: 5,
skip_reasons: vec![],
template_types: vec!['M', 'T'],
};
demux_inputs.execute().unwrap();
}
#[test]
#[should_panic(
expected = "Cellular barcode (C) segments are present but assigned to neither --output-types nor --template-types"
)]
fn test_cellular_barcode_without_destination_rejected() {
let tmp = TempDir::new().unwrap();
let read_structures = vec![ReadStructure::from_str("8B7C85T").unwrap()];
let s1_barcode = "AAAAAAAA";
let sample_metadata = metadata_file(&tmp, &[s1_barcode]);
let input_files = vec![fastq_file(&tmp, "ex", "ex", &["A".repeat(100).as_str()])];
let output_dir = tmp.path().to_path_buf().join("output");
let demux_inputs = Demux {
inputs: input_files,
read_structures,
sample_metadata,
output_types: vec!['T'],
output: output_dir.clone(),
unmatched_prefix: "unmatched".to_owned(),
max_mismatches: 1,
min_mismatch_delta: 2,
threads: 5,
compression_level: 5,
skip_reasons: vec![],
template_types: vec!['T'],
};
demux_inputs.execute().unwrap();
}
#[test]
fn test_template_types_all_segments_with_skip() {
let tmp = TempDir::new().unwrap();
let read_structures = vec![ReadStructure::from_str("8B4S8M80T").unwrap()];
let s1_barcode = "AAAAAAAA";
let skip = "NNNN";
let umi = "GGGGGGGG";
let template = "T".repeat(80);
let full_read = s1_barcode.to_owned() + skip + umi + &template;
let sample_metadata = metadata_file(&tmp, &[s1_barcode]);
let input_files = vec![fastq_file(&tmp, "ex", "ex", &[&full_read])];
let output_dir = tmp.path().to_path_buf().join("output");
let demux_inputs = Demux {
inputs: input_files,
read_structures,
sample_metadata,
output_types: vec!['T'],
output: output_dir.clone(),
unmatched_prefix: "unmatched".to_owned(),
max_mismatches: 1,
min_mismatch_delta: 2,
threads: 5,
compression_level: 5,
skip_reasons: vec![],
template_types: vec!['B', 'S', 'M', 'T'],
};
demux_inputs.execute().unwrap();
let output_path = output_dir.join("Sample0000.R1.fq.gz");
let fq_reads = read_fastq(&output_path);
assert_eq!(fq_reads.len(), 1);
assert_equal(
&fq_reads[0],
&OwnedRecord {
head: b"ex_0 1:N:0:AAAAAAAA".to_vec(),
seq: full_read.as_bytes().to_vec(),
qual: ";".repeat(100).as_bytes().to_vec(),
},
);
}
#[test]
fn test_template_types_with_cellular_barcode() {
let tmp = TempDir::new().unwrap();
let read_structures = vec![ReadStructure::from_str("10M8B7C75T").unwrap()];
let s1_barcode = "AAAAAAAA";
let s1_umi = "ATCGATCGAT";
let s1_cellular_barcode = "GATTACA";
let template = "T".repeat(75);
let sample_metadata = metadata_file(&tmp, &[s1_barcode]);
let input_files = vec![fastq_file(
&tmp,
"ex",
"ex",
&[&format!("{s1_umi}{s1_barcode}{s1_cellular_barcode}{template}")],
)];
let output_dir = tmp.path().to_path_buf().join("output");
let demux_inputs = Demux {
inputs: input_files,
read_structures,
sample_metadata,
output_types: vec!['T'],
output: output_dir.clone(),
unmatched_prefix: "unmatched".to_owned(),
max_mismatches: 1,
min_mismatch_delta: 2,
threads: 5,
compression_level: 5,
skip_reasons: vec![],
template_types: vec!['C', 'T'],
};
demux_inputs.execute().unwrap();
let output_path = output_dir.join("Sample0000.R1.fq.gz");
let fq_reads = read_fastq(&output_path);
assert_eq!(fq_reads.len(), 1);
let expected_seq = s1_cellular_barcode.to_owned() + &template;
assert_equal(
&fq_reads[0],
&OwnedRecord {
head: b"ex_0:ATCGATCGAT 1:N:0:AAAAAAAA".to_vec(),
seq: expected_seq.as_bytes().to_vec(),
qual: ";".repeat(82).as_bytes().to_vec(),
},
);
}
#[test]
fn test_output_type_reads() {
let tmp = TempDir::new().unwrap();
let read_structures = vec![ReadStructure::from_str("10M8B7C100T").unwrap()];
let s1_barcode = "AAAAAAAA";
let s1_umi = "ATCGATCGAT";
let s1_cellular_barcode = "GATTACA";
let sample_metadata =
metadata_file(&tmp, &[s1_barcode, "CCCCCCCC", "GGGGGGGG", "TTTTTTTT"]);
let template_sequence = "A".repeat(100);
let input_files = vec![fastq_file(
&tmp,
"ex",
"ex",
&[&format!("{s1_umi}{s1_barcode}{s1_cellular_barcode}{template_sequence}")],
)];
let output_dir = tmp.path().to_path_buf().join("output");
let demux_inputs = Demux {
inputs: input_files,
read_structures,
sample_metadata,
output_types: vec!['T', 'B', 'M', 'C'],
output: output_dir.clone(),
unmatched_prefix: "unmatched".to_owned(),
max_mismatches: 1,
min_mismatch_delta: 2,
threads: 5,
compression_level: 5,
skip_reasons: vec![],
template_types: vec!['T'],
};
demux_inputs.execute().unwrap();
let output_path = output_dir.join("Sample0000.R1.fq.gz");
let barcode_output_path = output_dir.join("Sample0000.I1.fq.gz");
let umi_output_path = output_dir.join("Sample0000.U1.fq.gz");
let cellular_barcode_output_path = output_dir.join("Sample0000.C1.fq.gz");
let fq_reads = read_fastq(&output_path);
let barcode_fq_reads = read_fastq(&barcode_output_path);
let umi_fq_reads = read_fastq(&umi_output_path);
let cellular_barcode_fq_reads = read_fastq(&cellular_barcode_output_path);
assert_eq!(fq_reads.len(), 1);
assert_equal(
&fq_reads[0],
&OwnedRecord {
head: b"ex_0:ATCGATCGAT 1:N:0:AAAAAAAA".to_vec(),
seq: "A".repeat(100).as_bytes().to_vec(),
qual: ";".repeat(100).as_bytes().to_vec(),
},
);
assert_eq!(barcode_fq_reads.len(), 1);
assert_equal(
&barcode_fq_reads[0],
&OwnedRecord {
head: b"ex_0:ATCGATCGAT 1:N:0:AAAAAAAA".to_vec(),
seq: b"AAAAAAAA".to_vec(),
qual: ";".repeat(8).as_bytes().to_vec(),
},
);
assert_eq!(umi_fq_reads.len(), 1);
assert_equal(
&umi_fq_reads[0],
&OwnedRecord {
head: b"ex_0:ATCGATCGAT 1:N:0:AAAAAAAA".to_vec(),
seq: b"ATCGATCGAT".to_vec(),
qual: ";".repeat(10).as_bytes().to_vec(),
},
);
assert_eq!(cellular_barcode_fq_reads.len(), 1);
assert_equal(
&cellular_barcode_fq_reads[0],
&OwnedRecord {
head: b"ex_0:ATCGATCGAT 1:N:0:AAAAAAAA".to_vec(),
seq: b"GATTACA".to_vec(),
qual: ";".repeat(7).as_bytes().to_vec(),
},
);
}
#[test]
fn test_demux_with_catchall_barcode() {
let tmp = TempDir::new().unwrap();
let read_structures = vec![ReadStructure::from_str("7B+T").unwrap()];
let s1_barcode = "NNNNNNN";
let sample_metadata = metadata_file(&tmp, &[s1_barcode]);
let input_files =
vec![fastq_file(&tmp, "ex", "ex", &[&(s1_barcode.to_owned() + &"A".repeat(100))])];
let output_dir = tmp.path().to_path_buf().join("output");
let demux_inputs = Demux {
inputs: input_files,
read_structures,
sample_metadata,
output_types: vec!['T'],
output: output_dir.clone(),
unmatched_prefix: "unmatched".to_owned(),
max_mismatches: 0,
min_mismatch_delta: 2,
threads: 5,
compression_level: 5,
skip_reasons: vec![],
template_types: vec!['T'],
};
demux_inputs.execute().unwrap();
let unmatched_path = output_dir.join("unmatched.R1.fq.gz");
let unmatched_reads = read_fastq(&unmatched_path);
assert_eq!(unmatched_reads.len(), 0);
let output_path = output_dir.join("Sample0000.R1.fq.gz");
let fq_reads = read_fastq(&output_path);
assert_eq!(fq_reads.len(), 1);
assert_equal(
&fq_reads[0],
&OwnedRecord {
head: b"ex_0 1:N:0:NNNNNNN".to_vec(),
seq: "A".repeat(100).as_bytes().to_vec(),
qual: ";".repeat(100).as_bytes().to_vec(),
},
);
}
#[test]
fn test_demux_with_iupac_bases_in_barcode() {
let tmp = TempDir::new().unwrap();
let read_structures = vec![ReadStructure::from_str("7B+T").unwrap()];
let s1_barcode = "MMMMMMM";
let s2_barcode = "KKKKKKK";
let sample_metadata = metadata_file(&tmp, &[s1_barcode, s2_barcode]);
let input_files = vec![fastq_file(
&tmp,
"ex",
"ex",
&[
&("AAAAAAA".to_owned() + &"A".repeat(5)), &("CCCCCCC".to_owned() + &"A".repeat(5)), &("ACACACA".to_owned() + &"A".repeat(5)), &("GTGTGTG".to_owned() + &"C".repeat(5)), &("TGTGTGT".to_owned() + &"C".repeat(5)), &("CGCGCGC".to_owned() + &"T".repeat(5)), ],
)];
let output_dir = tmp.path().to_path_buf().join("output");
let demux_inputs = Demux {
inputs: input_files,
read_structures,
sample_metadata,
output_types: vec!['T'],
output: output_dir.clone(),
unmatched_prefix: "unmatched".to_owned(),
max_mismatches: 0,
min_mismatch_delta: 0,
threads: 5,
compression_level: 5,
skip_reasons: vec![],
template_types: vec!['T'],
};
demux_inputs.execute().unwrap();
let output_path = output_dir.join("Sample0000.R1.fq.gz");
let fq_reads = read_fastq(&output_path);
assert_eq!(fq_reads.len(), 3);
assert_equal(
&fq_reads[0],
&OwnedRecord {
head: b"ex_0 1:N:0:AAAAAAA".to_vec(),
seq: "A".repeat(5).as_bytes().to_vec(),
qual: ";".repeat(5).as_bytes().to_vec(),
},
);
let output_path = output_dir.join("Sample0001.R1.fq.gz");
let fq_reads = read_fastq(&output_path);
assert_eq!(fq_reads.len(), 2);
assert_equal(
&fq_reads[0],
&OwnedRecord {
head: b"ex_3 1:N:0:GTGTGTG".to_vec(),
seq: "C".repeat(5).as_bytes().to_vec(),
qual: ";".repeat(5).as_bytes().to_vec(),
},
);
let unmatched_path = output_dir.join("unmatched.R1.fq.gz");
let unmatched_reads = read_fastq(&unmatched_path);
assert_eq!(unmatched_reads.len(), 1);
assert_equal(
&unmatched_reads[0],
&OwnedRecord {
head: b"ex_5 1:N:0:CGCGCGC".to_vec(),
seq: "T".repeat(5).as_bytes().to_vec(),
qual: ";".repeat(5).as_bytes().to_vec(),
},
);
}
#[test]
fn test_demux_with_ns_in_barcode() {
let tmp = TempDir::new().unwrap();
let read_structures = vec![ReadStructure::from_str("7B+T").unwrap()];
let s1_barcode = "NNAAAAA";
let s2_barcode = "NNCCCCC";
let sample_metadata = metadata_file(&tmp, &[s1_barcode, s2_barcode]);
let input_files = vec![fastq_file(
&tmp,
"ex",
"ex",
&[
&("ANAAAAA".to_owned() + &"A".repeat(5)),
&("ANCCCCC".to_owned() + &"C".repeat(5)),
&("NNNAAAA".to_owned() + &"T".repeat(5)),
],
)];
let output_dir = tmp.path().to_path_buf().join("output");
let demux_inputs = Demux {
inputs: input_files,
read_structures,
sample_metadata,
output_types: vec!['T'],
output: output_dir.clone(),
unmatched_prefix: "unmatched".to_owned(),
max_mismatches: 0,
min_mismatch_delta: 0,
threads: 5,
compression_level: 5,
skip_reasons: vec![],
template_types: vec!['T'],
};
demux_inputs.execute().unwrap();
let output_path = output_dir.join("Sample0000.R1.fq.gz");
let fq_reads = read_fastq(&output_path);
assert_eq!(fq_reads.len(), 1);
assert_equal(
&fq_reads[0],
&OwnedRecord {
head: b"ex_0 1:N:0:ANAAAAA".to_vec(),
seq: "A".repeat(5).as_bytes().to_vec(),
qual: ";".repeat(5).as_bytes().to_vec(),
},
);
let output_path = output_dir.join("Sample0001.R1.fq.gz");
let fq_reads = read_fastq(&output_path);
assert_eq!(fq_reads.len(), 1);
assert_equal(
&fq_reads[0],
&OwnedRecord {
head: b"ex_1 1:N:0:ANCCCCC".to_vec(),
seq: "C".repeat(5).as_bytes().to_vec(),
qual: ";".repeat(5).as_bytes().to_vec(),
},
);
let unmatched_path = output_dir.join("unmatched.R1.fq.gz");
let unmatched_reads = read_fastq(&unmatched_path);
assert_eq!(unmatched_reads.len(), 1);
assert_equal(
&unmatched_reads[0],
&OwnedRecord {
head: b"ex_2 1:N:0:NNNAAAA".to_vec(),
seq: "T".repeat(5).as_bytes().to_vec(),
qual: ";".repeat(5).as_bytes().to_vec(),
},
);
}
#[test]
fn test_demux_paired_reads_with_in_line_sample_barcodes() {
let tmp = TempDir::new().unwrap();
let read_structures = vec![
ReadStructure::from_str("8B100T").unwrap(),
ReadStructure::from_str("9B100T").unwrap(),
];
let s1_barcode = "AAAAAAAAGATTACAGA";
let sample_metadata = metadata_file(
&tmp,
&[s1_barcode, "CCCCCCCCGATTACAGA", "GGGGGGGGGATTACAGA", "GGGGGGTTGATTACAGA"],
);
let input_files = vec![
fastq_file(&tmp, "ex_R1", "ex", &[&(s1_barcode[..8].to_owned() + &"A".repeat(100))]),
fastq_file(&tmp, "ex_R2", "ex", &[&(s1_barcode[8..].to_owned() + &"T".repeat(100))]),
];
let output_dir = tmp.path().to_path_buf().join("output");
let demux_inputs = Demux {
inputs: input_files,
read_structures,
sample_metadata,
output_types: vec!['T'],
output: output_dir.clone(),
unmatched_prefix: "unmatched".to_owned(),
max_mismatches: 1,
min_mismatch_delta: 2,
threads: 5,
compression_level: 5,
skip_reasons: vec![],
template_types: vec!['T'],
};
demux_inputs.execute().unwrap();
let r1_path = output_dir.join("Sample0000.R1.fq.gz");
let r1_reads = read_fastq(&r1_path);
assert_eq!(r1_reads.len(), 1);
assert_equal(
&r1_reads[0],
&OwnedRecord {
head: b"ex_0 1:N:0:AAAAAAAA+GATTACAGA".to_vec(),
seq: "A".repeat(100).as_bytes().to_vec(),
qual: ";".repeat(100).as_bytes().to_vec(),
},
);
let r2_path = output_dir.join("Sample0000.R2.fq.gz");
let r2_reads = read_fastq(&r2_path);
assert_eq!(r2_reads.len(), 1);
assert_equal(
&r2_reads[0],
&OwnedRecord {
head: b"ex_0 2:N:0:AAAAAAAA+GATTACAGA".to_vec(),
seq: "T".repeat(100).as_bytes().to_vec(),
qual: ";".repeat(100).as_bytes().to_vec(),
},
);
}
#[test]
fn test_demux_dual_indexed_paired_end_reads() {
let tmp = TempDir::new().unwrap();
let read_structures = vec![
ReadStructure::from_str("8B").unwrap(),
ReadStructure::from_str("100T").unwrap(),
ReadStructure::from_str("100T").unwrap(),
ReadStructure::from_str("9B").unwrap(),
];
let s1_barcode = "AAAAAAAAGATTACAGA";
let sample_metadata = metadata_file(
&tmp,
&[s1_barcode, "CCCCCCCCGATTACAGA", "GGGGGGGGGATTACAGA", "GGGGGGTTGATTACAGA"],
);
let input_files = vec![
fastq_file(&tmp, "ex_I1", "ex", &[&s1_barcode[..8]]),
fastq_file(&tmp, "ex_R1", "ex", &[&"A".repeat(100)]),
fastq_file(&tmp, "ex_R2", "ex", &[&"T".repeat(100)]),
fastq_file(&tmp, "ex_I2", "ex", &[&s1_barcode[8..]]),
];
let output_dir: PathBuf = tmp.path().to_path_buf().join("output");
let demux = Demux {
inputs: input_files,
read_structures,
sample_metadata,
output_types: vec!['T'],
output: output_dir.clone(),
unmatched_prefix: "unmatched".to_owned(),
max_mismatches: 1,
min_mismatch_delta: 2,
threads: 5,
compression_level: 5,
skip_reasons: vec![],
template_types: vec!['T'],
};
demux.execute().unwrap();
let r1_path = output_dir.join("Sample0000.R1.fq.gz");
let r1_reads = read_fastq(&r1_path);
assert_eq!(r1_reads.len(), 1);
assert_equal(
&r1_reads[0],
&OwnedRecord {
head: b"ex_0 1:N:0:AAAAAAAA+GATTACAGA".to_vec(),
seq: "A".repeat(100).as_bytes().to_vec(),
qual: ";".repeat(100).as_bytes().to_vec(),
},
);
let r2_path = output_dir.join("Sample0000.R2.fq.gz");
let r2_reads = read_fastq(&r2_path);
assert_eq!(r2_reads.len(), 1);
assert_equal(
&r2_reads[0],
&OwnedRecord {
head: b"ex_0 2:N:0:AAAAAAAA+GATTACAGA".to_vec(),
seq: "T".repeat(100).as_bytes().to_vec(),
qual: ";".repeat(100).as_bytes().to_vec(),
},
);
}
#[test]
fn test_demux_a_wierd_set_of_reads() {
let tmp = TempDir::new().unwrap();
let read_structures = vec![
ReadStructure::from_str("4B4M8S").unwrap(),
ReadStructure::from_str("4B100T").unwrap(),
ReadStructure::from_str("100S3B").unwrap(),
ReadStructure::from_str("6B1S1M1T").unwrap(),
];
let sample1_barcode = "AAAAAAAAGATTACAGA";
let sample_metadata = metadata_file(
&tmp,
&[sample1_barcode, "CCCCCCCCGATTACAGA", "GGGGGGGGGATTACAGA", "GGGGGGTTGATTACAGA"],
);
let input_files = vec![
fastq_file(&tmp, "example_1", "ex", &["AAAACCCCGGGGTTTT"]),
fastq_file(&tmp, "example_2", "ex", &[&"A".repeat(104)]),
fastq_file(&tmp, "example_3", "ex", &[&("T".repeat(100) + "GAT")]),
fastq_file(&tmp, "example_4", "ex", &["TACAGAAAT"]),
];
let output_dir = tmp.path().to_path_buf().join("output");
let demux = Demux {
inputs: input_files,
read_structures,
sample_metadata,
output_types: vec!['T'],
output: output_dir.clone(),
unmatched_prefix: "unmatched".to_owned(),
max_mismatches: 1,
min_mismatch_delta: 2,
threads: 5,
compression_level: 5,
skip_reasons: vec![],
template_types: vec!['T'],
};
demux.execute().unwrap();
let r1_path = output_dir.join("Sample0000.R1.fq.gz");
let r1_reads = read_fastq(&r1_path);
assert_eq!(r1_reads.len(), 1);
assert_equal(
&r1_reads[0],
&OwnedRecord {
head: b"ex_0:CCCC+A 1:N:0:AAAA+AAAA+GAT+TACAGA".to_vec(),
seq: "A".repeat(100).as_bytes().to_vec(),
qual: ";".repeat(100).as_bytes().to_vec(),
},
);
let r2_path = output_dir.join("Sample0000.R2.fq.gz");
let r2_reads = read_fastq(&r2_path);
assert_eq!(r2_reads.len(), 1);
assert_equal(
&r2_reads[0],
&OwnedRecord {
head: b"ex_0:CCCC+A 2:N:0:AAAA+AAAA+GAT+TACAGA".to_vec(),
seq: "T".as_bytes().to_vec(),
qual: ";".as_bytes().to_vec(),
},
);
}
#[test]
fn test_demux_a_read_structure_with_multiple_templates_in_one_read() {
let tmp = TempDir::new().unwrap();
let read_structures = vec![ReadStructure::from_str("17B20T20S20T20S20T").unwrap()];
let s1_barcode = "AAAAAAAAGATTACAGA";
let sample_metadata = metadata_file(
&tmp,
&[s1_barcode, "CCCCCCCCGATTACAGA", "GGGGGGGGGATTACAGA", "GGGGGGTTGATTACAGA"],
);
let input_files = vec![fastq_file(
&tmp,
"ex",
"ex",
&[&(s1_barcode.to_owned()
+ &"A".repeat(20)
+ &"C".repeat(20)
+ &"T".repeat(20)
+ &"C".repeat(20)
+ &"G".repeat(20))],
)];
let output_dir = tmp.path().to_path_buf().join("output");
let demux_inputs = Demux {
inputs: input_files,
read_structures,
sample_metadata,
output_types: vec!['T'],
output: output_dir.clone(),
unmatched_prefix: "unmatched".to_owned(),
max_mismatches: 1,
min_mismatch_delta: 2,
threads: 5,
compression_level: 5,
skip_reasons: vec![],
template_types: vec!['T'],
};
demux_inputs.execute().unwrap();
let r1_path = output_dir.join("Sample0000.R1.fq.gz");
let r1_reads = read_fastq(&r1_path);
assert_eq!(r1_reads.len(), 1);
assert_equal(
&r1_reads[0],
&OwnedRecord {
head: b"ex_0 1:N:0:AAAAAAAAGATTACAGA".to_vec(),
seq: "A".repeat(20).as_bytes().to_vec(),
qual: ";".repeat(20).as_bytes().to_vec(),
},
);
let r2_path = output_dir.join("Sample0000.R2.fq.gz");
let r2_reads = read_fastq(&r2_path);
assert_eq!(r2_reads.len(), 1);
assert_equal(
&r2_reads[0],
&OwnedRecord {
head: b"ex_0 2:N:0:AAAAAAAAGATTACAGA".to_vec(),
seq: "T".repeat(20).as_bytes().to_vec(),
qual: ";".repeat(20).as_bytes().to_vec(),
},
);
let r3_path = output_dir.join("Sample0000.R3.fq.gz");
let r3_reads = read_fastq(&r3_path);
assert_eq!(r3_reads.len(), 1);
assert_equal(
&r3_reads[0],
&OwnedRecord {
head: b"ex_0 3:N:0:AAAAAAAAGATTACAGA".to_vec(),
seq: "G".repeat(20).as_bytes().to_vec(),
qual: ";".repeat(20).as_bytes().to_vec(),
},
);
}
#[test]
#[should_panic(
expected = "No output types requested, must request at least one output segment type."
)]
fn test_fails_if_zero_read_structures_have_template_bases() {
let tmp = TempDir::new().unwrap();
let input_files = vec![
fastq_file(&tmp, "read1", "ex", &["GATTACA"]),
fastq_file(&tmp, "read2", "ex", &["TAGGATTA"]),
fastq_file(&tmp, "index1", "ex", &[&SAMPLE1_BARCODE[0..3]]),
fastq_file(&tmp, "index2", "ex", &[&SAMPLE1_BARCODE[3..]]),
];
let sample_metadata = metadata(&tmp);
let read_structures = vec![
ReadStructure::from_str("+M").unwrap(),
ReadStructure::from_str("+M").unwrap(),
ReadStructure::from_str("+B").unwrap(),
ReadStructure::from_str("+B").unwrap(),
];
let demux_inputs = Demux {
inputs: input_files,
read_structures,
sample_metadata,
output_types: vec![],
output: tmp.path().to_path_buf(),
unmatched_prefix: "unmatched".to_owned(),
max_mismatches: 1,
min_mismatch_delta: 2,
threads: 5,
compression_level: 5,
skip_reasons: vec![],
template_types: vec!['T'],
};
demux_inputs.execute().unwrap();
}
#[test]
#[should_panic(expected = "The same number of read structures should be given as FASTQs")]
fn test_fails_if_not_enough_fastq_records_are_passed() {
let tmp = TempDir::new().unwrap();
let input_files = vec![
fastq_file(&tmp, "read1", "ex", &["GATTACA"]),
fastq_file(&tmp, "index1", "ex", &[&SAMPLE1_BARCODE[0..3]]),
fastq_file(&tmp, "index2", "ex", &[&SAMPLE1_BARCODE[3..]]),
];
let sample_metadata = metadata(&tmp);
let read_structures = vec![
ReadStructure::from_str("+T").unwrap(),
ReadStructure::from_str("+B").unwrap(),
ReadStructure::from_str("+B").unwrap(),
ReadStructure::from_str("+T").unwrap(),
];
let demux_inputs = Demux {
inputs: input_files,
read_structures,
sample_metadata,
output_types: vec!['T'],
output: tmp.path().to_path_buf(),
unmatched_prefix: "unmatched".to_owned(),
max_mismatches: 1,
min_mismatch_delta: 2,
threads: 5,
compression_level: 5,
skip_reasons: vec![],
template_types: vec!['T'],
};
demux_inputs.execute().unwrap();
}
#[test]
#[should_panic(expected = "The same number of read structures should be given as FASTQs")]
fn test_fails_if_too_many_fastq_records_are_passed() {
let tmp = TempDir::new().unwrap();
let input_files = vec![
fastq_file(&tmp, "read1", "ex", &["GATTACA"]),
fastq_file(&tmp, "index1", "ex", &[&SAMPLE1_BARCODE[0..3]]),
fastq_file(&tmp, "index2", "ex", &[&SAMPLE1_BARCODE[3..]]),
fastq_file(&tmp, "read2", "ex", &["TAGGATTA"]),
];
let sample_metadata = metadata(&tmp);
let read_structures = vec![
ReadStructure::from_str("+T").unwrap(),
ReadStructure::from_str("+B").unwrap(),
ReadStructure::from_str("+B").unwrap(),
];
let demux_inputs = Demux {
inputs: input_files,
read_structures,
sample_metadata,
output_types: vec!['T'],
output: tmp.path().to_path_buf(),
unmatched_prefix: "unmatched".to_owned(),
max_mismatches: 1,
min_mismatch_delta: 2,
threads: 5,
compression_level: 5,
skip_reasons: vec![],
template_types: vec!['T'],
};
demux_inputs.execute().unwrap();
}
#[test]
#[should_panic(
expected = "Read ex_2 had too few bases to demux 0 vs. 1 needed in read structure +T."
)]
fn test_fails_if_reads_too_short() {
let tmp = TempDir::new().unwrap();
let read_structures =
vec![ReadStructure::from_str("+T").unwrap(), ReadStructure::from_str("7B").unwrap()];
let records = [
vec!["AAAAAAA", &SAMPLE1_BARCODE[0..7]], vec!["CCCCCCC", SAMPLE1_BARCODE], vec!["", SAMPLE1_BARCODE], vec!["G", SAMPLE1_BARCODE], ];
let input_files = vec![
fastq_file(&tmp, "read1", "ex", &[records[0][0], records[1][0], records[2][0]]),
fastq_file(&tmp, "index1", "ex", &[records[0][1], records[1][1], records[2][1]]),
];
let sample_metadata = metadata(&tmp);
let output_dir: PathBuf = tmp.path().to_path_buf().join("output");
let demux_inputs = Demux {
inputs: input_files,
read_structures,
sample_metadata,
output_types: vec!['T', 'B'],
output: output_dir,
unmatched_prefix: "unmatched".to_owned(),
max_mismatches: 1,
min_mismatch_delta: 2,
threads: 5,
compression_level: 5,
skip_reasons: vec![],
template_types: vec!['T'],
};
demux_inputs.execute().unwrap();
}
#[test]
fn test_skip_reads_too_short() {
let tmp = TempDir::new().unwrap();
let read_structures =
vec![ReadStructure::from_str("+T").unwrap(), ReadStructure::from_str("7B").unwrap()];
let records = [
vec!["AAAAAAA", &SAMPLE1_BARCODE[0..7]], vec!["CCCCCCC", SAMPLE1_BARCODE], vec!["", SAMPLE1_BARCODE], vec!["G", SAMPLE1_BARCODE], ];
let input_files = vec![
fastq_file(&tmp, "read1", "ex", &[records[0][0], records[1][0], records[2][0]]),
fastq_file(&tmp, "index1", "ex", &[records[0][1], records[1][1], records[2][1]]),
];
let sample_metadata = metadata(&tmp);
let output_dir: PathBuf = tmp.path().to_path_buf().join("output");
let demux_inputs = Demux {
inputs: input_files,
read_structures,
sample_metadata,
output_types: vec!['T', 'B'],
output: output_dir.clone(),
unmatched_prefix: "unmatched".to_owned(),
max_mismatches: 1,
min_mismatch_delta: 2,
threads: 5,
compression_level: 5,
skip_reasons: vec![SkipReason::TooFewBases],
template_types: vec!['T'],
};
demux_inputs.execute().unwrap();
let metrics_path = output_dir.join("demux-metrics.txt");
let metrics: Vec<DemuxMetric> = DelimFile::default().read_tsv(&metrics_path).unwrap();
let demuxed_reads: usize = metrics.iter().map(|m| m.templates).sum();
let sample0000_reads =
metrics.iter().find(|m| m.sample_id == "Sample0000").unwrap().templates;
assert_eq!(demuxed_reads, 2);
assert_eq!(sample0000_reads, 2);
let r1_path = output_dir.join("Sample0000.R1.fq.gz");
let r1_reads = read_fastq(&r1_path);
assert_eq!(r1_reads.len(), 2);
let r2_path = output_dir.join("Sample0000.I1.fq.gz");
let r2_reads = read_fastq(&r2_path);
assert_eq!(r2_reads.len(), 2);
}
fn seg(bases: &[u8], segment_type: SegmentType) -> FastqSegment {
seg_with_source(bases, segment_type, 0)
}
fn seg_with_source(
bases: &[u8],
segment_type: SegmentType,
source_index: usize,
) -> FastqSegment {
let quals = vec![b'#'; bases.len()];
FastqSegment { seq: bases.to_vec(), quals, segment_type, source_index }
}
#[test]
fn test_write_header_standard_no_umi() {
let mut out = Vec::new();
let header = b"inst:123:ABCDE:1:204:1022:2108 1:N:0:0";
let barcode_segs =
[seg(b"ACGT", SegmentType::SampleBarcode), seg(b"GGTT", SegmentType::SampleBarcode)];
let umi_segs = [];
let expected = "@inst:123:ABCDE:1:204:1022:2108 1:N:0:ACGT+GGTT".to_string();
ReadSet::write_header_internal(
&mut out,
1,
header,
barcode_segs.iter().filter(|_| true),
umi_segs.iter().filter(|_| true),
true,
)
.unwrap();
assert_eq!(String::from_utf8(out).unwrap(), expected);
}
#[test]
fn test_write_header_standard_with_umi() {
let mut out = Vec::new();
let header = b"inst:123:ABCDE:1:204:1022:2108 1:Y:0:0";
let barcode_segs =
[seg(b"ACGT", SegmentType::SampleBarcode), seg(b"GGTT", SegmentType::SampleBarcode)];
let umi_segs = [seg(b"AACCGGTT", SegmentType::MolecularBarcode)];
let expected = "@inst:123:ABCDE:1:204:1022:2108:AACCGGTT 2:Y:0:ACGT+GGTT".to_string();
ReadSet::write_header_internal(
&mut out,
2,
header,
barcode_segs.iter().filter(|_| true),
umi_segs.iter().filter(|_| true),
true,
)
.unwrap();
assert_eq!(String::from_utf8(out).unwrap(), expected);
}
#[test]
fn test_write_header_omits_umi_when_in_template_bases() {
let mut out = Vec::new();
let header = b"inst:123:ABCDE:1:204:1022:2108 1:Y:0:0";
let barcode_segs =
[seg(b"ACGT", SegmentType::SampleBarcode), seg(b"GGTT", SegmentType::SampleBarcode)];
let umi_segs = [seg(b"AACCGGTT", SegmentType::MolecularBarcode)];
let expected = "@inst:123:ABCDE:1:204:1022:2108 2:Y:0:ACGT+GGTT".to_string();
ReadSet::write_header_internal(
&mut out,
2,
header,
barcode_segs.iter().filter(|_| true),
umi_segs.iter().filter(|_| true),
false,
)
.unwrap();
assert_eq!(String::from_utf8(out).unwrap(), expected);
}
#[test]
fn test_write_header_append_barcode_and_umi() {
let mut out = Vec::new();
let header = b"inst:123:ABCDE:1:204:1022:2108:AAAA 1:Y:0:TTTT";
let barcode_segs =
[seg(b"ACGT", SegmentType::SampleBarcode), seg(b"GGTT", SegmentType::SampleBarcode)];
let umi_segs = [seg(b"AACCGGTT", SegmentType::MolecularBarcode)];
let expected =
"@inst:123:ABCDE:1:204:1022:2108:AAAA+AACCGGTT 2:Y:0:TTTT+ACGT+GGTT".to_string();
ReadSet::write_header_internal(
&mut out,
2,
header,
barcode_segs.iter().filter(|_| true),
umi_segs.iter().filter(|_| true),
true,
)
.unwrap();
assert_eq!(String::from_utf8(out).unwrap(), expected);
}
#[test]
fn test_write_header_short_name_no_comment() {
let mut out = Vec::new();
let header = b"q1";
let barcode_segs =
[seg(b"ACGT", SegmentType::SampleBarcode), seg(b"GGTT", SegmentType::SampleBarcode)];
let umi_segs = [seg(b"AACCGGTT", SegmentType::MolecularBarcode)];
let expected = "@q1:AACCGGTT 1:N:0:ACGT+GGTT".to_string();
ReadSet::write_header_internal(
&mut out,
1,
header,
barcode_segs.iter().filter(|_| true),
umi_segs.iter().filter(|_| true),
true,
)
.unwrap();
assert_eq!(String::from_utf8(out).unwrap(), expected);
}
#[test]
#[should_panic(expected = "8 segments")]
fn test_write_header_name_too_many_parts() {
let mut out = Vec::new();
let header = b"q1:1:2:3:4:5:6:7:8:9:10";
let barcode_segs =
[seg(b"ACGT", SegmentType::SampleBarcode), seg(b"GGTT", SegmentType::SampleBarcode)];
let umi_segs = [seg(b"AACCGGTT", SegmentType::MolecularBarcode)];
ReadSet::write_header_internal(
&mut out,
1,
header,
barcode_segs.iter().filter(|_| true),
umi_segs.iter().filter(|_| true),
true,
)
.unwrap();
}
#[test]
fn test_write_header_comment_too_few_parts() {
let mut out = Vec::new();
let header = b"q1 0:0";
let barcode_segs =
[seg(b"ACGT", SegmentType::SampleBarcode), seg(b"GGTT", SegmentType::SampleBarcode)];
let umi_segs = [seg(b"AACCGGTT", SegmentType::MolecularBarcode)];
let expected = "@q1:AACCGGTT 0:0:ACGT+GGTT".to_string();
ReadSet::write_header_internal(
&mut out,
1,
header,
barcode_segs.iter().filter(|_| true),
umi_segs.iter().filter(|_| true),
true,
)
.unwrap();
assert_eq!(String::from_utf8(out).unwrap(), expected);
}
#[test]
fn test_sample_barcode_sequence() {
let segments = vec![
seg("AGCT".as_bytes(), SegmentType::Template),
seg("GATA".as_bytes(), SegmentType::SampleBarcode),
seg("CAC".as_bytes(), SegmentType::SampleBarcode),
seg("GACCCC".as_bytes(), SegmentType::MolecularBarcode),
];
let read_set = read_set(segments);
assert_eq!(read_set.sample_barcode_sequence(), "GATACAC".as_bytes().to_owned());
}
#[test]
fn test_cellular_barcode_segments() {
let segments = vec![
seg("AGCT".as_bytes(), SegmentType::Template),
seg("GATA".as_bytes(), SegmentType::SampleBarcode),
seg("CAC".as_bytes(), SegmentType::SampleBarcode),
seg("GACCCC".as_bytes(), SegmentType::MolecularBarcode),
seg("ACGTACGT".as_bytes(), SegmentType::CellularBarcode),
];
let read_set = read_set(segments);
let expected = vec![seg("ACGTACGT".as_bytes(), SegmentType::CellularBarcode)];
assert_eq!(expected, read_set.cellular_barcode_segments().cloned().collect::<Vec<_>>());
}
#[test]
fn test_template_segments() {
let segments = vec![
seg("AGCT".as_bytes(), SegmentType::SampleBarcode),
seg("GATA".as_bytes(), SegmentType::Template),
seg("CAC".as_bytes(), SegmentType::Template),
seg("GACCCC".as_bytes(), SegmentType::MolecularBarcode),
];
let expected = vec![
seg("GATA".as_bytes(), SegmentType::Template),
seg("CAC".as_bytes(), SegmentType::Template),
];
let read_set = read_set(segments);
assert_eq!(expected, read_set.template_segments().cloned().collect::<Vec<_>>());
}
#[test]
fn test_sample_barcode_segments() {
let segments = vec![
seg("AGCT".as_bytes(), SegmentType::Template),
seg("GATA".as_bytes(), SegmentType::SampleBarcode),
seg("CAC".as_bytes(), SegmentType::SampleBarcode),
seg("GACCCC".as_bytes(), SegmentType::MolecularBarcode),
];
let expected = vec![
seg("GATA".as_bytes(), SegmentType::SampleBarcode),
seg("CAC".as_bytes(), SegmentType::SampleBarcode),
];
let read_set = read_set(segments);
assert_eq!(expected, read_set.sample_barcode_segments().cloned().collect::<Vec<_>>());
}
#[test]
fn test_molecular_barcode_segments() {
let segments = vec![
seg("AGCT".as_bytes(), SegmentType::Template),
seg("GATA".as_bytes(), SegmentType::MolecularBarcode),
seg("CAC".as_bytes(), SegmentType::MolecularBarcode),
seg("GACCCC".as_bytes(), SegmentType::SampleBarcode),
];
let expected = vec![
seg("GATA".as_bytes(), SegmentType::MolecularBarcode),
seg("CAC".as_bytes(), SegmentType::MolecularBarcode),
];
let read_set = read_set(segments);
assert_eq!(expected, read_set.molecular_barcode_segments().cloned().collect::<Vec<_>>());
}
#[test]
fn test_combine_readsets() {
let segments1 = vec![
seg("A".as_bytes(), SegmentType::Template),
seg("G".as_bytes(), SegmentType::Template),
seg("C".as_bytes(), SegmentType::MolecularBarcode),
seg("T".as_bytes(), SegmentType::SampleBarcode),
];
let read_set1 = read_set(segments1.clone());
let segments2 = vec![
seg("AA".as_bytes(), SegmentType::Template),
seg("AG".as_bytes(), SegmentType::Template),
seg("AC".as_bytes(), SegmentType::MolecularBarcode),
seg("AT".as_bytes(), SegmentType::SampleBarcode),
];
let read_set2 = read_set(segments2.clone());
let segments3 = vec![
seg("AAA".as_bytes(), SegmentType::Template),
seg("AAG".as_bytes(), SegmentType::Template),
seg("AAC".as_bytes(), SegmentType::MolecularBarcode),
seg("AAT".as_bytes(), SegmentType::SampleBarcode),
];
let read_set3 = read_set(segments3.clone());
let mut expected_segments = Vec::new();
expected_segments.extend(segments1);
expected_segments.extend(segments2);
expected_segments.extend(segments3);
let expected = read_set(expected_segments);
let result = ReadSet::combine_readsets(vec![read_set1, read_set2, read_set3]);
assert_eq!(result, expected);
}
#[test]
#[should_panic(expected = "Cannot call combine readsets on an empty vec!")]
fn test_combine_readsets_fails_on_empty_vector() {
let _result = ReadSet::combine_readsets(Vec::new());
}
#[test]
fn test_segments_for_source() {
let segments = vec![
seg_with_source(b"AAAA", SegmentType::SampleBarcode, 0),
seg_with_source(b"GGGGG", SegmentType::Template, 0),
seg_with_source(b"CC", SegmentType::MolecularBarcode, 1),
seg_with_source(b"TTT", SegmentType::Template, 1),
seg_with_source(b"NN", SegmentType::Skip, 0),
];
let rs = read_set(segments);
let b_t: HashSet<SegmentType> =
[SegmentType::SampleBarcode, SegmentType::Template].into_iter().collect();
let m_t: HashSet<SegmentType> =
[SegmentType::MolecularBarcode, SegmentType::Template].into_iter().collect();
let t_only: HashSet<SegmentType> = [SegmentType::Template].into_iter().collect();
let (seq, quals) = rs.segments_for_source(0, &b_t);
assert_eq!(seq, b"AAAAGGGGG");
assert_eq!(quals.len(), seq.len());
let (seq, _) = rs.segments_for_source(1, &m_t);
assert_eq!(seq, b"CCTTT");
let (seq, _) = rs.segments_for_source(0, &t_only);
assert_eq!(seq, b"GGGGG");
let (seq, _) = rs.segments_for_source(0, &m_t);
assert_eq!(seq, b"GGGGG");
let (seq, quals) = rs.segments_for_source(2, &b_t);
assert!(seq.is_empty());
assert!(quals.is_empty());
}
fn metadata_file_with_rs(tmpdir: &TempDir, samples: &[(&str, &str, &[&str])]) -> PathBuf {
let n_inputs = samples[0].2.len();
let mut header = String::from("sample_id\tbarcode");
for i in 1..=n_inputs {
header.push('\t');
header.push_str(&format!("read_structure_{i}"));
}
let mut lines = vec![header];
for (sid, bc, structs) in samples {
assert_eq!(structs.len(), n_inputs);
let mut line = format!("{sid}\t{bc}");
for s in structs.iter() {
line.push('\t');
line.push_str(s);
}
lines.push(line);
}
let path = tmpdir.path().join("metadata.tsv");
Io::default().write_lines(&path, lines).unwrap();
path
}
#[test]
fn test_demux_codec_per_sample_read_structures_paired_end() {
let tmp = TempDir::new().unwrap();
let read_structures =
vec![ReadStructure::from_str("+T").unwrap(), ReadStructure::from_str("+T").unwrap()];
let s1_template_r1 = "A".repeat(50);
let s1_template_r2 = "C".repeat(50);
let s1_r1 = format!("AAA{}T{}", "GATTACA", s1_template_r1);
let s1_r2 = format!("CCC{}T{}", "GATTACA", s1_template_r2);
let s2_template_r1 = "G".repeat(50);
let s2_template_r2 = "T".repeat(50);
let s2_r1 = format!("GGGA{}T{}", "TTTTTTT", s2_template_r1);
let s2_r2 = format!("TTTA{}T{}", "TTTTTTT", s2_template_r2);
let r1_file = fastq_file(&tmp, "r1", "ex", &[s1_r1.as_str(), s2_r1.as_str()]);
let r2_file = fastq_file(&tmp, "r2", "ex", &[s1_r2.as_str(), s2_r2.as_str()]);
let sample_metadata = metadata_file_with_rs(
&tmp,
&[
("S1", "GATTACAGATTACA", &["3M7B1S+T", "3M7B1S+T"]),
("S2", "TTTTTTTTTTTTTT", &["3M1S7B1S+T", "3M1S7B1S+T"]),
],
);
let output_dir = tmp.path().join("output");
let demux_inputs = Demux {
inputs: vec![r1_file, r2_file],
read_structures,
sample_metadata,
output_types: vec!['T', 'B', 'M'],
output: output_dir.clone(),
unmatched_prefix: "unmatched".to_owned(),
max_mismatches: 1,
min_mismatch_delta: 2,
threads: 5,
compression_level: 5,
skip_reasons: vec![],
template_types: vec!['T'],
};
demux_inputs.execute().unwrap();
let s1_r1_reads = read_fastq(&output_dir.join("S1.R1.fq.gz"));
let s1_r2_reads = read_fastq(&output_dir.join("S1.R2.fq.gz"));
let s1_b1_reads = read_fastq(&output_dir.join("S1.I1.fq.gz"));
let s1_b2_reads = read_fastq(&output_dir.join("S1.I2.fq.gz"));
let s1_m1_reads = read_fastq(&output_dir.join("S1.U1.fq.gz"));
let s1_m2_reads = read_fastq(&output_dir.join("S1.U2.fq.gz"));
assert_eq!(s1_r1_reads.len(), 1, "S1 R1 should have one read");
assert_eq!(s1_r1_reads[0].seq, s1_template_r1.as_bytes());
assert_eq!(s1_r2_reads[0].seq, s1_template_r2.as_bytes());
assert_eq!(s1_b1_reads[0].seq, b"GATTACA");
assert_eq!(s1_b2_reads[0].seq, b"GATTACA");
assert_eq!(s1_m1_reads[0].seq, b"AAA");
assert_eq!(s1_m2_reads[0].seq, b"CCC");
let s2_r1_reads = read_fastq(&output_dir.join("S2.R1.fq.gz"));
let s2_r2_reads = read_fastq(&output_dir.join("S2.R2.fq.gz"));
let s2_b1_reads = read_fastq(&output_dir.join("S2.I1.fq.gz"));
let s2_b2_reads = read_fastq(&output_dir.join("S2.I2.fq.gz"));
let s2_m1_reads = read_fastq(&output_dir.join("S2.U1.fq.gz"));
let s2_m2_reads = read_fastq(&output_dir.join("S2.U2.fq.gz"));
assert_eq!(s2_r1_reads.len(), 1, "S2 R1 should have one read");
assert_eq!(s2_r1_reads[0].seq, s2_template_r1.as_bytes());
assert_eq!(s2_r2_reads[0].seq, s2_template_r2.as_bytes());
assert_eq!(s2_b1_reads[0].seq, b"TTTTTTT");
assert_eq!(s2_b2_reads[0].seq, b"TTTTTTT");
assert_eq!(s2_m1_reads[0].seq, b"GGG");
assert_eq!(s2_m2_reads[0].seq, b"TTT");
let unmatched_r1 = read_fastq(&output_dir.join("unmatched.R1.fq.gz"));
let unmatched_r2 = read_fastq(&output_dir.join("unmatched.R2.fq.gz"));
assert!(unmatched_r1.is_empty(), "unmatched R1 should be empty");
assert!(unmatched_r2.is_empty(), "unmatched R2 should be empty");
}
#[test]
fn test_demux_codec_per_sample_single_input() {
let tmp = TempDir::new().unwrap();
let read_structures = vec![ReadStructure::from_str("+T").unwrap()];
let s1_template = "A".repeat(30);
let s2_template = "C".repeat(30);
let s1_r1 = format!("AAA{}T{}", "GATTACA", s1_template);
let s2_r1 = format!("GGGA{}T{}", "TTTTTTT", s2_template);
let r1_file = fastq_file(&tmp, "r1", "ex", &[s1_r1.as_str(), s2_r1.as_str()]);
let sample_metadata = metadata_file_with_rs(
&tmp,
&[("S1", "GATTACA", &["3M7B1S+T"]), ("S2", "TTTTTTT", &["3M1S7B1S+T"])],
);
let output_dir = tmp.path().join("output");
let demux_inputs = Demux {
inputs: vec![r1_file],
read_structures,
sample_metadata,
output_types: vec!['T', 'B', 'M'],
output: output_dir.clone(),
unmatched_prefix: "unmatched".to_owned(),
max_mismatches: 1,
min_mismatch_delta: 2,
threads: 5,
compression_level: 5,
skip_reasons: vec![],
template_types: vec!['T'],
};
demux_inputs.execute().unwrap();
let s1_r1_reads = read_fastq(&output_dir.join("S1.R1.fq.gz"));
assert_eq!(s1_r1_reads.len(), 1, "S1 should have one read");
assert_eq!(s1_r1_reads[0].seq, s1_template.as_bytes(), "S1 template must not be haircut");
assert_eq!(read_fastq(&output_dir.join("S1.I1.fq.gz"))[0].seq, b"GATTACA");
assert_eq!(read_fastq(&output_dir.join("S1.U1.fq.gz"))[0].seq, b"AAA");
let s2_r1_reads = read_fastq(&output_dir.join("S2.R1.fq.gz"));
assert_eq!(s2_r1_reads.len(), 1, "S2 should have one read");
assert_eq!(s2_r1_reads[0].seq, s2_template.as_bytes());
assert_eq!(read_fastq(&output_dir.join("S2.I1.fq.gz"))[0].seq, b"TTTTTTT");
assert_eq!(read_fastq(&output_dir.join("S2.U1.fq.gz"))[0].seq, b"GGG");
}
#[test]
fn test_demux_falls_back_to_global_read_structures() {
let tmp = TempDir::new().unwrap();
let read_structures = vec![ReadStructure::from_str("17B100T").unwrap()];
let s1_barcode = "AAAAAAAAGATTACAGA";
let sample_metadata = metadata_file(
&tmp,
&[s1_barcode, "CCCCCCCCGATTACAGA", "GGGGGGGGGATTACAGA", "GGGGGGTTGATTACAGA"],
);
let input_files =
vec![fastq_file(&tmp, "ex", "ex", &[&(s1_barcode.to_owned() + &"A".repeat(100))])];
let output_dir = tmp.path().join("output");
let demux_inputs = Demux {
inputs: input_files,
read_structures,
sample_metadata,
output_types: vec!['T'],
output: output_dir.clone(),
unmatched_prefix: "unmatched".to_owned(),
max_mismatches: 1,
min_mismatch_delta: 2,
threads: 5,
compression_level: 5,
skip_reasons: vec![],
template_types: vec!['T'],
};
demux_inputs.execute().unwrap();
let fq_reads = read_fastq(&output_dir.join("Sample0000.R1.fq.gz"));
assert_eq!(fq_reads.len(), 1);
assert_eq!(fq_reads[0].seq, "A".repeat(100).as_bytes());
}
#[test]
fn test_demux_errors_when_per_sample_input_count_mismatches_global() {
let tmp = TempDir::new().unwrap();
let r1_file =
fastq_file(&tmp, "r1", "ex", &[format!("AAAGATTACAT{}", "A".repeat(50)).as_str()]);
let sample_metadata = metadata_file_with_rs(&tmp, &[("S1", "GATTACA", &["3M7B1S+T"])]);
let demux_inputs = Demux {
inputs: vec![r1_file.clone(), r1_file],
read_structures: vec![
ReadStructure::from_str("3M7B+T").unwrap(),
ReadStructure::from_str("3M7B+T").unwrap(),
],
sample_metadata,
output_types: vec!['T'],
output: tmp.path().join("output"),
unmatched_prefix: "unmatched".to_owned(),
max_mismatches: 1,
min_mismatch_delta: 2,
threads: 5,
compression_level: 5,
skip_reasons: vec![],
template_types: vec!['T'],
};
let err = demux_inputs.execute().unwrap_err();
let msg = format!("{err:#}");
assert!(
msg.contains("`read_structure_<n>` column(s)") && msg.contains("--read-structures"),
"got: {msg}",
);
}
#[test]
fn test_demux_per_sample_signature_may_differ_from_global() {
let tmp = TempDir::new().unwrap();
let r1 = format!("AAAGATTACAT{}", "A".repeat(50));
let r1_file = fastq_file(&tmp, "r1", "ex", &[r1.as_str()]);
let sample_metadata = metadata_file_with_rs(&tmp, &[("S1", "GATTACA", &["3M7B1S+T"])]);
let demux_inputs = Demux {
inputs: vec![r1_file],
read_structures: vec![ReadStructure::from_str("7B+T").unwrap()],
sample_metadata,
output_types: vec!['T', 'M'],
output: tmp.path().join("output").clone(),
unmatched_prefix: "unmatched".to_owned(),
max_mismatches: 1,
min_mismatch_delta: 2,
threads: 5,
compression_level: 5,
skip_reasons: vec![],
template_types: vec!['T'],
};
let output_dir = demux_inputs.output.clone();
demux_inputs.execute().unwrap();
let s1_r1 = read_fastq(&output_dir.join("S1.R1.fq.gz"));
let s1_u1 = read_fastq(&output_dir.join("S1.U1.fq.gz"));
assert_eq!(s1_r1.len(), 1);
assert_eq!(s1_u1.len(), 1);
assert_eq!(s1_u1[0].seq, b"AAA");
}
fn unmatched_short_read_demux(tmp: &TempDir, skip_reasons: Vec<SkipReason>) -> Demux {
let read_structures = vec![
ReadStructure::from_str("8B+T").unwrap(),
ReadStructure::from_str("8B+T").unwrap(),
];
let sample_metadata = metadata_file_with_rs(tmp, &[("S1", "AAAAAA", &["3B+T", "3B+T"])]);
let r1_file = fastq_file(tmp, "r1", "ex", &["TTT"]);
let r2_file = fastq_file(tmp, "r2", "ex", &["TTT"]);
Demux {
inputs: vec![r1_file, r2_file],
read_structures,
sample_metadata,
output_types: vec!['T', 'B'],
output: tmp.path().join("output"),
unmatched_prefix: "unmatched".to_owned(),
max_mismatches: 1,
min_mismatch_delta: 2,
threads: 5,
compression_level: 5,
skip_reasons,
template_types: vec!['T'],
}
}
#[test]
fn test_demux_per_sample_unmatched_short_read_aborts_without_flag() {
let tmp = TempDir::new().unwrap();
let demux_inputs = unmatched_short_read_demux(&tmp, vec![]);
let err = demux_inputs.execute().unwrap_err();
let msg = format!("{err:#}");
assert!(msg.contains("--skip-reasons too-few-bases"), "got: {msg}");
}
#[test]
fn test_demux_per_sample_unmatched_short_read_skipped_with_flag() {
let tmp = TempDir::new().unwrap();
let demux_inputs = unmatched_short_read_demux(&tmp, vec![SkipReason::TooFewBases]);
let output_dir = demux_inputs.output.clone();
demux_inputs.execute().unwrap();
let metrics_path = output_dir.join("demux-metrics.txt");
let metrics: Vec<DemuxMetric> = DelimFile::default().read_tsv(&metrics_path).unwrap();
let demuxed: usize = metrics.iter().map(|m| m.templates).sum();
assert_eq!(demuxed, 0, "no reads should be demuxed; the unmatched read is too short");
}
#[test]
fn test_resegment_zero_length_template_is_too_short() {
let read_set = ReadSet {
header: b"ex".to_vec(),
segments: vec![],
raw_per_input: vec![RawInput {
bases: b"ACGTACGT".to_vec(),
quals: b"FFFFFFFF".to_vec(),
}],
skip_reason: None,
};
let read_structures = vec![ReadStructure::from_str("8B+T").unwrap()];
let result = read_set.resegment(&read_structures);
assert!(
matches!(result, Err(ResegmentError::TooShort(_))),
"expected TooShort for a read with no template bases, got {result:?}",
);
}
#[test]
fn test_resegment_one_base_template_is_ok() {
let read_set = ReadSet {
header: b"ex".to_vec(),
segments: vec![],
raw_per_input: vec![RawInput {
bases: b"ACGTACGTT".to_vec(),
quals: b"FFFFFFFFF".to_vec(),
}],
skip_reason: None,
};
let read_structures = vec![ReadStructure::from_str("8B+T").unwrap()];
let resegmented = read_set.resegment(&read_structures).expect("should resegment");
let template: Vec<u8> =
resegmented.template_segments().flat_map(|s| s.seq.clone()).collect();
assert_eq!(template, b"T");
}
#[test]
fn test_demux_per_sample_zero_length_template_is_skipped() {
let tmp = TempDir::new().unwrap();
let sample_metadata = metadata_file_with_rs(&tmp, &[("S1", "ACGTACGT", &["8B+T"])]);
let r1_file = fastq_file(&tmp, "r1", "ex", &["ACGTACGT"]);
let output_dir = tmp.path().join("output");
let demux_inputs = Demux {
inputs: vec![r1_file],
read_structures: vec![ReadStructure::from_str("8B+T").unwrap()],
sample_metadata,
output_types: vec!['T', 'B'],
output: output_dir.clone(),
unmatched_prefix: "unmatched".to_owned(),
max_mismatches: 1,
min_mismatch_delta: 2,
threads: 5,
compression_level: 5,
skip_reasons: vec![SkipReason::TooFewBases],
template_types: vec!['T'],
};
demux_inputs.execute().unwrap();
let s1_r1 = read_fastq(&output_dir.join("S1.R1.fq.gz"));
assert!(s1_r1.is_empty(), "no zero-length template record should be written");
let metrics_path = output_dir.join("demux-metrics.txt");
let metrics: Vec<DemuxMetric> = DelimFile::default().read_tsv(&metrics_path).unwrap();
let demuxed: usize = metrics.iter().map(|m| m.templates).sum();
assert_eq!(demuxed, 0, "the boundary read must be skipped, not demuxed");
}
}