use crate::is_valid_iupac;
use anyhow::{Context, Result, anyhow, bail, ensure};
use fgoxide::io::Io;
use itertools::Itertools;
use read_structure::{ReadStructure, SegmentType};
use std::fmt::{self, Display};
use std::path::Path;
use std::str::FromStr;
/// Field delimiter (a tab byte) used to split the header and data rows of a sample metadata file.
const DEFAULT_FILE_DELIMETER: u8 = b'\t';
/// Header name of the column that holds each sample's identifier.
const SAMPLE_ID_HEADER: &str = "sample_id";
/// Header name of the column that holds each sample's barcode.
const BARCODE_HEADER: &str = "barcode";
/// Prefix of header columns that carry per-sample read structures; the rest of the column name
/// is parsed as a 1-based integer (e.g. `read_structure_1`).
const READ_STRUCTURE_PREFIX: &str = "read_structure_";
/// A single sample: its identifier, its barcode, and optionally its own read structures.
#[derive(Clone, Debug, PartialEq)]
pub struct Sample {
/// Identifier of the sample (the `sample_id` column when loaded from a metadata file).
pub sample_id: String,
/// Barcode bases of the sample (the `barcode` column when loaded from a metadata file).
pub barcode: String,
/// Read structures specific to this sample, or `None` when the sample has none of its own
/// and the caller-supplied defaults apply.
pub read_structures: Option<Vec<ReadStructure>>,
/// Position of the sample within its group; `SampleGroup::from_samples` reassigns it to the
/// sample's index in the input slice.
pub(crate) ordinal: usize,
}
/// Renders a sample as `Sample(<ordinal, zero-padded to 4>) - { name: <id>\tbarcode: <barcode> }`.
impl Display for Sample {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Only these three fields are rendered; `read_structures` is not part of the output.
        let Self { ordinal, sample_id, barcode, .. } = self;
        f.write_fmt(format_args!(
            "Sample({:04}) - {{ name: {}\tbarcode: {} }}",
            ordinal, sample_id, barcode
        ))
    }
}
impl Sample {
    /// Creates a sample that has no per-sample read structures.
    ///
    /// # Panics
    /// Panics under the same conditions as [`Sample::with_read_structures`].
    #[must_use]
    pub fn new(ordinal: usize, name: String, barcode: String) -> Self {
        let read_structures = None;
        Self::with_read_structures(ordinal, name, barcode, read_structures)
    }

    /// Creates a sample, optionally carrying its own read structures.
    ///
    /// # Panics
    /// Panics if `name` is empty, if `barcode` is empty, or if any byte of `barcode` is rejected
    /// by `is_valid_iupac`.
    #[must_use]
    pub fn with_read_structures(
        ordinal: usize,
        name: String,
        barcode: String,
        read_structures: Option<Vec<ReadStructure>>,
    ) -> Self {
        assert!(!name.is_empty(), "Sample name cannot be empty");
        assert!(!barcode.is_empty(), "Sample barcode cannot be empty");
        let every_base_valid = barcode.bytes().all(|base| is_valid_iupac(base));
        assert!(
            every_base_valid,
            "All sample barcode bases must be one of A, C, G, T, U, R, Y, S, W, K, M, D, V, H, B, N"
        );
        Self { ordinal, sample_id: name, barcode, read_structures }
    }

    /// Returns the minimal metadata header line: the sample-id and barcode column names joined
    /// by a tab.
    #[must_use]
    pub fn deserialize_header_line() -> String {
        [SAMPLE_ID_HEADER, BARCODE_HEADER].join("\t")
    }
}
/// A collection of samples; `SampleGroup::from_samples` validates the samples before building one.
#[derive(Clone, Debug, PartialEq)]
pub struct SampleGroup {
/// The samples in the group; when built via `from_samples`, each sample's `ordinal` equals its
/// index in this vector.
pub samples: Vec<Sample>,
}
/// Renders the group as a `SampleGroup { ... }` block with one sample per line.
impl Display for SampleGroup {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        writeln!(f, "SampleGroup {{")?;
        // Stop at the first formatting error, exactly like a `?` inside a loop would.
        self.samples.iter().try_for_each(|sample| writeln!(f, " {sample}"))?;
        writeln!(f, "}}")
    }
}
impl SampleGroup {
pub fn from_samples(samples: &[Sample]) -> Result<Self> {
ensure!(!samples.is_empty(), "Must provide one or more sample");
ensure!(
samples.iter().map(|s| &s.sample_id).all_unique(),
"Each sample name must be unique, duplicate identified"
);
ensure!(
samples.iter().map(|s| &s.barcode).all_unique(),
"Each sample barcode must be unique, duplicate identified",
);
let first_barcode_length = samples[0].barcode.len();
ensure!(
samples.iter().map(|s| &s.barcode).all(|b| b.len() == first_barcode_length),
"All barcodes must have the same length",
);
for sample in samples {
let Some(rs) = sample.read_structures.as_ref() else { continue };
let mut b_len: usize = 0;
for seg in rs.iter().flat_map(|r| r.segments_by_type(SegmentType::SampleBarcode)) {
let len = seg.length.ok_or_else(|| {
anyhow!(
"Sample {}: sample-barcode (B) segments in per-sample read structures \
must be fixed length",
sample.sample_id,
)
})?;
b_len += len;
}
ensure!(
b_len == sample.barcode.len(),
"Sample {}: total sample-barcode (B) length across per-sample read structures \
is {} but barcode column has {} bases",
sample.sample_id,
b_len,
sample.barcode.len(),
);
}
Ok(Self {
samples: samples
.iter()
.enumerate()
.map(|(ordinal, sample)| {
Sample::with_read_structures(
ordinal,
sample.sample_id.clone(),
sample.barcode.clone(),
sample.read_structures.clone(),
)
})
.collect(),
})
}
/// Loads a [`SampleGroup`] from a tab-delimited sample metadata file.
///
/// The first non-blank line is the header. It must contain the `sample_id` and `barcode`
/// columns and may contain `read_structure_<n>` columns (1-based and contiguous); when any are
/// present their count must equal `globals.len()`. Blank lines are skipped, a leading UTF-8 BOM
/// on the header is ignored, and trailing `\r` characters are stripped from every line. Blank
/// per-sample read-structure cells fall back to the matching entry of `globals`.
///
/// # Errors
/// Returns an error when the file cannot be read, has no header or no sample rows, when the
/// header lacks a required column or has malformed `read_structure_<n>` columns, when a row has
/// the wrong number of columns, an empty `sample_id` or `barcode`, a barcode containing a
/// non-IUPAC base, or an invalid read structure, or when [`SampleGroup::from_samples`] rejects
/// the parsed samples. Row numbers in error messages are 1-based physical line numbers.
pub fn from_file<P: AsRef<Path>>(path: P, globals: &[ReadStructure]) -> Result<SampleGroup> {
    let path = path.as_ref();
    let io = Io::default();
    let lines = io
        .read_lines(path)
        .with_context(|| format!("failed to read sample metadata file {path:?}"))?;

    // Number the lines BEFORE dropping blank ones so that row numbers reported in errors refer
    // to physical lines of the file even when blank lines are interleaved.
    let mut iter = lines.into_iter().enumerate().filter(|(_, l)| !l.trim().is_empty());
    let (_, header) = iter.next().ok_or_else(|| {
        anyhow!(
            "sample metadata file {path:?} is empty (expected header line {})",
            Sample::deserialize_header_line(),
        )
    })?;
    let header = strip_bom_and_cr(&header);
    let header_fields: Vec<&str> = header.split(DEFAULT_FILE_DELIMETER as char).collect();
    let sample_id_idx =
        header_fields.iter().position(|c| *c == SAMPLE_ID_HEADER).ok_or_else(|| {
            anyhow!("sample metadata header is missing column `{SAMPLE_ID_HEADER}`")
        })?;
    let barcode_idx =
        header_fields.iter().position(|c| *c == BARCODE_HEADER).ok_or_else(|| {
            anyhow!("sample metadata header is missing column `{BARCODE_HEADER}`")
        })?;

    // Collect `(n, column index)` for every `read_structure_<n>` header column.
    let mut rs_columns: Vec<(usize, usize)> = Vec::new();
    for (idx, name) in header_fields.iter().enumerate() {
        if let Some(suffix) = name.strip_prefix(READ_STRUCTURE_PREFIX) {
            let n: usize = suffix
                .parse()
                .with_context(|| format!("metadata column `{name}` has non-integer suffix"))?;
            ensure!(n >= 1, "metadata column `{name}` must use 1-based indexing");
            rs_columns.push((n, idx));
        }
    }
    // After sorting by `n`, the columns must be exactly 1, 2, 3, ... with no gaps or repeats.
    rs_columns.sort_by_key(|(n, _)| *n);
    for (i, (n, _)) in rs_columns.iter().enumerate() {
        ensure!(
            *n == i + 1,
            "per-sample read structure columns must be contiguous starting at \
             `{READ_STRUCTURE_PREFIX}1` (found `{READ_STRUCTURE_PREFIX}{n}` at position {})",
            i + 1,
        );
    }
    if !rs_columns.is_empty() {
        ensure!(
            rs_columns.len() == globals.len(),
            "metadata file has {} `{READ_STRUCTURE_PREFIX}<n>` column(s) but \
             `--read-structures` has {} entry/entries",
            rs_columns.len(),
            globals.len(),
        );
    }

    let mut samples: Vec<Sample> = Vec::new();
    for (line_idx, line) in iter {
        // `line_idx` is the 0-based physical index of the line within the file.
        let row_no = line_idx + 1;
        let line = line.trim_end_matches('\r');
        let cols: Vec<&str> = line.split(DEFAULT_FILE_DELIMETER as char).collect();
        ensure!(
            cols.len() == header_fields.len(),
            "sample metadata row {row_no} has {} columns but header has {}",
            cols.len(),
            header_fields.len(),
        );
        let sample_id = cols[sample_id_idx];
        let barcode = cols[barcode_idx];
        // `Sample::with_read_structures` asserts on these conditions; file contents are user
        // input, so report them as errors here instead of letting the constructor panic.
        ensure!(
            !sample_id.is_empty(),
            "sample metadata row {row_no} has an empty `{SAMPLE_ID_HEADER}`",
        );
        ensure!(
            !barcode.is_empty(),
            "sample metadata row {row_no} (sample `{sample_id}`) has an empty `{BARCODE_HEADER}`",
        );
        ensure!(
            barcode.bytes().all(|b| is_valid_iupac(b)),
            "sample metadata row {row_no} (sample `{sample_id}`): barcode `{barcode}` contains \
             a base that is not a valid IUPAC code",
        );
        let read_structures =
            parse_per_sample_read_structures(row_no, &cols, &rs_columns, globals)?;
        samples.push(Sample::with_read_structures(
            samples.len(),
            sample_id.to_owned(),
            barcode.to_owned(),
            read_structures,
        ));
    }
    if samples.is_empty() {
        bail!("sample metadata file {path:?} contained no sample rows");
    }
    Self::from_samples(&samples)
}
/// Returns `true` when at least one sample in the group carries its own read structures.
#[must_use]
pub fn has_per_sample_read_structures(&self) -> bool {
    self.samples.iter().map(|sample| &sample.read_structures).any(Option::is_some)
}
/// Computes, for each input position, the longest fixed-length pre-template prefix across all
/// samples.
///
/// Each sample contributes its own read structures when it has them and `default_structures`
/// otherwise. The returned vector has one entry per element of `default_structures`.
///
/// # Errors
/// Returns an error when a sample's number of read structures differs from
/// `default_structures.len()`, or when any segment before the template in a consulted read
/// structure has no fixed length.
pub fn matching_prefix_lens(&self, default_structures: &[ReadStructure]) -> Result<Vec<usize>> {
    let input_count = default_structures.len();
    let mut window_lens = vec![0usize; input_count];
    for sample in &self.samples {
        // Remember where the structures came from so the error context can name the source.
        let (structures, has_own) = match sample.read_structures.as_deref() {
            Some(own) => (own, true),
            None => (default_structures, false),
        };
        ensure!(
            structures.len() == input_count,
            "sample {}: number of read structures ({}) does not match number of inputs ({})",
            sample.sample_id,
            structures.len(),
            input_count,
        );
        // The lengths were just checked to be equal, so the zip visits every input position.
        for (i, (rs, slot)) in structures.iter().zip(window_lens.iter_mut()).enumerate() {
            let plen = pre_template_fixed_len(rs).with_context(|| {
                let source = if has_own {
                    format!("sample {}'s `read_structure_{}`", sample.sample_id, i + 1)
                } else {
                    format!(
                        "the `--read-structures` fallback for sample {} (input {})",
                        sample.sample_id,
                        i + 1,
                    )
                };
                format!(
                    "per-sample demultiplexing requires a fixed-length matching window, so \
                     every segment before the template in {source} must have a fixed length"
                )
            })?;
            *slot = (*slot).max(plen);
        }
    }
    Ok(window_lens)
}
/// Builds one matching pattern per sample, laid out over the fixed-length windows given by
/// `prefix_lens` (one window per read structure, concatenated in order).
///
/// Each sample uses its own `read_structures` when present and `default_structures` otherwise.
/// Within a window, positions covered by sample-barcode (B) segments before the template are
/// copied from the sample's barcode; every other position is filled with `N`.
///
/// # Errors
/// Returns an error when `default_structures` and `prefix_lens` differ in length, when a sample's
/// number of read structures differs from `prefix_lens.len()`, when a B segment appears at or
/// after the first template segment, when the declared B length differs from the barcode length,
/// when a pre-template segment has no fixed length, when a B segment does not fit entirely in
/// its window, or when the barcode is not fully consumed.
pub fn build_matching_patterns(
&self,
default_structures: &[ReadStructure],
prefix_lens: &[usize],
) -> Result<Vec<Vec<u8>>> {
ensure!(
default_structures.len() == prefix_lens.len(),
"expected one prefix length per input FASTQ"
);
// Sum of all window lengths; used only to pre-size each pattern buffer.
let total_len: usize = prefix_lens.iter().sum();
let mut patterns = Vec::with_capacity(self.samples.len());
for sample in &self.samples {
let rs_for_sample = sample.read_structures.as_deref().unwrap_or(default_structures);
ensure!(
rs_for_sample.len() == prefix_lens.len(),
"sample {}: number of read structures ({}) does not match number of inputs ({})",
sample.sample_id,
rs_for_sample.len(),
prefix_lens.len(),
);
// Compare the total number of B segments with the number seen before the first template
// segment; a difference means some B segment sits at or after the template.
for (input_idx, rs) in rs_for_sample.iter().enumerate() {
let total_b = rs.segments_by_type(SegmentType::SampleBarcode).count();
let b_before_template = rs
.iter()
.take_while(|seg| seg.kind != SegmentType::Template)
.filter(|seg| seg.kind == SegmentType::SampleBarcode)
.count();
ensure!(
total_b == b_before_template,
"sample {}: all sample-barcode (B) segments must precede the template (T) in \
read structure {} (input {})",
sample.sample_id,
rs,
input_idx + 1,
);
}
// B segments without a fixed length contribute 0 to this sum.
let expected_barcode_len: usize = rs_for_sample
.iter()
.flat_map(|rs| rs.segments_by_type(SegmentType::SampleBarcode))
.map(|seg| seg.length.unwrap_or(0))
.sum();
ensure!(
expected_barcode_len == sample.barcode.len(),
"sample {}: {}read structure(s) declare {} sample-barcode (B) base(s) but the \
barcode column has {} base(s)",
sample.sample_id,
if sample.read_structures.is_none() {
"(using --read-structures fallback) "
} else {
""
},
expected_barcode_len,
sample.barcode.len(),
);
let mut pattern = Vec::with_capacity(total_len);
// Index of the next barcode base to copy; it advances across read structures, so the
// barcode is consumed in the order the B segments are visited.
let mut barcode_cursor = 0usize;
let barcode_bytes = sample.barcode.as_bytes();
for (rs, &prefix_len) in rs_for_sample.iter().zip(prefix_lens) {
// Number of window positions written so far for this read structure.
let mut filled = 0usize;
for seg in rs.iter() {
// Only segments before the first template segment take part in the window.
if seg.kind == SegmentType::Template {
break;
}
let len = seg.length.ok_or_else(|| {
anyhow!(
"sample {}: non-template segment {seg} in read structure must have a \
fixed length",
sample.sample_id,
)
})?;
// `filled` never exceeds `prefix_len`: each step adds at most `remaining`.
let remaining = prefix_len - filled;
let take = len.min(remaining);
if seg.kind == SegmentType::SampleBarcode {
// A B segment must fit entirely inside the window; it is never truncated.
ensure!(
take == len,
"sample {}: sample-barcode segment {seg} crosses the matching \
window boundary (segment length {}, but only {} bases remain in \
the {}-base window)",
sample.sample_id,
len,
remaining,
prefix_len,
);
ensure!(
barcode_cursor + len <= barcode_bytes.len(),
"sample {}: barcode ({} bases) is shorter than the total \
sample-barcode length required by its read structure",
sample.sample_id,
barcode_bytes.len(),
);
pattern.extend_from_slice(
&barcode_bytes[barcode_cursor..barcode_cursor + len],
);
barcode_cursor += len;
} else {
// Non-barcode segments are written as `N`, truncated to the window if needed.
pattern.extend(std::iter::repeat_n(b'N', take));
}
filled += take;
// A truncated segment means the window is full; stop walking this read structure.
if take < len {
break;
}
}
// Pad with `N` when the pre-template segments are shorter than the window.
if filled < prefix_len {
pattern.extend(std::iter::repeat_n(b'N', prefix_len - filled));
}
}
// Every barcode base must have been placed into some window.
ensure!(
barcode_cursor == sample.barcode.len(),
"sample {}: only consumed {} of {} barcode bases when building matching pattern",
sample.sample_id,
barcode_cursor,
sample.barcode.len(),
);
patterns.push(pattern);
}
Ok(patterns)
}
}
fn parse_per_sample_read_structures(
row_no: usize,
cols: &[&str],
rs_columns: &[(usize, usize)],
globals: &[ReadStructure],
) -> Result<Option<Vec<ReadStructure>>> {
if rs_columns.is_empty() {
return Ok(None);
}
let mut entries: Vec<Option<ReadStructure>> = Vec::with_capacity(rs_columns.len());
for (n, idx) in rs_columns {
let raw = cols[*idx].trim();
if raw.is_empty() {
entries.push(None);
} else {
let rs = ReadStructure::from_str(raw).with_context(|| {
format!(
"sample metadata row {row_no} column `{READ_STRUCTURE_PREFIX}{n}` has \
invalid read structure `{raw}`",
)
})?;
for seg in rs.segments_by_type(SegmentType::SampleBarcode) {
ensure!(
seg.length.is_some(),
"sample metadata row {row_no} column `{READ_STRUCTURE_PREFIX}{n}`: \
sample-barcode segment {seg} must be fixed length (variable-length `+B` \
is not supported in per-sample read structures)",
);
}
entries.push(Some(rs));
}
}
if entries.iter().all(Option::is_none) {
return Ok(None);
}
let resolved: Vec<ReadStructure> = entries
.into_iter()
.enumerate()
.map(|(i, e)| e.unwrap_or_else(|| globals[i].clone()))
.collect();
Ok(Some(resolved))
}
/// Removes one leading UTF-8 byte-order mark (U+FEFF), if present, and all trailing `\r`
/// characters from `s`.
fn strip_bom_and_cr(s: &str) -> &str {
    let without_bom = match s.strip_prefix('\u{FEFF}') {
        Some(rest) => rest,
        None => s,
    };
    without_bom.trim_end_matches('\r')
}
/// Sums the lengths of the segments that precede the first template segment of `rs`; when `rs`
/// has no template segment, all segments are summed.
///
/// # Errors
/// Returns an error when one of those segments has no fixed length.
fn pre_template_fixed_len(rs: &ReadStructure) -> Result<usize> {
    rs.iter()
        .take_while(|seg| seg.kind != SegmentType::Template)
        .try_fold(0usize, |total, seg| -> Result<usize> {
            let fixed = seg.length.ok_or_else(|| {
                anyhow!("non-template segment {seg} in read structure {rs} must have a fixed length")
            })?;
            Ok(total + fixed)
        })
}
// Unit tests for `Sample` construction, `SampleGroup` validation, metadata-file parsing, and
// matching-pattern construction.
#[cfg(test)]
mod tests {
use super::*;
use fgoxide::io::Io;
use std::str::FromStr;
use tempfile::TempDir;
// A two-sample file with only the required columns parses and has no per-sample read structures.
#[test]
fn test_reading_from_tsv_file() {
let lines = vec![
Sample::deserialize_header_line(),
"sample1\tGATTACA".to_owned(),
"sample2\tCATGCTA".to_owned(),
];
let tempdir = TempDir::new().unwrap();
let f1 = tempdir.path().join("sample_metadata.tsv");
let io = Io::default();
io.write_lines(&f1, &lines).unwrap();
let samples_metadata = SampleGroup::from_file(&f1, &[]).unwrap();
assert!(samples_metadata.samples[0].sample_id == "sample1");
assert!(samples_metadata.samples[1].sample_id == "sample2");
assert!(samples_metadata.samples[0].barcode == "GATTACA");
assert!(samples_metadata.samples[1].barcode == "CATGCTA");
assert!(!samples_metadata.has_per_sample_read_structures());
}
// Trailing empty lines are ignored.
#[test]
fn test_reading_from_file_with_empty_lines_at_end() {
let lines = vec![
Sample::deserialize_header_line(),
"sample1\tGATTACA".to_owned(),
"sample2\tCATGCTA".to_owned(),
String::new(),
String::new(),
];
let tempdir = TempDir::new().unwrap();
let f1 = tempdir.path().join("sample_metadata.tsv");
let io = Io::default();
io.write_lines(&f1, &lines).unwrap();
let samples_metadata = SampleGroup::from_file(&f1, &[]).unwrap();
assert!(samples_metadata.samples[0].sample_id == "sample1");
assert!(samples_metadata.samples[1].sample_id == "sample2");
assert!(samples_metadata.samples[0].barcode == "GATTACA");
assert!(samples_metadata.samples[1].barcode == "CATGCTA");
}
// `N` bases are accepted in a barcode (construction must not panic).
#[test]
fn test_new_sample_non_agct_bases_in_barcode_allowed() {
let name = "s_1_example_name".to_owned();
let barcode = "GATTANN".to_owned();
let ordinal = 0;
let _sample = Sample::new(ordinal, name, barcode);
}
// A comma-delimited file is rejected because the tab-split header lacks `sample_id`.
#[test]
fn test_tsv_file_delim_error() {
let lines: Vec<String> = ["sample_id,barcode", "sample1,GATTACA", "sample2,CATGCTA"]
.iter()
.map(|&s| s.into())
.collect();
let tempdir = TempDir::new().unwrap();
let f1 = tempdir.path().join("sample_metadata.tsv");
let io = Io::default();
io.write_lines(&f1, &lines).unwrap();
let err = SampleGroup::from_file(&f1, &[]).unwrap_err();
let msg = format!("{err:#}");
assert!(msg.contains("missing column `sample_id`"), "got: {msg}");
}
// Without a header, the first data row is read as the header and fails the column lookup.
#[test]
fn test_reading_from_file_with_no_header() {
let lines = vec!["sample1\tGATTACA", "sample2\tCATGCTA"];
let tempdir = TempDir::new().unwrap();
let f1 = tempdir.path().join("sample_metadata.tsv");
let io = Io::default();
io.write_lines(&f1, &lines).unwrap();
let err = SampleGroup::from_file(&f1, &[]).unwrap_err();
let msg = format!("{err:#}");
assert!(msg.contains("missing column `sample_id`"), "got: {msg}");
}
// A header with no data rows is an error.
#[test]
fn test_reading_header_only_file() {
let lines = vec![Sample::deserialize_header_line()];
let tempdir = TempDir::new().unwrap();
let f1 = tempdir.path().join("sample_metadata.tsv");
let io = Io::default();
io.write_lines(&f1, &lines).unwrap();
let err = SampleGroup::from_file(&f1, &[]).unwrap_err();
let msg = format!("{err:#}");
assert!(msg.contains("contained no sample rows"), "got: {msg}");
}
// A file containing only an empty line is an error.
#[test]
fn test_reading_empty_file() {
let lines = vec![""];
let tempdir = TempDir::new().unwrap();
let f1 = tempdir.path().join("sample_metadata.tsv");
let io = Io::default();
io.write_lines(&f1, &lines).unwrap();
let err = SampleGroup::from_file(&f1, &[]).unwrap_err();
let msg = format!("{err:#}");
assert!(msg.contains("is empty") || msg.contains("missing column"), "got: {msg}");
}
// Duplicate barcodes in a file are rejected.
#[test]
fn test_reading_from_file_with_duplicate_barcodes_errors() {
let lines = vec!["sample_id\tbarcode", "sample1\tGATTACA", "sample2\tGATTACA"];
let tempdir = TempDir::new().unwrap();
let f1 = tempdir.path().join("sample_metadata.tsv");
let io = Io::default();
io.write_lines(&f1, &lines).unwrap();
let err = SampleGroup::from_file(&f1, &[]).unwrap_err();
let msg = format!("{err:#}");
assert!(msg.contains("Each sample barcode must be unique"), "got: {msg}");
}
// A missing file yields a read error carrying the added context message.
#[test]
fn test_reading_non_existent_file() {
let tempdir = TempDir::new().unwrap();
let f1 = tempdir.path().join("sample_metadata.tsv");
let err = SampleGroup::from_file(&f1, &[]).unwrap_err();
let msg = format!("{err:#}");
assert!(msg.contains("failed to read sample metadata file"), "got: {msg}");
}
// `Sample::new` stores its arguments and leaves `read_structures` as `None`.
#[test]
fn test_new_sample_success() {
let name = "s_1_example_name".to_owned();
let barcode = "GATTACA".to_owned();
let ordinal = 0;
let sample = Sample::new(ordinal, name.clone(), barcode.clone());
assert_eq!(
Sample { sample_id: name, barcode, read_structures: None, ordinal },
sample,
"Sample differed from expectation"
);
}
// An empty sample name panics.
#[test]
#[should_panic(expected = "Sample name cannot be empty")]
fn test_new_sample_fail1_empty_sample_name() {
let name = String::new();
let barcode = "GATTACA".to_owned();
let ordinal = 0;
let _sample = Sample::new(ordinal, name, barcode);
}
// An empty barcode panics.
#[test]
#[should_panic(expected = "Sample barcode cannot be empty")]
fn test_new_sample_fail2_empty_barcode() {
let name = "s_1_example_name".to_owned();
let barcode = String::new();
let ordinal = 0;
let _sample = Sample::new(ordinal, name, barcode);
}
// A single sample forms a valid group.
#[test]
fn test_from_samples_sample_group_pass1_single_sample() {
let sample1 = Sample::new(0, "sample_1".to_owned(), "GATTACA".to_owned());
let samples_vec = vec![sample1.clone()];
let sample_group = SampleGroup::from_samples(&samples_vec).unwrap();
assert_eq!(sample_group, SampleGroup { samples: vec![sample1] });
}
// Several samples with unique names and barcodes form a valid group.
#[test]
fn test_from_samples_sample_group_pass2_multi_unique_samples() {
let sample1 = Sample::new(0, "sample_1".to_owned(), "GATTACA".to_owned());
let sample2 = Sample::new(1, "sample_2".to_owned(), "CATGGAT".to_owned());
let samples_vec = vec![sample1.clone(), sample2.clone()];
let sample_group = SampleGroup::from_samples(&samples_vec).unwrap();
assert_eq!(sample_group, SampleGroup { samples: vec![sample1, sample2] });
}
// `from_samples` overwrites ordinals with each sample's position in the slice.
#[test]
fn test_from_samples_sample_group_pass3_ordinal_values_will_be_changed_by_new() {
let sample1 = Sample::new(0, "sample_1".to_owned(), "GATTACA".to_owned());
let sample2_before = Sample::new(2, "sample_2".to_owned(), "CATGGAT".to_owned());
let sample2_after = Sample::new(1, "sample_2".to_owned(), "CATGGAT".to_owned());
let samples_vec = vec![sample1.clone(), sample2_before];
let sample_group = SampleGroup::from_samples(&samples_vec).unwrap();
assert_eq!(sample_group, SampleGroup { samples: vec![sample1, sample2_after] });
}
// An empty slice is rejected.
#[test]
fn test_from_samples_sample_group_fail1_no_samples() {
let samples = vec![];
let err = SampleGroup::from_samples(&samples).unwrap_err();
assert!(err.to_string().contains("Must provide one or more sample"), "got: {err:#}");
}
// Duplicate sample names are rejected.
#[test]
fn test_from_samples_sample_group_fail2_duplicate_sample_names() {
let samples = vec![
Sample::new(0, "sample_1".to_owned(), "GATTACA".to_owned()),
Sample::new(0, "sample_1".to_owned(), "CATGGAT".to_owned()),
];
let err = SampleGroup::from_samples(&samples).unwrap_err();
assert!(
err.to_string().contains("Each sample name must be unique, duplicate identified"),
"got: {err:#}",
);
}
// Duplicate barcodes are rejected.
#[test]
fn test_from_samples_sample_group_fail3_duplicate_barcodes() {
let samples = vec![
Sample::new(0, "sample_1".to_owned(), "GATTACA".to_owned()),
Sample::new(0, "sample_2".to_owned(), "GATTACA".to_owned()),
];
let err = SampleGroup::from_samples(&samples).unwrap_err();
assert!(
err.to_string().contains("Each sample barcode must be unique, duplicate identified"),
"got: {err:#}",
);
}
// Barcodes of different lengths are rejected.
#[test]
fn test_from_samples_sample_group_fail4_barcodes_of_different_lengths() {
let samples = vec![
Sample::new(0, "sample_1".to_owned(), "GATTACA".to_owned()),
Sample::new(0, "sample_2".to_owned(), "CATGGA".to_owned()),
];
let err = SampleGroup::from_samples(&samples).unwrap_err();
assert!(err.to_string().contains("All barcodes must have the same length"), "got: {err:#}");
}
// Helper: parses a read structure string, panicking when it is invalid.
fn make_rs(s: &str) -> ReadStructure {
ReadStructure::from_str(s).unwrap()
}
// Helper: builds a sample (ordinal 0) that carries the given per-sample read structures.
fn sample_with_rs(name: &str, barcode: &str, structures: &[&str]) -> Sample {
let rs = structures.iter().map(|s| make_rs(s)).collect();
Sample::with_read_structures(0, name.to_owned(), barcode.to_owned(), Some(rs))
}
// Helper: writes `lines` to `metadata.tsv` inside `tempdir` and returns the file's path.
fn write_metadata(tempdir: &TempDir, lines: &[String]) -> std::path::PathBuf {
let f1 = tempdir.path().join("metadata.tsv");
Io::default().write_lines(&f1, lines).unwrap();
f1
}
// `read_structure_<n>` columns in the file become per-sample read structures.
#[test]
fn test_per_sample_read_structures_round_trip_via_metadata() {
let lines = vec![
"sample_id\tbarcode\tread_structure_1\tread_structure_2".to_owned(),
"S1\tGATTACAACGTACG\t3M7B1S+T\t3M7B1S+T".to_owned(),
"S2\tGGGGGGGTTTTTTT\t3M1S7B1S+T\t3M1S7B1S+T".to_owned(),
];
let tempdir = TempDir::new().unwrap();
let f1 = write_metadata(&tempdir, &lines);
let globals = vec![make_rs("3M9B+T"), make_rs("9B+T")];
let group = SampleGroup::from_file(&f1, &globals).unwrap();
assert!(group.has_per_sample_read_structures());
let s1_rs = group.samples[0].read_structures.as_ref().unwrap();
assert_eq!(s1_rs.len(), 2);
let s2_rs = group.samples[1].read_structures.as_ref().unwrap();
assert_eq!(s2_rs.len(), 2);
}
// Samples may split the same total barcode length across B segments differently.
#[test]
fn test_per_sample_read_structures_signatures_may_differ_across_samples() {
let s1 = sample_with_rs("S1", "GATTACAACGTACG", &["3M14B+T"]);
let s2 = sample_with_rs("S2", "TTTTTTTGGGGGGG", &["3M7B7B+T"]);
let group = SampleGroup::from_samples(&[s1, s2]).unwrap();
assert!(group.has_per_sample_read_structures());
}
// A group may mix samples with and without per-sample read structures.
#[test]
fn test_per_sample_read_structures_mixed_with_global_only_samples() {
let s1 = sample_with_rs("S1", "GATTACA", &["3M7B1S+T"]);
let s2 = Sample::new(0, "S2".to_owned(), "CCCCCCC".to_owned());
let group = SampleGroup::from_samples(&[s1, s2]).unwrap();
assert!(group.has_per_sample_read_structures());
assert!(group.samples[0].read_structures.is_some());
assert!(group.samples[1].read_structures.is_none());
}
// Total B length (14) that differs from the barcode length (7) is rejected.
#[test]
fn test_per_sample_read_structures_barcode_length_mismatch() {
let s1 = sample_with_rs("S1", "GATTACA", &["3M7B1S+T", "3M7B1S+T"]);
let err = SampleGroup::from_samples(&[s1]).unwrap_err();
assert!(err.to_string().contains("barcode column has"), "got: {err:#}");
}
// The window length is the maximum pre-template length across samples (11 vs 12 here).
#[test]
fn test_matching_prefix_lens_uses_max_across_samples() {
let s1 = sample_with_rs("S1", "GATTACA", &["3M7B1S+T"]);
let s2 = sample_with_rs("S2", "GGGGGGG", &["3M1S7B1S+T"]);
let group = SampleGroup::from_samples(&[s1, s2]).unwrap();
let defaults = vec![make_rs("3M9B+T")];
let lens = group.matching_prefix_lens(&defaults).unwrap();
assert_eq!(lens, vec![12]);
}
// Patterns place barcode bases at each sample's B offsets and `N` everywhere else.
#[test]
fn test_build_matching_patterns_codec_two_samples() {
let s1 = sample_with_rs("S1", "GATTACA", &["3M7B1S+T"]);
let s2 = sample_with_rs("S2", "GGGGGGG", &["3M1S7B1S+T"]);
let group = SampleGroup::from_samples(&[s1, s2]).unwrap();
let defaults = vec![make_rs("3M9B+T")];
let lens = group.matching_prefix_lens(&defaults).unwrap();
let patterns = group.build_matching_patterns(&defaults, &lens).unwrap();
assert_eq!(patterns[0], b"NNNGATTACANN");
assert_eq!(patterns[1], b"NNNNGGGGGGGN");
}
// Samples without their own read structures use the defaults.
#[test]
fn test_build_matching_patterns_falls_back_to_defaults_when_no_per_sample() {
let s1 = Sample::new(0, "S1".to_owned(), "GATTACA".to_owned());
let group = SampleGroup::from_samples(&[s1]).unwrap();
let defaults = vec![make_rs("3M7B1S+T")];
let lens = group.matching_prefix_lens(&defaults).unwrap();
assert_eq!(lens, vec![11]);
let patterns = group.build_matching_patterns(&defaults, &lens).unwrap();
assert_eq!(patterns[0], b"NNNGATTACAN");
}
// With two inputs, the per-input windows are concatenated in input order.
#[test]
fn test_build_matching_patterns_dual_input_concatenated() {
let s1 = sample_with_rs("S1", "GATTACAACGTACG", &["3M7B1S+T", "7B+T"]);
let s2 = sample_with_rs("S2", "GGGGGGGTTTTTTT", &["3M1S7B1S+T", "1S7B+T"]);
let group = SampleGroup::from_samples(&[s1, s2]).unwrap();
let defaults = vec![make_rs("3M9B+T"), make_rs("9B+T")];
let lens = group.matching_prefix_lens(&defaults).unwrap();
assert_eq!(lens, vec![12, 8]);
let patterns = group.build_matching_patterns(&defaults, &lens).unwrap();
let mut expected_s1 = b"NNNGATTACANN".to_vec();
expected_s1.extend_from_slice(b"ACGTACGN");
assert_eq!(patterns[0], expected_s1);
let mut expected_s2 = b"NNNNGGGGGGGN".to_vec();
expected_s2.extend_from_slice(b"NTTTTTTT");
assert_eq!(patterns[1], expected_s2);
}
// A window (5) too short to hold the whole B segment is an error.
#[test]
fn test_build_matching_patterns_b_segment_crossing_window_errors() {
let s1 = sample_with_rs("S1", "GATTACA", &["3M7B1S+T"]);
let group = SampleGroup::from_samples(&[s1]).unwrap();
let defaults = vec![make_rs("3M7B1S+T")];
let lens = vec![5usize]; let err = group.build_matching_patterns(&defaults, &lens).unwrap_err();
let msg = format!("{err:#}");
assert!(msg.contains("crosses the matching window boundary"), "got: {msg}");
}
// The group is built directly (bypassing `from_samples`) so the short barcode reaches
// `build_matching_patterns`.
#[test]
fn test_build_matching_patterns_barcode_shorter_than_b_segments_errors() {
let s1 = sample_with_rs("S1", "GAT", &["3M7B1S+T"]);
let group = SampleGroup { samples: vec![s1] };
let defaults = vec![make_rs("3M7B1S+T")];
let lens = vec![11usize];
let err = group.build_matching_patterns(&defaults, &lens).unwrap_err();
let msg = format!("{err:#}");
assert!(
msg.contains("declare 7 sample-barcode (B) base(s) but the barcode column has 3"),
"got: {msg}",
);
}
// A B segment after the template is rejected.
#[test]
fn test_build_matching_patterns_barcode_after_template_errors() {
let s1 = sample_with_rs("S1", "ACGTACGTACGT", &["4B10T8B"]);
let group = SampleGroup::from_samples(&[s1]).unwrap();
let defaults = vec![make_rs("4B10T8B")];
let lens = group.matching_prefix_lens(&defaults).unwrap();
let err = group.build_matching_patterns(&defaults, &lens).unwrap_err();
let msg = format!("{err:#}");
assert!(msg.contains("must precede the template"), "got: {msg}");
}
// A sample relying on the defaults whose barcode length disagrees with them is rejected, and
// the message names the fallback.
#[test]
fn test_build_matching_patterns_global_only_barcode_mismatch_errors() {
let s1 = sample_with_rs("S1", "GATTAC", &["3M6B1S+T"]);
let s2 = Sample::new(0, "S2".to_owned(), "CCCCCC".to_owned());
let group = SampleGroup::from_samples(&[s1, s2]).unwrap();
let defaults = vec![make_rs("3M7B1S+T")];
let lens = group.matching_prefix_lens(&defaults).unwrap();
let err = group.build_matching_patterns(&defaults, &lens).unwrap_err();
let msg = format!("{err:#}");
assert!(msg.contains("--read-structures fallback") && msg.contains("S2"), "got: {msg}",);
}
// A variable-length segment with no template after it makes the window length undefined.
#[test]
fn test_matching_prefix_lens_variable_pre_template_errors() {
let s1 = sample_with_rs("S1", "ACGTACGT", &["8B+M"]);
let group = SampleGroup::from_samples(&[s1]).unwrap();
let defaults = vec![make_rs("8B+M")];
let err = group.matching_prefix_lens(&defaults).unwrap_err();
let msg = format!("{err:#}");
assert!(msg.contains("fixed-length matching window"), "got: {msg}");
}
// Blank read-structure cells are filled from the globals at the same position.
#[test]
fn test_per_cell_fallback_uses_globals_for_blank_cells() {
let lines = vec![
"sample_id\tbarcode\tread_structure_1\tread_structure_2".to_owned(),
"S1\tGATTACAGGGGGGG\t3M7B1S+T\t".to_owned(),
"S2\tCCCCCCCAAAAAAA\t\t1S7B+T".to_owned(),
];
let tempdir = TempDir::new().unwrap();
let f1 = write_metadata(&tempdir, &lines);
let globals = vec![make_rs("3M7B+T"), make_rs("7B+T")];
let group = SampleGroup::from_file(&f1, &globals).unwrap();
let s1_rs = group.samples[0].read_structures.as_ref().unwrap();
assert_eq!(s1_rs.len(), 2);
assert_eq!(s1_rs[0].to_string(), "3M7B1S+T");
assert_eq!(s1_rs[1].to_string(), "7B+T");
let s2_rs = group.samples[1].read_structures.as_ref().unwrap();
assert_eq!(s2_rs[0].to_string(), "3M7B+T");
assert_eq!(s2_rs[1].to_string(), "1S7B+T");
}
// A row whose read-structure cells are all blank gets `None`.
#[test]
fn test_per_cell_all_blank_row_falls_back_to_globals_entirely() {
let lines = vec![
"sample_id\tbarcode\tread_structure_1\tread_structure_2".to_owned(),
"S1\tGATTACAGGGGGGG\t3M7B1S+T\t3M7B1S+T".to_owned(),
"S2\tCCCCCCCAAAAAAA\t\t".to_owned(),
];
let tempdir = TempDir::new().unwrap();
let f1 = write_metadata(&tempdir, &lines);
let globals = vec![make_rs("3M7B+T"), make_rs("3M7B+T")];
let group = SampleGroup::from_file(&f1, &globals).unwrap();
assert!(group.samples[0].read_structures.is_some());
assert!(group.samples[1].read_structures.is_none());
}
// The number of `read_structure_<n>` columns must equal the number of globals.
#[test]
fn test_per_sample_column_count_must_match_globals() {
let lines = vec![
"sample_id\tbarcode\tread_structure_1".to_owned(),
"S1\tGATTACA\t3M7B1S+T".to_owned(),
];
let tempdir = TempDir::new().unwrap();
let f1 = write_metadata(&tempdir, &lines);
let globals = vec![make_rs("3M7B+T"), make_rs("100T")];
let err = SampleGroup::from_file(&f1, &globals).unwrap_err();
let msg = format!("{err:#}");
assert!(
msg.contains("`read_structure_<n>` column(s)") && msg.contains("--read-structures"),
"got: {msg}",
);
}
// A variable-length `+B` segment in a per-sample read structure is rejected.
#[test]
fn test_per_sample_variable_length_b_segment_errors() {
let lines =
vec!["sample_id\tbarcode\tread_structure_1".to_owned(), "S1\tGATTACA\t3M+B".to_owned()];
let tempdir = TempDir::new().unwrap();
let f1 = write_metadata(&tempdir, &lines);
let globals = vec![make_rs("3M7B+T")];
let err = SampleGroup::from_file(&f1, &globals).unwrap_err();
let msg = format!("{err:#}");
assert!(msg.contains("must be fixed length"), "got: {msg}");
}
// A UTF-8 BOM in front of the header does not break column lookup.
#[test]
fn test_header_with_utf8_bom_is_handled() {
let lines = vec![
format!("\u{FEFF}{}", Sample::deserialize_header_line()),
"sample1\tGATTACA".to_owned(),
];
let tempdir = TempDir::new().unwrap();
let f1 = write_metadata(&tempdir, &lines);
let group = SampleGroup::from_file(&f1, &[]).unwrap();
assert_eq!(group.samples[0].sample_id, "sample1");
assert_eq!(group.samples[0].barcode, "GATTACA");
}
// Trailing `\r` on the header and on data rows is stripped.
#[test]
fn test_rows_with_crlf_endings_are_handled() {
let header = format!("{}\r", Sample::deserialize_header_line());
let lines = vec![header, "sample1\tGATTACA\r".to_owned(), "sample2\tCATGCTA\r".to_owned()];
let tempdir = TempDir::new().unwrap();
let f1 = write_metadata(&tempdir, &lines);
let group = SampleGroup::from_file(&f1, &[]).unwrap();
assert_eq!(group.samples[0].barcode, "GATTACA");
assert_eq!(group.samples[1].barcode, "CATGCTA");
}
// A sample with two read structures against a single default is rejected.
#[test]
fn test_matching_prefix_lens_errors_on_rs_count_mismatch() {
let s1 = sample_with_rs("S1", "GATTACAGGGGGGG", &["3M7B1S+T", "7B+T"]);
let group = SampleGroup::from_samples(&[s1]).unwrap();
let defaults = vec![make_rs("3M7B+T")];
let err = group.matching_prefix_lens(&defaults).unwrap_err();
let msg = format!("{err:#}");
assert!(
msg.contains("number of read structures") && msg.contains("number of inputs"),
"got: {msg}",
);
}
// `read_structure_1` followed by `read_structure_3` leaves a gap and is rejected.
#[test]
fn test_per_sample_columns_must_be_contiguous() {
let lines = vec![
"sample_id\tbarcode\tread_structure_1\tread_structure_3".to_owned(),
"S1\tGATTACA\t3M7B1S+T\t1S7B+T".to_owned(),
];
let tempdir = TempDir::new().unwrap();
let f1 = write_metadata(&tempdir, &lines);
let globals = vec![make_rs("3M7B+T"), make_rs("7B+T")];
let err = SampleGroup::from_file(&f1, &globals).unwrap_err();
let msg = format!("{err:#}");
assert!(msg.contains("contiguous"), "got: {msg}");
}
// A non-numeric suffix after `read_structure_` is rejected.
#[test]
fn test_per_sample_columns_must_have_integer_suffix() {
let lines = vec![
"sample_id\tbarcode\tread_structure_abc".to_owned(),
"S1\tGATTACA\t3M7B1S+T".to_owned(),
];
let tempdir = TempDir::new().unwrap();
let f1 = write_metadata(&tempdir, &lines);
let globals = vec![make_rs("3M7B+T")];
let err = SampleGroup::from_file(&f1, &globals).unwrap_err();
let msg = format!("{err:#}");
assert!(msg.contains("non-integer suffix"), "got: {msg}");
}
// `read_structure_0` is rejected because the columns are 1-based.
#[test]
fn test_per_sample_columns_must_be_one_indexed() {
let lines = vec![
"sample_id\tbarcode\tread_structure_0".to_owned(),
"S1\tGATTACA\t3M7B1S+T".to_owned(),
];
let tempdir = TempDir::new().unwrap();
let f1 = write_metadata(&tempdir, &lines);
let globals = vec![make_rs("3M7B+T")];
let err = SampleGroup::from_file(&f1, &globals).unwrap_err();
let msg = format!("{err:#}");
assert!(msg.contains("1-based indexing"), "got: {msg}");
}
}