1use crate::is_valid_iupac;
2
3use anyhow::{Context, Result, anyhow, bail, ensure};
4use fgoxide::io::Io;
5use itertools::Itertools;
6use read_structure::{ReadStructure, SegmentType};
7use std::fmt::{self, Display};
8use std::path::Path;
9use std::str::FromStr;
10
/// Field delimiter used to split the header and data rows of the sample metadata file.
const DEFAULT_FILE_DELIMETER: u8 = b'\t';
/// Name of the header column that holds each sample's identifier.
const SAMPLE_ID_HEADER: &str = "sample_id";
/// Name of the header column that holds each sample's barcode.
const BARCODE_HEADER: &str = "barcode";
/// Prefix of the optional per-sample read structure columns; the remainder of the column name
/// is parsed as a 1-based integer.
const READ_STRUCTURE_PREFIX: &str = "read_structure_";
15
/// A single sample: an identifier, its expected barcode, and optional per-sample read
/// structures.
#[derive(Clone, Debug, PartialEq)]
pub struct Sample {
    /// Identifier of the sample (the `sample_id` column of the metadata file).
    pub sample_id: String,
    /// Expected barcode bases for the sample (the `barcode` column of the metadata file).
    pub barcode: String,
    /// Per-sample read structures, or `None` when the sample has none of its own and the
    /// caller-supplied default read structures are used instead.
    pub read_structures: Option<Vec<ReadStructure>>,
    /// Zero-based index of the sample; `SampleGroup::from_samples` reassigns it to the sample's
    /// position in the group.
    pub(crate) ordinal: usize,
}
31
32impl Display for Sample {
33 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
37 write!(
38 f,
39 "Sample({:04}) - {{ name: {}\tbarcode: {} }}",
40 self.ordinal, self.sample_id, self.barcode
41 )
42 }
43}
44
45impl Sample {
46 #[must_use]
54 pub fn new(ordinal: usize, name: String, barcode: String) -> Self {
55 Self::with_read_structures(ordinal, name, barcode, None)
56 }
57
58 #[must_use]
60 pub fn with_read_structures(
61 ordinal: usize,
62 name: String,
63 barcode: String,
64 read_structures: Option<Vec<ReadStructure>>,
65 ) -> Self {
66 assert!(!name.is_empty(), "Sample name cannot be empty");
67 assert!(!barcode.is_empty(), "Sample barcode cannot be empty");
68 assert!(
69 barcode.as_bytes().iter().all(|&b| is_valid_iupac(b)),
70 "All sample barcode bases must be one of A, C, G, T, U, R, Y, S, W, K, M, D, V, H, B, N"
71 );
72 Self { sample_id: name, barcode, read_structures, ordinal }
73 }
74
75 #[must_use]
78 pub fn deserialize_header_line() -> String {
79 format!("{SAMPLE_ID_HEADER}\t{BARCODE_HEADER}")
80 }
81}
82
/// A collection of samples. Groups built through `SampleGroup::from_samples` (and therefore
/// `SampleGroup::from_file`) have passed that constructor's validation; the field is public, so
/// a group can also be assembled directly without it.
#[derive(Clone, Debug, PartialEq)]
pub struct SampleGroup {
    /// The samples in the group; `from_samples` sets each sample's ordinal to its index here.
    pub samples: Vec<Sample>,
}
90
91impl Display for SampleGroup {
92 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
93 writeln!(f, "SampleGroup {{")?;
94 for sample in &self.samples {
95 writeln!(f, " {sample}")?;
96 }
97 writeln!(f, "}}")
98 }
99}
100
101impl SampleGroup {
102 pub fn from_samples(samples: &[Sample]) -> Result<Self> {
120 ensure!(!samples.is_empty(), "Must provide one or more sample");
121
122 ensure!(
123 samples.iter().map(|s| &s.sample_id).all_unique(),
124 "Each sample name must be unique, duplicate identified"
125 );
126
127 ensure!(
128 samples.iter().map(|s| &s.barcode).all_unique(),
129 "Each sample barcode must be unique, duplicate identified",
130 );
131
132 let first_barcode_length = samples[0].barcode.len();
133 ensure!(
134 samples.iter().map(|s| &s.barcode).all(|b| b.len() == first_barcode_length),
135 "All barcodes must have the same length",
136 );
137
138 for sample in samples {
141 let Some(rs) = sample.read_structures.as_ref() else { continue };
142 let mut b_len: usize = 0;
143 for seg in rs.iter().flat_map(|r| r.segments_by_type(SegmentType::SampleBarcode)) {
144 let len = seg.length.ok_or_else(|| {
145 anyhow!(
146 "Sample {}: sample-barcode (B) segments in per-sample read structures \
147 must be fixed length",
148 sample.sample_id,
149 )
150 })?;
151 b_len += len;
152 }
153 ensure!(
154 b_len == sample.barcode.len(),
155 "Sample {}: total sample-barcode (B) length across per-sample read structures \
156 is {} but barcode column has {} bases",
157 sample.sample_id,
158 b_len,
159 sample.barcode.len(),
160 );
161 }
162
163 Ok(Self {
164 samples: samples
165 .iter()
166 .enumerate()
167 .map(|(ordinal, sample)| {
168 Sample::with_read_structures(
169 ordinal,
170 sample.sample_id.clone(),
171 sample.barcode.clone(),
172 sample.read_structures.clone(),
173 )
174 })
175 .collect(),
176 })
177 }
178
179 pub fn from_file<P: AsRef<Path>>(path: P, globals: &[ReadStructure]) -> Result<SampleGroup> {
203 let path = path.as_ref();
204 let io = Io::default();
205 let lines = io
206 .read_lines(path)
207 .with_context(|| format!("failed to read sample metadata file {path:?}"))?;
208 let mut iter = lines.into_iter().filter(|l| !l.trim().is_empty());
209
210 let header = iter.next().ok_or_else(|| {
211 anyhow!(
212 "sample metadata file {path:?} is empty (expected header line {})",
213 Sample::deserialize_header_line(),
214 )
215 })?;
216 let header = strip_bom_and_cr(&header);
219 let header_fields: Vec<&str> = header.split(DEFAULT_FILE_DELIMETER as char).collect();
220
221 let sample_id_idx =
222 header_fields.iter().position(|c| *c == SAMPLE_ID_HEADER).ok_or_else(|| {
223 anyhow!("sample metadata header is missing column `{SAMPLE_ID_HEADER}`")
224 })?;
225 let barcode_idx =
226 header_fields.iter().position(|c| *c == BARCODE_HEADER).ok_or_else(|| {
227 anyhow!("sample metadata header is missing column `{BARCODE_HEADER}`")
228 })?;
229
230 let mut rs_columns: Vec<(usize, usize)> = Vec::new(); for (idx, name) in header_fields.iter().enumerate() {
234 if let Some(suffix) = name.strip_prefix(READ_STRUCTURE_PREFIX) {
235 let n: usize = suffix
236 .parse()
237 .with_context(|| format!("metadata column `{name}` has non-integer suffix"))?;
238 ensure!(n >= 1, "metadata column `{name}` must use 1-based indexing");
239 rs_columns.push((n, idx));
240 }
241 }
242 rs_columns.sort_by_key(|(n, _)| *n);
243 for (i, (n, _)) in rs_columns.iter().enumerate() {
244 ensure!(
245 *n == i + 1,
246 "per-sample read structure columns must be contiguous starting at \
247 `{READ_STRUCTURE_PREFIX}1` (found `{READ_STRUCTURE_PREFIX}{n}` at position {})",
248 i + 1,
249 );
250 }
251 if !rs_columns.is_empty() {
252 ensure!(
253 rs_columns.len() == globals.len(),
254 "metadata file has {} `{READ_STRUCTURE_PREFIX}<n>` column(s) but \
255 `--read-structures` has {} entry/entries",
256 rs_columns.len(),
257 globals.len(),
258 );
259 }
260
261 let mut samples: Vec<Sample> = Vec::new();
262 for (line_no, line) in iter.enumerate() {
263 let row_no = line_no + 2; let line = line.trim_end_matches('\r');
265 let cols: Vec<&str> = line.split(DEFAULT_FILE_DELIMETER as char).collect();
266 ensure!(
267 cols.len() == header_fields.len(),
268 "sample metadata row {row_no} has {} columns but header has {}",
269 cols.len(),
270 header_fields.len(),
271 );
272 let sample_id = cols[sample_id_idx].to_owned();
273 let barcode = cols[barcode_idx].to_owned();
274 let read_structures =
275 parse_per_sample_read_structures(row_no, &cols, &rs_columns, globals)?;
276 samples.push(Sample::with_read_structures(
277 samples.len(),
278 sample_id,
279 barcode,
280 read_structures,
281 ));
282 }
283
284 if samples.is_empty() {
285 bail!("sample metadata file {path:?} contained no sample rows");
286 }
287 Self::from_samples(&samples)
288 }
289
290 #[must_use]
293 pub fn has_per_sample_read_structures(&self) -> bool {
294 self.samples.iter().any(|s| s.read_structures.is_some())
295 }
296
297 pub fn matching_prefix_lens(&self, default_structures: &[ReadStructure]) -> Result<Vec<usize>> {
309 let n = default_structures.len();
310 let mut maxes = vec![0usize; n];
311 for sample in &self.samples {
312 let rs_for_sample = sample.read_structures.as_deref().unwrap_or(default_structures);
313 ensure!(
314 rs_for_sample.len() == n,
315 "sample {}: number of read structures ({}) does not match number of inputs ({})",
316 sample.sample_id,
317 rs_for_sample.len(),
318 n,
319 );
320 for (i, rs) in rs_for_sample.iter().enumerate() {
321 let plen = pre_template_fixed_len(rs).with_context(|| {
322 let source = if sample.read_structures.is_some() {
323 format!("sample {}'s `read_structure_{}`", sample.sample_id, i + 1)
324 } else {
325 format!(
326 "the `--read-structures` fallback for sample {} (input {})",
327 sample.sample_id,
328 i + 1,
329 )
330 };
331 format!(
332 "per-sample demultiplexing requires a fixed-length matching window, so \
333 every segment before the template in {source} must have a fixed length"
334 )
335 })?;
336 if plen > maxes[i] {
337 maxes[i] = plen;
338 }
339 }
340 }
341 Ok(maxes)
342 }
343
    /// Builds one matching pattern per sample, in sample order.
    ///
    /// A pattern is the concatenation of one window per input; window `i` is exactly
    /// `prefix_lens[i]` bytes long. Within a window, the pre-template segments of the sample's
    /// read structure are laid out in order: sample-barcode segments are filled with the next
    /// bases of the sample's barcode, every other position is `N`, and any remainder of the
    /// window is padded with `N`. Samples without their own read structures use
    /// `default_structures`.
    ///
    /// # Errors
    /// Returns an error if `default_structures` and `prefix_lens` differ in length, if a
    /// sample's number of read structures differs from the number of prefix lengths, if a
    /// sample-barcode segment appears after a template segment, if the declared sample-barcode
    /// length differs from the barcode length, if a pre-template segment has no fixed length,
    /// if a sample-barcode segment does not fit inside its window, or if not all barcode bases
    /// are consumed.
    pub fn build_matching_patterns(
        &self,
        default_structures: &[ReadStructure],
        prefix_lens: &[usize],
    ) -> Result<Vec<Vec<u8>>> {
        ensure!(
            default_structures.len() == prefix_lens.len(),
            "expected one prefix length per input FASTQ"
        );
        let total_len: usize = prefix_lens.iter().sum();
        let mut patterns = Vec::with_capacity(self.samples.len());
        for sample in &self.samples {
            let rs_for_sample = sample.read_structures.as_deref().unwrap_or(default_structures);
            ensure!(
                rs_for_sample.len() == prefix_lens.len(),
                "sample {}: number of read structures ({}) does not match number of inputs ({})",
                sample.sample_id,
                rs_for_sample.len(),
                prefix_lens.len(),
            );

            // Only pre-template segments are laid out below, so a sample-barcode segment after
            // the template would never be matched; reject it up front.
            for (input_idx, rs) in rs_for_sample.iter().enumerate() {
                let total_b = rs.segments_by_type(SegmentType::SampleBarcode).count();
                let b_before_template = rs
                    .iter()
                    .take_while(|seg| seg.kind != SegmentType::Template)
                    .filter(|seg| seg.kind == SegmentType::SampleBarcode)
                    .count();
                ensure!(
                    total_b == b_before_template,
                    "sample {}: all sample-barcode (B) segments must precede the template (T) in \
                     read structure {} (input {})",
                    sample.sample_id,
                    rs,
                    input_idx + 1,
                );
            }

            // Variable-length sample-barcode segments count as 0 here; they are rejected by the
            // fixed-length check inside the layout loop below.
            let expected_barcode_len: usize = rs_for_sample
                .iter()
                .flat_map(|rs| rs.segments_by_type(SegmentType::SampleBarcode))
                .map(|seg| seg.length.unwrap_or(0))
                .sum();
            ensure!(
                expected_barcode_len == sample.barcode.len(),
                "sample {}: {}read structure(s) declare {} sample-barcode (B) base(s) but the \
                 barcode column has {} base(s)",
                sample.sample_id,
                if sample.read_structures.is_none() {
                    "(using --read-structures fallback) "
                } else {
                    ""
                },
                expected_barcode_len,
                sample.barcode.len(),
            );

            let mut pattern = Vec::with_capacity(total_len);
            // Index of the next unused barcode base; it carries over from one input to the next.
            let mut barcode_cursor = 0usize;
            let barcode_bytes = sample.barcode.as_bytes();
            for (rs, &prefix_len) in rs_for_sample.iter().zip(prefix_lens) {
                // Number of bytes written so far into this input's window.
                let mut filled = 0usize;
                for seg in rs.iter() {
                    if seg.kind == SegmentType::Template {
                        break;
                    }
                    let len = seg.length.ok_or_else(|| {
                        anyhow!(
                            "sample {}: non-template segment {seg} in read structure must have a \
                             fixed length",
                            sample.sample_id,
                        )
                    })?;
                    let remaining = prefix_len - filled;
                    // Clamp the segment to what is left of the window.
                    let take = len.min(remaining);
                    if seg.kind == SegmentType::SampleBarcode {
                        // A sample-barcode segment must fit entirely inside the window.
                        ensure!(
                            take == len,
                            "sample {}: sample-barcode segment {seg} crosses the matching \
                             window boundary (segment length {}, but only {} bases remain in \
                             the {}-base window)",
                            sample.sample_id,
                            len,
                            remaining,
                            prefix_len,
                        );
                        ensure!(
                            barcode_cursor + len <= barcode_bytes.len(),
                            "sample {}: barcode ({} bases) is shorter than the total \
                             sample-barcode length required by its read structure",
                            sample.sample_id,
                            barcode_bytes.len(),
                        );
                        pattern.extend_from_slice(
                            &barcode_bytes[barcode_cursor..barcode_cursor + len],
                        );
                        barcode_cursor += len;
                    } else {
                        pattern.extend(std::iter::repeat_n(b'N', take));
                    }
                    filled += take;
                    // The segment was truncated, so the window is full.
                    if take < len {
                        break;
                    }
                }
                // Pad the rest of the window so every sample's window has the same length.
                if filled < prefix_len {
                    pattern.extend(std::iter::repeat_n(b'N', prefix_len - filled));
                }
            }
            ensure!(
                barcode_cursor == sample.barcode.len(),
                "sample {}: only consumed {} of {} barcode bases when building matching pattern",
                sample.sample_id,
                barcode_cursor,
                sample.barcode.len(),
            );
            patterns.push(pattern);
        }
        Ok(patterns)
    }
484}
485
486fn parse_per_sample_read_structures(
494 row_no: usize,
495 cols: &[&str],
496 rs_columns: &[(usize, usize)],
497 globals: &[ReadStructure],
498) -> Result<Option<Vec<ReadStructure>>> {
499 if rs_columns.is_empty() {
500 return Ok(None);
501 }
502 let mut entries: Vec<Option<ReadStructure>> = Vec::with_capacity(rs_columns.len());
503 for (n, idx) in rs_columns {
504 let raw = cols[*idx].trim();
505 if raw.is_empty() {
506 entries.push(None);
507 } else {
508 let rs = ReadStructure::from_str(raw).with_context(|| {
509 format!(
510 "sample metadata row {row_no} column `{READ_STRUCTURE_PREFIX}{n}` has \
511 invalid read structure `{raw}`",
512 )
513 })?;
514 for seg in rs.segments_by_type(SegmentType::SampleBarcode) {
515 ensure!(
516 seg.length.is_some(),
517 "sample metadata row {row_no} column `{READ_STRUCTURE_PREFIX}{n}`: \
518 sample-barcode segment {seg} must be fixed length (variable-length `+B` \
519 is not supported in per-sample read structures)",
520 );
521 }
522 entries.push(Some(rs));
523 }
524 }
525 if entries.iter().all(Option::is_none) {
526 return Ok(None);
527 }
528 let resolved: Vec<ReadStructure> = entries
529 .into_iter()
530 .enumerate()
531 .map(|(i, e)| e.unwrap_or_else(|| globals[i].clone()))
532 .collect();
533 Ok(Some(resolved))
534}
535
/// Removes one leading UTF-8 byte-order mark (if present) and all trailing carriage returns.
fn strip_bom_and_cr(s: &str) -> &str {
    let without_bom = match s.strip_prefix('\u{FEFF}') {
        Some(rest) => rest,
        None => s,
    };
    without_bom.trim_end_matches('\r')
}
541
542fn pre_template_fixed_len(rs: &ReadStructure) -> Result<usize> {
548 let mut len = 0;
549 for seg in rs.iter() {
550 if seg.kind == SegmentType::Template {
551 return Ok(len);
552 }
553 len += seg.length.ok_or_else(|| {
554 anyhow!("non-template segment {seg} in read structure {rs} must have a fixed length")
555 })?;
556 }
557 Ok(len)
558}
559
/// Unit tests for `Sample`, `SampleGroup`, and the metadata-file parsing helpers.
#[cfg(test)]
mod tests {
    use super::*;
    use fgoxide::io::Io;
    use std::str::FromStr;
    use tempfile::TempDir;

    // ---- Reading metadata files ----

    /// A minimal two-column file yields the samples in file order with no per-sample read
    /// structures.
    #[test]
    fn test_reading_from_tsv_file() {
        let lines = vec![
            Sample::deserialize_header_line(),
            "sample1\tGATTACA".to_owned(),
            "sample2\tCATGCTA".to_owned(),
        ];
        let tempdir = TempDir::new().unwrap();
        let f1 = tempdir.path().join("sample_metadata.tsv");

        let io = Io::default();
        io.write_lines(&f1, &lines).unwrap();
        let samples_metadata = SampleGroup::from_file(&f1, &[]).unwrap();

        assert!(samples_metadata.samples[0].sample_id == "sample1");
        assert!(samples_metadata.samples[1].sample_id == "sample2");
        assert!(samples_metadata.samples[0].barcode == "GATTACA");
        assert!(samples_metadata.samples[1].barcode == "CATGCTA");
        assert!(!samples_metadata.has_per_sample_read_structures());
    }

    /// Trailing blank lines are ignored.
    #[test]
    fn test_reading_from_file_with_empty_lines_at_end() {
        let lines = vec![
            Sample::deserialize_header_line(),
            "sample1\tGATTACA".to_owned(),
            "sample2\tCATGCTA".to_owned(),
            String::new(),
            String::new(),
        ];
        let tempdir = TempDir::new().unwrap();
        let f1 = tempdir.path().join("sample_metadata.tsv");

        let io = Io::default();
        io.write_lines(&f1, &lines).unwrap();
        let samples_metadata = SampleGroup::from_file(&f1, &[]).unwrap();

        assert!(samples_metadata.samples[0].sample_id == "sample1");
        assert!(samples_metadata.samples[1].sample_id == "sample2");
        assert!(samples_metadata.samples[0].barcode == "GATTACA");
        assert!(samples_metadata.samples[1].barcode == "CATGCTA");
    }

    /// A barcode containing `N` is accepted by the constructor (the test passes if it does
    /// not panic).
    #[test]
    fn test_new_sample_non_agct_bases_in_barcode_allowed() {
        let name = "s_1_example_name".to_owned();
        let barcode = "GATTANN".to_owned();
        let ordinal = 0;
        let _sample = Sample::new(ordinal, name, barcode);
    }

    /// A comma-delimited file is rejected because the required header column is not found.
    #[test]
    fn test_tsv_file_delim_error() {
        let lines: Vec<String> = ["sample_id,barcode", "sample1,GATTACA", "sample2,CATGCTA"]
            .iter()
            .map(|&s| s.into())
            .collect();
        let tempdir = TempDir::new().unwrap();
        let f1 = tempdir.path().join("sample_metadata.tsv");

        let io = Io::default();
        io.write_lines(&f1, &lines).unwrap();
        let err = SampleGroup::from_file(&f1, &[]).unwrap_err();
        let msg = format!("{err:#}");
        assert!(msg.contains("missing column `sample_id`"), "got: {msg}");
    }

    /// Without a header the first data row is treated as the header, so the required column is
    /// reported missing.
    #[test]
    fn test_reading_from_file_with_no_header() {
        let lines = vec!["sample1\tGATTACA", "sample2\tCATGCTA"];
        let tempdir = TempDir::new().unwrap();
        let f1 = tempdir.path().join("sample_metadata.tsv");

        let io = Io::default();
        io.write_lines(&f1, &lines).unwrap();
        let err = SampleGroup::from_file(&f1, &[]).unwrap_err();
        let msg = format!("{err:#}");
        assert!(msg.contains("missing column `sample_id`"), "got: {msg}");
    }

    /// A file with a header but no sample rows is an error.
    #[test]
    fn test_reading_header_only_file() {
        let lines = vec![Sample::deserialize_header_line()];
        let tempdir = TempDir::new().unwrap();
        let f1 = tempdir.path().join("sample_metadata.tsv");

        let io = Io::default();
        io.write_lines(&f1, &lines).unwrap();
        let err = SampleGroup::from_file(&f1, &[]).unwrap_err();
        let msg = format!("{err:#}");
        assert!(msg.contains("contained no sample rows"), "got: {msg}");
    }

    /// A file containing only a blank line is an error.
    #[test]
    fn test_reading_empty_file() {
        let lines = vec![""];
        let tempdir = TempDir::new().unwrap();
        let f1 = tempdir.path().join("sample_metadata.tsv");

        let io = Io::default();
        io.write_lines(&f1, &lines).unwrap();
        let err = SampleGroup::from_file(&f1, &[]).unwrap_err();
        let msg = format!("{err:#}");
        assert!(msg.contains("is empty") || msg.contains("missing column"), "got: {msg}");
    }

    /// Duplicate barcodes in the file are rejected.
    #[test]
    fn test_reading_from_file_with_duplicate_barcodes_errors() {
        let lines = vec!["sample_id\tbarcode", "sample1\tGATTACA", "sample2\tGATTACA"];
        let tempdir = TempDir::new().unwrap();
        let f1 = tempdir.path().join("sample_metadata.tsv");

        let io = Io::default();
        io.write_lines(&f1, &lines).unwrap();
        let err = SampleGroup::from_file(&f1, &[]).unwrap_err();
        let msg = format!("{err:#}");
        assert!(msg.contains("Each sample barcode must be unique"), "got: {msg}");
    }

    /// A path that does not exist yields a read error that mentions the metadata file.
    #[test]
    fn test_reading_non_existent_file() {
        let tempdir = TempDir::new().unwrap();
        let f1 = tempdir.path().join("sample_metadata.tsv");
        let err = SampleGroup::from_file(&f1, &[]).unwrap_err();
        let msg = format!("{err:#}");
        assert!(msg.contains("failed to read sample metadata file"), "got: {msg}");
    }

    // ---- Sample::new ----

    /// `Sample::new` stores its arguments and leaves `read_structures` unset.
    #[test]
    fn test_new_sample_success() {
        let name = "s_1_example_name".to_owned();
        let barcode = "GATTACA".to_owned();
        let ordinal = 0;
        let sample = Sample::new(ordinal, name.clone(), barcode.clone());
        assert_eq!(
            Sample { sample_id: name, barcode, read_structures: None, ordinal },
            sample,
            "Sample differed from expectation"
        );
    }

    /// An empty sample name panics.
    #[test]
    #[should_panic(expected = "Sample name cannot be empty")]
    fn test_new_sample_fail1_empty_sample_name() {
        let name = String::new();
        let barcode = "GATTACA".to_owned();
        let ordinal = 0;
        let _sample = Sample::new(ordinal, name, barcode);
    }

    /// An empty barcode panics.
    #[test]
    #[should_panic(expected = "Sample barcode cannot be empty")]
    fn test_new_sample_fail2_empty_barcode() {
        let name = "s_1_example_name".to_owned();
        let barcode = String::new();
        let ordinal = 0;
        let _sample = Sample::new(ordinal, name, barcode);
    }

    // ---- SampleGroup::from_samples ----

    /// A single valid sample forms a group.
    #[test]
    fn test_from_samples_sample_group_pass1_single_sample() {
        let sample1 = Sample::new(0, "sample_1".to_owned(), "GATTACA".to_owned());
        let samples_vec = vec![sample1.clone()];
        let sample_group = SampleGroup::from_samples(&samples_vec).unwrap();

        assert_eq!(sample_group, SampleGroup { samples: vec![sample1] });
    }

    /// Several samples with unique names and barcodes form a group.
    #[test]
    fn test_from_samples_sample_group_pass2_multi_unique_samples() {
        let sample1 = Sample::new(0, "sample_1".to_owned(), "GATTACA".to_owned());
        let sample2 = Sample::new(1, "sample_2".to_owned(), "CATGGAT".to_owned());
        let samples_vec = vec![sample1.clone(), sample2.clone()];
        let sample_group = SampleGroup::from_samples(&samples_vec).unwrap();

        assert_eq!(sample_group, SampleGroup { samples: vec![sample1, sample2] });
    }

    /// Ordinals are reassigned to each sample's position in the input slice.
    #[test]
    fn test_from_samples_sample_group_pass3_ordinal_values_will_be_changed_by_new() {
        let sample1 = Sample::new(0, "sample_1".to_owned(), "GATTACA".to_owned());
        let sample2_before = Sample::new(2, "sample_2".to_owned(), "CATGGAT".to_owned());
        let sample2_after = Sample::new(1, "sample_2".to_owned(), "CATGGAT".to_owned());
        let samples_vec = vec![sample1.clone(), sample2_before];
        let sample_group = SampleGroup::from_samples(&samples_vec).unwrap();

        assert_eq!(sample_group, SampleGroup { samples: vec![sample1, sample2_after] });
    }

    /// An empty slice is rejected.
    #[test]
    fn test_from_samples_sample_group_fail1_no_samples() {
        let samples = vec![];
        let err = SampleGroup::from_samples(&samples).unwrap_err();
        assert!(err.to_string().contains("Must provide one or more sample"), "got: {err:#}");
    }

    /// Duplicate sample names are rejected.
    #[test]
    fn test_from_samples_sample_group_fail2_duplicate_sample_names() {
        let samples = vec![
            Sample::new(0, "sample_1".to_owned(), "GATTACA".to_owned()),
            Sample::new(0, "sample_1".to_owned(), "CATGGAT".to_owned()),
        ];
        let err = SampleGroup::from_samples(&samples).unwrap_err();
        assert!(
            err.to_string().contains("Each sample name must be unique, duplicate identified"),
            "got: {err:#}",
        );
    }

    /// Duplicate barcodes are rejected.
    #[test]
    fn test_from_samples_sample_group_fail3_duplicate_barcodes() {
        let samples = vec![
            Sample::new(0, "sample_1".to_owned(), "GATTACA".to_owned()),
            Sample::new(0, "sample_2".to_owned(), "GATTACA".to_owned()),
        ];
        let err = SampleGroup::from_samples(&samples).unwrap_err();
        assert!(
            err.to_string().contains("Each sample barcode must be unique, duplicate identified"),
            "got: {err:#}",
        );
    }

    /// Barcodes of differing lengths are rejected.
    #[test]
    fn test_from_samples_sample_group_fail4_barcodes_of_different_lengths() {
        let samples = vec![
            Sample::new(0, "sample_1".to_owned(), "GATTACA".to_owned()),
            Sample::new(0, "sample_2".to_owned(), "CATGGA".to_owned()),
        ];
        let err = SampleGroup::from_samples(&samples).unwrap_err();
        assert!(err.to_string().contains("All barcodes must have the same length"), "got: {err:#}");
    }

    // ---- Helpers for the per-sample read structure tests ----

    /// Parses a read structure string, panicking if it is invalid.
    fn make_rs(s: &str) -> ReadStructure {
        ReadStructure::from_str(s).unwrap()
    }

    /// Builds a sample (ordinal 0) carrying the given per-sample read structures.
    fn sample_with_rs(name: &str, barcode: &str, structures: &[&str]) -> Sample {
        let rs = structures.iter().map(|s| make_rs(s)).collect();
        Sample::with_read_structures(0, name.to_owned(), barcode.to_owned(), Some(rs))
    }

    /// Writes `lines` to `metadata.tsv` inside `tempdir` and returns the file's path.
    fn write_metadata(tempdir: &TempDir, lines: &[String]) -> std::path::PathBuf {
        let f1 = tempdir.path().join("metadata.tsv");
        Io::default().write_lines(&f1, lines).unwrap();
        f1
    }

    // ---- Per-sample read structures ----

    /// `read_structure_<n>` columns are parsed into per-sample read structures.
    #[test]
    fn test_per_sample_read_structures_round_trip_via_metadata() {
        let lines = vec![
            "sample_id\tbarcode\tread_structure_1\tread_structure_2".to_owned(),
            "S1\tGATTACAACGTACG\t3M7B1S+T\t3M7B1S+T".to_owned(),
            "S2\tGGGGGGGTTTTTTT\t3M1S7B1S+T\t3M1S7B1S+T".to_owned(),
        ];
        let tempdir = TempDir::new().unwrap();
        let f1 = write_metadata(&tempdir, &lines);
        let globals = vec![make_rs("3M9B+T"), make_rs("9B+T")];
        let group = SampleGroup::from_file(&f1, &globals).unwrap();
        assert!(group.has_per_sample_read_structures());
        let s1_rs = group.samples[0].read_structures.as_ref().unwrap();
        assert_eq!(s1_rs.len(), 2);
        let s2_rs = group.samples[1].read_structures.as_ref().unwrap();
        assert_eq!(s2_rs.len(), 2);
    }

    /// Samples may split the same total barcode length across different segment layouts.
    #[test]
    fn test_per_sample_read_structures_signatures_may_differ_across_samples() {
        let s1 = sample_with_rs("S1", "GATTACAACGTACG", &["3M14B+T"]);
        let s2 = sample_with_rs("S2", "TTTTTTTGGGGGGG", &["3M7B7B+T"]);
        let group = SampleGroup::from_samples(&[s1, s2]).unwrap();
        assert!(group.has_per_sample_read_structures());
    }

    /// A group may mix samples with and without per-sample read structures.
    #[test]
    fn test_per_sample_read_structures_mixed_with_global_only_samples() {
        let s1 = sample_with_rs("S1", "GATTACA", &["3M7B1S+T"]);
        let s2 = Sample::new(0, "S2".to_owned(), "CCCCCCC".to_owned());
        let group = SampleGroup::from_samples(&[s1, s2]).unwrap();
        assert!(group.has_per_sample_read_structures());
        assert!(group.samples[0].read_structures.is_some());
        assert!(group.samples[1].read_structures.is_none());
    }

    /// Two 7-base B segments (14 bases) do not match a 7-base barcode.
    #[test]
    fn test_per_sample_read_structures_barcode_length_mismatch() {
        let s1 = sample_with_rs("S1", "GATTACA", &["3M7B1S+T", "3M7B1S+T"]);
        let err = SampleGroup::from_samples(&[s1]).unwrap_err();
        assert!(err.to_string().contains("barcode column has"), "got: {err:#}");
    }

    // ---- matching_prefix_lens / build_matching_patterns ----

    /// The window length is the longest pre-template prefix: 11 for S1 and 12 for S2.
    #[test]
    fn test_matching_prefix_lens_uses_max_across_samples() {
        let s1 = sample_with_rs("S1", "GATTACA", &["3M7B1S+T"]);
        let s2 = sample_with_rs("S2", "GGGGGGG", &["3M1S7B1S+T"]);
        let group = SampleGroup::from_samples(&[s1, s2]).unwrap();
        let defaults = vec![make_rs("3M9B+T")];
        let lens = group.matching_prefix_lens(&defaults).unwrap();
        assert_eq!(lens, vec![12]);
    }

    /// Barcode bases land at each sample's B offset; every other window position is `N`.
    #[test]
    fn test_build_matching_patterns_codec_two_samples() {
        let s1 = sample_with_rs("S1", "GATTACA", &["3M7B1S+T"]);
        let s2 = sample_with_rs("S2", "GGGGGGG", &["3M1S7B1S+T"]);
        let group = SampleGroup::from_samples(&[s1, s2]).unwrap();
        let defaults = vec![make_rs("3M9B+T")];
        let lens = group.matching_prefix_lens(&defaults).unwrap();
        let patterns = group.build_matching_patterns(&defaults, &lens).unwrap();
        // S1: 3M -> NNN, 7B -> GATTACA, 1S -> N, plus one N of padding to reach 12.
        assert_eq!(patterns[0], b"NNNGATTACANN");
        // S2: 3M -> NNN, 1S -> N, 7B -> GGGGGGG, 1S -> N.
        assert_eq!(patterns[1], b"NNNNGGGGGGGN");
    }

    /// Samples without per-sample read structures are laid out using the defaults.
    #[test]
    fn test_build_matching_patterns_falls_back_to_defaults_when_no_per_sample() {
        let s1 = Sample::new(0, "S1".to_owned(), "GATTACA".to_owned());
        let group = SampleGroup::from_samples(&[s1]).unwrap();
        let defaults = vec![make_rs("3M7B1S+T")];
        let lens = group.matching_prefix_lens(&defaults).unwrap();
        assert_eq!(lens, vec![11]);
        let patterns = group.build_matching_patterns(&defaults, &lens).unwrap();
        assert_eq!(patterns[0], b"NNNGATTACAN");
    }

    /// With two inputs, the per-input windows are concatenated and barcode bases are consumed
    /// in input order.
    #[test]
    fn test_build_matching_patterns_dual_input_concatenated() {
        let s1 = sample_with_rs("S1", "GATTACAACGTACG", &["3M7B1S+T", "7B+T"]);
        let s2 = sample_with_rs("S2", "GGGGGGGTTTTTTT", &["3M1S7B1S+T", "1S7B+T"]);
        let group = SampleGroup::from_samples(&[s1, s2]).unwrap();
        let defaults = vec![make_rs("3M9B+T"), make_rs("9B+T")];
        let lens = group.matching_prefix_lens(&defaults).unwrap();
        assert_eq!(lens, vec![12, 8]);
        let patterns = group.build_matching_patterns(&defaults, &lens).unwrap();
        let mut expected_s1 = b"NNNGATTACANN".to_vec();
        expected_s1.extend_from_slice(b"ACGTACGN");
        assert_eq!(patterns[0], expected_s1);
        let mut expected_s2 = b"NNNNGGGGGGGN".to_vec();
        expected_s2.extend_from_slice(b"NTTTTTTT");
        assert_eq!(patterns[1], expected_s2);
    }

    /// A B segment that does not fit inside the supplied window is an error.
    #[test]
    fn test_build_matching_patterns_b_segment_crossing_window_errors() {
        let s1 = sample_with_rs("S1", "GATTACA", &["3M7B1S+T"]);
        let group = SampleGroup::from_samples(&[s1]).unwrap();
        let defaults = vec![make_rs("3M7B1S+T")];
        // A 5-base window ends in the middle of the 7B segment.
        let lens = vec![5usize];
        let err = group.build_matching_patterns(&defaults, &lens).unwrap_err();
        let msg = format!("{err:#}");
        assert!(msg.contains("crosses the matching window boundary"), "got: {msg}");
    }

    /// A barcode shorter than the declared B length is rejected. The group is built directly
    /// so that `from_samples` validation does not catch the mismatch first.
    #[test]
    fn test_build_matching_patterns_barcode_shorter_than_b_segments_errors() {
        let s1 = sample_with_rs("S1", "GAT", &["3M7B1S+T"]);
        let group = SampleGroup { samples: vec![s1] };
        let defaults = vec![make_rs("3M7B1S+T")];
        let lens = vec![11usize];
        let err = group.build_matching_patterns(&defaults, &lens).unwrap_err();
        let msg = format!("{err:#}");
        assert!(
            msg.contains("declare 7 sample-barcode (B) base(s) but the barcode column has 3"),
            "got: {msg}",
        );
    }

    /// A B segment after the template is rejected.
    #[test]
    fn test_build_matching_patterns_barcode_after_template_errors() {
        let s1 = sample_with_rs("S1", "ACGTACGTACGT", &["4B10T8B"]);
        let group = SampleGroup::from_samples(&[s1]).unwrap();
        let defaults = vec![make_rs("4B10T8B")];
        let lens = group.matching_prefix_lens(&defaults).unwrap();
        let err = group.build_matching_patterns(&defaults, &lens).unwrap_err();
        let msg = format!("{err:#}");
        assert!(msg.contains("must precede the template"), "got: {msg}");
    }

    /// A sample that falls back to the defaults must have a barcode matching the defaults' B
    /// length; the error names the fallback and the sample.
    #[test]
    fn test_build_matching_patterns_global_only_barcode_mismatch_errors() {
        let s1 = sample_with_rs("S1", "GATTAC", &["3M6B1S+T"]);
        let s2 = Sample::new(0, "S2".to_owned(), "CCCCCC".to_owned());
        let group = SampleGroup::from_samples(&[s1, s2]).unwrap();
        let defaults = vec![make_rs("3M7B1S+T")];
        let lens = group.matching_prefix_lens(&defaults).unwrap();
        let err = group.build_matching_patterns(&defaults, &lens).unwrap_err();
        let msg = format!("{err:#}");
        assert!(msg.contains("--read-structures fallback") && msg.contains("S2"), "got: {msg}",);
    }

    /// A variable-length non-template segment makes the window length undefined.
    #[test]
    fn test_matching_prefix_lens_variable_pre_template_errors() {
        let s1 = sample_with_rs("S1", "ACGTACGT", &["8B+M"]);
        let group = SampleGroup::from_samples(&[s1]).unwrap();
        let defaults = vec![make_rs("8B+M")];
        let err = group.matching_prefix_lens(&defaults).unwrap_err();
        let msg = format!("{err:#}");
        assert!(msg.contains("fixed-length matching window"), "got: {msg}");
    }

    // ---- Per-cell fallback and header validation ----

    /// A blank read-structure cell is filled from the global read structure at that position.
    #[test]
    fn test_per_cell_fallback_uses_globals_for_blank_cells() {
        let lines = vec![
            "sample_id\tbarcode\tread_structure_1\tread_structure_2".to_owned(),
            "S1\tGATTACAGGGGGGG\t3M7B1S+T\t".to_owned(),
            "S2\tCCCCCCCAAAAAAA\t\t1S7B+T".to_owned(),
        ];
        let tempdir = TempDir::new().unwrap();
        let f1 = write_metadata(&tempdir, &lines);
        let globals = vec![make_rs("3M7B+T"), make_rs("7B+T")];
        let group = SampleGroup::from_file(&f1, &globals).unwrap();
        let s1_rs = group.samples[0].read_structures.as_ref().unwrap();
        assert_eq!(s1_rs.len(), 2);
        assert_eq!(s1_rs[0].to_string(), "3M7B1S+T");
        assert_eq!(s1_rs[1].to_string(), "7B+T");
        let s2_rs = group.samples[1].read_structures.as_ref().unwrap();
        assert_eq!(s2_rs[0].to_string(), "3M7B+T");
        assert_eq!(s2_rs[1].to_string(), "1S7B+T");
    }

    /// A row whose read-structure cells are all blank gets no per-sample read structures.
    #[test]
    fn test_per_cell_all_blank_row_falls_back_to_globals_entirely() {
        let lines = vec![
            "sample_id\tbarcode\tread_structure_1\tread_structure_2".to_owned(),
            "S1\tGATTACAGGGGGGG\t3M7B1S+T\t3M7B1S+T".to_owned(),
            "S2\tCCCCCCCAAAAAAA\t\t".to_owned(),
        ];
        let tempdir = TempDir::new().unwrap();
        let f1 = write_metadata(&tempdir, &lines);
        let globals = vec![make_rs("3M7B+T"), make_rs("3M7B+T")];
        let group = SampleGroup::from_file(&f1, &globals).unwrap();
        assert!(group.samples[0].read_structures.is_some());
        assert!(group.samples[1].read_structures.is_none());
    }

    /// The number of read-structure columns must equal the number of global read structures.
    #[test]
    fn test_per_sample_column_count_must_match_globals() {
        let lines = vec![
            "sample_id\tbarcode\tread_structure_1".to_owned(),
            "S1\tGATTACA\t3M7B1S+T".to_owned(),
        ];
        let tempdir = TempDir::new().unwrap();
        let f1 = write_metadata(&tempdir, &lines);
        let globals = vec![make_rs("3M7B+T"), make_rs("100T")];
        let err = SampleGroup::from_file(&f1, &globals).unwrap_err();
        let msg = format!("{err:#}");
        assert!(
            msg.contains("`read_structure_<n>` column(s)") && msg.contains("--read-structures"),
            "got: {msg}",
        );
    }

    /// A variable-length `+B` segment in a per-sample read structure is rejected.
    #[test]
    fn test_per_sample_variable_length_b_segment_errors() {
        let lines =
            vec!["sample_id\tbarcode\tread_structure_1".to_owned(), "S1\tGATTACA\t3M+B".to_owned()];
        let tempdir = TempDir::new().unwrap();
        let f1 = write_metadata(&tempdir, &lines);
        let globals = vec![make_rs("3M7B+T")];
        let err = SampleGroup::from_file(&f1, &globals).unwrap_err();
        let msg = format!("{err:#}");
        assert!(msg.contains("must be fixed length"), "got: {msg}");
    }

    /// A UTF-8 byte-order mark before the header is tolerated.
    #[test]
    fn test_header_with_utf8_bom_is_handled() {
        let lines = vec![
            format!("\u{FEFF}{}", Sample::deserialize_header_line()),
            "sample1\tGATTACA".to_owned(),
        ];
        let tempdir = TempDir::new().unwrap();
        let f1 = write_metadata(&tempdir, &lines);
        let group = SampleGroup::from_file(&f1, &[]).unwrap();
        assert_eq!(group.samples[0].sample_id, "sample1");
        assert_eq!(group.samples[0].barcode, "GATTACA");
    }

    /// Trailing carriage returns on the header and on data rows are stripped.
    #[test]
    fn test_rows_with_crlf_endings_are_handled() {
        let header = format!("{}\r", Sample::deserialize_header_line());
        let lines = vec![header, "sample1\tGATTACA\r".to_owned(), "sample2\tCATGCTA\r".to_owned()];
        let tempdir = TempDir::new().unwrap();
        let f1 = write_metadata(&tempdir, &lines);
        let group = SampleGroup::from_file(&f1, &[]).unwrap();
        assert_eq!(group.samples[0].barcode, "GATTACA");
        assert_eq!(group.samples[1].barcode, "CATGCTA");
    }

    /// A sample with more read structures than there are defaults is rejected.
    #[test]
    fn test_matching_prefix_lens_errors_on_rs_count_mismatch() {
        let s1 = sample_with_rs("S1", "GATTACAGGGGGGG", &["3M7B1S+T", "7B+T"]);
        let group = SampleGroup::from_samples(&[s1]).unwrap();
        let defaults = vec![make_rs("3M7B+T")];
        let err = group.matching_prefix_lens(&defaults).unwrap_err();
        let msg = format!("{err:#}");
        assert!(
            msg.contains("number of read structures") && msg.contains("number of inputs"),
            "got: {msg}",
        );
    }

    /// Read-structure column numbers must not skip (1 then 3 is rejected).
    #[test]
    fn test_per_sample_columns_must_be_contiguous() {
        let lines = vec![
            "sample_id\tbarcode\tread_structure_1\tread_structure_3".to_owned(),
            "S1\tGATTACA\t3M7B1S+T\t1S7B+T".to_owned(),
        ];
        let tempdir = TempDir::new().unwrap();
        let f1 = write_metadata(&tempdir, &lines);
        let globals = vec![make_rs("3M7B+T"), make_rs("7B+T")];
        let err = SampleGroup::from_file(&f1, &globals).unwrap_err();
        let msg = format!("{err:#}");
        assert!(msg.contains("contiguous"), "got: {msg}");
    }

    /// A read-structure column whose suffix is not an integer is rejected.
    #[test]
    fn test_per_sample_columns_must_have_integer_suffix() {
        let lines = vec![
            "sample_id\tbarcode\tread_structure_abc".to_owned(),
            "S1\tGATTACA\t3M7B1S+T".to_owned(),
        ];
        let tempdir = TempDir::new().unwrap();
        let f1 = write_metadata(&tempdir, &lines);
        let globals = vec![make_rs("3M7B+T")];
        let err = SampleGroup::from_file(&f1, &globals).unwrap_err();
        let msg = format!("{err:#}");
        assert!(msg.contains("non-integer suffix"), "got: {msg}");
    }

    /// A `read_structure_0` column is rejected because numbering is 1-based.
    #[test]
    fn test_per_sample_columns_must_be_one_indexed() {
        let lines = vec![
            "sample_id\tbarcode\tread_structure_0".to_owned(),
            "S1\tGATTACA\t3M7B1S+T".to_owned(),
        ];
        let tempdir = TempDir::new().unwrap();
        let f1 = write_metadata(&tempdir, &lines);
        let globals = vec![make_rs("3M7B+T")];
        let err = SampleGroup::from_file(&f1, &globals).unwrap_err();
        let msg = format!("{err:#}");
        assert!(msg.contains("1-based indexing"), "got: {msg}");
    }
}