Skip to main content

fqtk_lib/
samples.rs

1use crate::is_valid_iupac;
2
3use anyhow::{Context, Result, anyhow, bail, ensure};
4use fgoxide::io::Io;
5use itertools::Itertools;
6use read_structure::{ReadStructure, SegmentType};
7use std::fmt::{self, Display};
8use std::path::Path;
9use std::str::FromStr;
10
/// Byte that separates columns in the sample metadata file (a tab character).
const DEFAULT_FILE_DELIMETER: u8 = b'\t';
/// Header name of the required column holding each sample's identifier.
const SAMPLE_ID_HEADER: &str = "sample_id";
/// Header name of the required column holding each sample's barcode.
const BARCODE_HEADER: &str = "barcode";
/// Prefix shared by the optional per-sample read structure columns; the full column name is
/// this prefix followed by a 1-based integer (e.g. `read_structure_1`).
const READ_STRUCTURE_PREFIX: &str = "read_structure_";
15
16/// Struct for describing a single sample and metadata associated with that sample.
17#[derive(Clone, Debug, PartialEq)]
18pub struct Sample {
19    /// ID of the sample or library
20    pub sample_id: String,
21    /// DNA barcode associated with the sample
22    pub barcode: String,
23    /// Optional per-sample read structures (one per input FASTQ).  When present, these
24    /// override the global `--read-structures` for this sample, both for matching pattern
25    /// construction and for output extraction.
26    pub read_structures: Option<Vec<ReadStructure>>,
27    /// Index of the sample in the [`SampleGroup`] object, used for syncing indices across
28    /// different structs.
29    pub(crate) ordinal: usize,
30}
31
32impl Display for Sample {
33    /// Implements a nice format display for the [`Sample`] struct.
34    /// E.g. A sample with ordinal 2, name test-sample, and barcode GATTACA would look like:
35    /// Sample(0002) - { name: test-sample    barcode: GATTACA }
36    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
37        write!(
38            f,
39            "Sample({:04}) - {{ name: {}\tbarcode: {} }}",
40            self.ordinal, self.sample_id, self.barcode
41        )
42    }
43}
44
45impl Sample {
46    /// Validates inputs to generate a [`Self`] struct and instantiates the struct if they are
47    /// valid.
48    /// # Panics
49    ///   - Panics if sample name is empty string.
50    ///   - Panics if barcode is empty string.
51    ///   - Panics if barcode has bases other than A, C, G, T, U, R, Y, S, W, K, M, D, V, H, B, or
52    ///     N/n/.
53    #[must_use]
54    pub fn new(ordinal: usize, name: String, barcode: String) -> Self {
55        Self::with_read_structures(ordinal, name, barcode, None)
56    }
57
58    /// Like [`Sample::new`] but allows attaching per-sample read structures.
59    #[must_use]
60    pub fn with_read_structures(
61        ordinal: usize,
62        name: String,
63        barcode: String,
64        read_structures: Option<Vec<ReadStructure>>,
65    ) -> Self {
66        assert!(!name.is_empty(), "Sample name cannot be empty");
67        assert!(!barcode.is_empty(), "Sample barcode cannot be empty");
68        assert!(
69            barcode.as_bytes().iter().all(|&b| is_valid_iupac(b)),
70            "All sample barcode bases must be one of A, C, G, T, U, R, Y, S, W, K, M, D, V, H, B, N"
71        );
72        Self { sample_id: name, barcode, read_structures, ordinal }
73    }
74
75    /// Returns the header line expected by the metadata file deserializer.  Only the two required
76    /// columns are reported; per-sample `read_structure_<n>` columns are optional.
77    #[must_use]
78    pub fn deserialize_header_line() -> String {
79        format!("{SAMPLE_ID_HEADER}\t{BARCODE_HEADER}")
80    }
81}
82
83/// Struct for storing information about multiple samples and for defining functions associated
84/// with groups of [`Sample`]s, rather than individual structs.
85#[derive(Clone, Debug, PartialEq)]
86pub struct SampleGroup {
87    /// A group of samples
88    pub samples: Vec<Sample>,
89}
90
91impl Display for SampleGroup {
92    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
93        writeln!(f, "SampleGroup {{")?;
94        for sample in &self.samples {
95            writeln!(f, "    {sample}")?;
96        }
97        writeln!(f, "}}")
98    }
99}
100
101impl SampleGroup {
102    /// Validates a group of [`Sample`]s and instantiates a [`Self`] struct if they are valid. Will
103    /// clone the [`Sample`] structs and change the number on the `ordinal` field on those cloned
104    /// to match the order in which they are stored in this [`Self`].
105    ///
106    /// Per-sample read structures may differ in their per-input `(T, B, M, C)` segment counts
107    /// across samples (which lets each sample produce a different set of output files), but
108    /// the total length of all sample-barcode (`B`) segments per sample must equal the
109    /// `barcode` column for that sample (and barcodes are required to be the same length).
110    ///
111    /// # Errors
112    ///   - Will error if no samples are provided.
113    ///   - Will error if there are duplicate sample names provided.
114    ///   - Will error if there are duplicate barcodes provided.
115    ///   - Will error if barcodes don't all have the same length.
116    ///   - Will error if any sample's per-sample sample-barcode (`B`) segment is not fixed length.
117    ///   - Will error if any sample's per-sample sample-barcode (`B`) segment lengths don't
118    ///     sum to that sample's `barcode` column length.
119    pub fn from_samples(samples: &[Sample]) -> Result<Self> {
120        ensure!(!samples.is_empty(), "Must provide one or more sample");
121
122        ensure!(
123            samples.iter().map(|s| &s.sample_id).all_unique(),
124            "Each sample name must be unique, duplicate identified"
125        );
126
127        ensure!(
128            samples.iter().map(|s| &s.barcode).all_unique(),
129            "Each sample barcode must be unique, duplicate identified",
130        );
131
132        let first_barcode_length = samples[0].barcode.len();
133        ensure!(
134            samples.iter().map(|s| &s.barcode).all(|b| b.len() == first_barcode_length),
135            "All barcodes must have the same length",
136        );
137
138        // Per-sample read structures (when present) must have B-segments whose lengths sum to
139        // the sample's `barcode` column length.  Sample-barcode segments must be fixed-length.
140        for sample in samples {
141            let Some(rs) = sample.read_structures.as_ref() else { continue };
142            let mut b_len: usize = 0;
143            for seg in rs.iter().flat_map(|r| r.segments_by_type(SegmentType::SampleBarcode)) {
144                let len = seg.length.ok_or_else(|| {
145                    anyhow!(
146                        "Sample {}: sample-barcode (B) segments in per-sample read structures \
147                         must be fixed length",
148                        sample.sample_id,
149                    )
150                })?;
151                b_len += len;
152            }
153            ensure!(
154                b_len == sample.barcode.len(),
155                "Sample {}: total sample-barcode (B) length across per-sample read structures \
156                 is {} but barcode column has {} bases",
157                sample.sample_id,
158                b_len,
159                sample.barcode.len(),
160            );
161        }
162
163        Ok(Self {
164            samples: samples
165                .iter()
166                .enumerate()
167                .map(|(ordinal, sample)| {
168                    Sample::with_read_structures(
169                        ordinal,
170                        sample.sample_id.clone(),
171                        sample.barcode.clone(),
172                        sample.read_structures.clone(),
173                    )
174                })
175                .collect(),
176        })
177    }
178
179    /// Attempts to load a [`Self`] object from a tab-delimited file.  The file must have a header
180    /// row containing at least the columns `sample_id` and `barcode`.  Optional additional columns
181    /// `read_structure_1`, `read_structure_2`, ..., `read_structure_<N>` attach per-sample read
182    /// structures to each sample (one per input FASTQ in the order given to `--inputs`).  When
183    /// these columns are present:
184    ///
185    /// - `globals` (the `--read-structures` argument) must have exactly `N` entries, one per
186    ///   input FASTQ.
187    /// - A blank cell in `read_structure_<i>` falls back to `globals[i-1]` for that sample.
188    /// - A row whose `read_structure_<n>` cells are all blank uses `globals` entirely (i.e. is
189    ///   equivalent to omitting the per-sample columns for that sample only).
190    ///
191    /// When no `read_structure_<n>` columns are present, `globals` is unused and every sample
192    /// uses the global structures.
193    ///
194    /// # Errors
195    ///   - Will error if the file cannot be read.
196    ///   - Will error if the header row is missing required columns.
197    ///   - Will error if `read_structure_<n>` columns are non-contiguous or don't match
198    ///     `globals.len()` (when present).
199    ///   - Will error if any `read_structure_<n>` cell has a value that fails to parse.
200    ///   - Will error if [`SampleGroup::from_samples`] rejects the parsed records (e.g. duplicate
201    ///     names/barcodes, inconsistent barcode lengths, or barcode/`B`-segment length mismatch).
202    pub fn from_file<P: AsRef<Path>>(path: P, globals: &[ReadStructure]) -> Result<SampleGroup> {
203        let path = path.as_ref();
204        let io = Io::default();
205        let lines = io
206            .read_lines(path)
207            .with_context(|| format!("failed to read sample metadata file {path:?}"))?;
208        let mut iter = lines.into_iter().filter(|l| !l.trim().is_empty());
209
210        let header = iter.next().ok_or_else(|| {
211            anyhow!(
212                "sample metadata file {path:?} is empty (expected header line {})",
213                Sample::deserialize_header_line(),
214            )
215        })?;
216        // Strip a UTF-8 BOM and any trailing CR so files saved on Windows or with a BOM
217        // (e.g. by Excel/Notepad) parse correctly.
218        let header = strip_bom_and_cr(&header);
219        let header_fields: Vec<&str> = header.split(DEFAULT_FILE_DELIMETER as char).collect();
220
221        let sample_id_idx =
222            header_fields.iter().position(|c| *c == SAMPLE_ID_HEADER).ok_or_else(|| {
223                anyhow!("sample metadata header is missing column `{SAMPLE_ID_HEADER}`")
224            })?;
225        let barcode_idx =
226            header_fields.iter().position(|c| *c == BARCODE_HEADER).ok_or_else(|| {
227                anyhow!("sample metadata header is missing column `{BARCODE_HEADER}`")
228            })?;
229
230        // Discover read_structure_<n> columns in header order; require contiguous indexing
231        // starting at 1 and a total count matching `globals.len()` (when present).
232        let mut rs_columns: Vec<(usize, usize)> = Vec::new(); // (n, header_idx)
233        for (idx, name) in header_fields.iter().enumerate() {
234            if let Some(suffix) = name.strip_prefix(READ_STRUCTURE_PREFIX) {
235                let n: usize = suffix
236                    .parse()
237                    .with_context(|| format!("metadata column `{name}` has non-integer suffix"))?;
238                ensure!(n >= 1, "metadata column `{name}` must use 1-based indexing");
239                rs_columns.push((n, idx));
240            }
241        }
242        rs_columns.sort_by_key(|(n, _)| *n);
243        for (i, (n, _)) in rs_columns.iter().enumerate() {
244            ensure!(
245                *n == i + 1,
246                "per-sample read structure columns must be contiguous starting at \
247                 `{READ_STRUCTURE_PREFIX}1` (found `{READ_STRUCTURE_PREFIX}{n}` at position {})",
248                i + 1,
249            );
250        }
251        if !rs_columns.is_empty() {
252            ensure!(
253                rs_columns.len() == globals.len(),
254                "metadata file has {} `{READ_STRUCTURE_PREFIX}<n>` column(s) but \
255                 `--read-structures` has {} entry/entries",
256                rs_columns.len(),
257                globals.len(),
258            );
259        }
260
261        let mut samples: Vec<Sample> = Vec::new();
262        for (line_no, line) in iter.enumerate() {
263            let row_no = line_no + 2; // header is line 1
264            let line = line.trim_end_matches('\r');
265            let cols: Vec<&str> = line.split(DEFAULT_FILE_DELIMETER as char).collect();
266            ensure!(
267                cols.len() == header_fields.len(),
268                "sample metadata row {row_no} has {} columns but header has {}",
269                cols.len(),
270                header_fields.len(),
271            );
272            let sample_id = cols[sample_id_idx].to_owned();
273            let barcode = cols[barcode_idx].to_owned();
274            let read_structures =
275                parse_per_sample_read_structures(row_no, &cols, &rs_columns, globals)?;
276            samples.push(Sample::with_read_structures(
277                samples.len(),
278                sample_id,
279                barcode,
280                read_structures,
281            ));
282        }
283
284        if samples.is_empty() {
285            bail!("sample metadata file {path:?} contained no sample rows");
286        }
287        Self::from_samples(&samples)
288    }
289
290    /// Returns true if this group has per-sample read structures (i.e. at least one sample
291    /// carries custom read structures that override `--read-structures`).
292    #[must_use]
293    pub fn has_per_sample_read_structures(&self) -> bool {
294        self.samples.iter().any(|s| s.read_structures.is_some())
295    }
296
297    /// Returns the per-input matching prefix lengths needed to demultiplex this sample group.
298    /// For each input FASTQ index, this is the maximum across samples of the number of bases
299    /// before the first Template segment in that sample's read structure.  Falls back to the
300    /// corresponding entry in `default_structures` for samples without per-sample read
301    /// structures.
302    ///
303    /// # Errors
304    ///   - Returns an error if any non-Template segment in the matching window has a
305    ///     non-fixed length.
306    ///   - Returns an error if a sample's per-sample read-structure count differs from
307    ///     `default_structures.len()`.
308    pub fn matching_prefix_lens(&self, default_structures: &[ReadStructure]) -> Result<Vec<usize>> {
309        let n = default_structures.len();
310        let mut maxes = vec![0usize; n];
311        for sample in &self.samples {
312            let rs_for_sample = sample.read_structures.as_deref().unwrap_or(default_structures);
313            ensure!(
314                rs_for_sample.len() == n,
315                "sample {}: number of read structures ({}) does not match number of inputs ({})",
316                sample.sample_id,
317                rs_for_sample.len(),
318                n,
319            );
320            for (i, rs) in rs_for_sample.iter().enumerate() {
321                let plen = pre_template_fixed_len(rs).with_context(|| {
322                    let source = if sample.read_structures.is_some() {
323                        format!("sample {}'s `read_structure_{}`", sample.sample_id, i + 1)
324                    } else {
325                        format!(
326                            "the `--read-structures` fallback for sample {} (input {})",
327                            sample.sample_id,
328                            i + 1,
329                        )
330                    };
331                    format!(
332                        "per-sample demultiplexing requires a fixed-length matching window, so \
333                         every segment before the template in {source} must have a fixed length"
334                    )
335                })?;
336                if plen > maxes[i] {
337                    maxes[i] = plen;
338                }
339            }
340        }
341        Ok(maxes)
342    }
343
344    /// Builds the matching pattern bytes for each sample.  The pattern is a fixed-length
345    /// byte sequence (concatenated across all input FASTQs) of total length
346    /// `prefix_lens.iter().sum()`.  For each input, the sample's read structure is walked
347    /// position-by-position; B (sample-barcode) segment positions are filled from
348    /// `sample.barcode` (taken in order across inputs), and M/S/C/T segment positions and any
349    /// trailing positions up to the prefix length are filled with `N` (treated as a wildcard
350    /// by the matcher).
351    ///
352    /// # Errors
353    ///   - Returns an error if any sample's read structure has a non-fixed-length non-template
354    ///     segment in the matching window.
355    ///   - Returns an error if a sample-barcode (`B`) segment crosses the matching window
356    ///     boundary, since silently truncating a barcode would corrupt matching.
357    pub fn build_matching_patterns(
358        &self,
359        default_structures: &[ReadStructure],
360        prefix_lens: &[usize],
361    ) -> Result<Vec<Vec<u8>>> {
362        ensure!(
363            default_structures.len() == prefix_lens.len(),
364            "expected one prefix length per input FASTQ"
365        );
366        let total_len: usize = prefix_lens.iter().sum();
367        let mut patterns = Vec::with_capacity(self.samples.len());
368        for sample in &self.samples {
369            let rs_for_sample = sample.read_structures.as_deref().unwrap_or(default_structures);
370            ensure!(
371                rs_for_sample.len() == prefix_lens.len(),
372                "sample {}: number of read structures ({}) does not match number of inputs ({})",
373                sample.sample_id,
374                rs_for_sample.len(),
375                prefix_lens.len(),
376            );
377
378            // All sample-barcode (B) segments must precede the template (T): a B positioned after
379            // the template falls outside the matching window and can never be used to assign the
380            // read, so reject it explicitly rather than failing later with a confusing count.
381            for (input_idx, rs) in rs_for_sample.iter().enumerate() {
382                let total_b = rs.segments_by_type(SegmentType::SampleBarcode).count();
383                let b_before_template = rs
384                    .iter()
385                    .take_while(|seg| seg.kind != SegmentType::Template)
386                    .filter(|seg| seg.kind == SegmentType::SampleBarcode)
387                    .count();
388                ensure!(
389                    total_b == b_before_template,
390                    "sample {}: all sample-barcode (B) segments must precede the template (T) in \
391                     read structure {} (input {})",
392                    sample.sample_id,
393                    rs,
394                    input_idx + 1,
395                );
396            }
397
398            // The effective read structures' total sample-barcode (B) length must equal the
399            // barcode column length.  This also validates global-only samples in mixed mode, whose
400            // effective structure is the `--read-structures` fallback (a case `from_samples` cannot
401            // check because it has no access to the global structures).
402            let expected_barcode_len: usize = rs_for_sample
403                .iter()
404                .flat_map(|rs| rs.segments_by_type(SegmentType::SampleBarcode))
405                .map(|seg| seg.length.unwrap_or(0))
406                .sum();
407            ensure!(
408                expected_barcode_len == sample.barcode.len(),
409                "sample {}: {}read structure(s) declare {} sample-barcode (B) base(s) but the \
410                 barcode column has {} base(s)",
411                sample.sample_id,
412                if sample.read_structures.is_none() {
413                    "(using --read-structures fallback) "
414                } else {
415                    ""
416                },
417                expected_barcode_len,
418                sample.barcode.len(),
419            );
420
421            let mut pattern = Vec::with_capacity(total_len);
422            let mut barcode_cursor = 0usize;
423            let barcode_bytes = sample.barcode.as_bytes();
424            for (rs, &prefix_len) in rs_for_sample.iter().zip(prefix_lens) {
425                let mut filled = 0usize;
426                for seg in rs.iter() {
427                    if seg.kind == SegmentType::Template {
428                        break;
429                    }
430                    let len = seg.length.ok_or_else(|| {
431                        anyhow!(
432                            "sample {}: non-template segment {seg} in read structure must have a \
433                             fixed length",
434                            sample.sample_id,
435                        )
436                    })?;
437                    let remaining = prefix_len - filled;
438                    let take = len.min(remaining);
439                    if seg.kind == SegmentType::SampleBarcode {
440                        ensure!(
441                            take == len,
442                            "sample {}: sample-barcode segment {seg} crosses the matching \
443                             window boundary (segment length {}, but only {} bases remain in \
444                             the {}-base window)",
445                            sample.sample_id,
446                            len,
447                            remaining,
448                            prefix_len,
449                        );
450                        ensure!(
451                            barcode_cursor + len <= barcode_bytes.len(),
452                            "sample {}: barcode ({} bases) is shorter than the total \
453                             sample-barcode length required by its read structure",
454                            sample.sample_id,
455                            barcode_bytes.len(),
456                        );
457                        pattern.extend_from_slice(
458                            &barcode_bytes[barcode_cursor..barcode_cursor + len],
459                        );
460                        barcode_cursor += len;
461                    } else {
462                        pattern.extend(std::iter::repeat_n(b'N', take));
463                    }
464                    filled += take;
465                    if take < len {
466                        break;
467                    }
468                }
469                if filled < prefix_len {
470                    pattern.extend(std::iter::repeat_n(b'N', prefix_len - filled));
471                }
472            }
473            ensure!(
474                barcode_cursor == sample.barcode.len(),
475                "sample {}: only consumed {} of {} barcode bases when building matching pattern",
476                sample.sample_id,
477                barcode_cursor,
478                sample.barcode.len(),
479            );
480            patterns.push(pattern);
481        }
482        Ok(patterns)
483    }
484}
485
486/// Resolves the per-sample read structure cells for one row of the metadata file.  Returns
487/// `None` if `rs_columns` is empty or every cell is blank (sample uses globals entirely);
488/// otherwise returns `Some(vec)` with each blank cell replaced by `globals[i]`.
489///
490/// Sample-barcode (`B`) segments in any parsed cell must be fixed length: a variable-length
491/// `+B` is rejected here so the error surfaces as a normal `Result` rather than panicking
492/// later during `from_samples` validation.
493fn parse_per_sample_read_structures(
494    row_no: usize,
495    cols: &[&str],
496    rs_columns: &[(usize, usize)],
497    globals: &[ReadStructure],
498) -> Result<Option<Vec<ReadStructure>>> {
499    if rs_columns.is_empty() {
500        return Ok(None);
501    }
502    let mut entries: Vec<Option<ReadStructure>> = Vec::with_capacity(rs_columns.len());
503    for (n, idx) in rs_columns {
504        let raw = cols[*idx].trim();
505        if raw.is_empty() {
506            entries.push(None);
507        } else {
508            let rs = ReadStructure::from_str(raw).with_context(|| {
509                format!(
510                    "sample metadata row {row_no} column `{READ_STRUCTURE_PREFIX}{n}` has \
511                     invalid read structure `{raw}`",
512                )
513            })?;
514            for seg in rs.segments_by_type(SegmentType::SampleBarcode) {
515                ensure!(
516                    seg.length.is_some(),
517                    "sample metadata row {row_no} column `{READ_STRUCTURE_PREFIX}{n}`: \
518                     sample-barcode segment {seg} must be fixed length (variable-length `+B` \
519                     is not supported in per-sample read structures)",
520                );
521            }
522            entries.push(Some(rs));
523        }
524    }
525    if entries.iter().all(Option::is_none) {
526        return Ok(None);
527    }
528    let resolved: Vec<ReadStructure> = entries
529        .into_iter()
530        .enumerate()
531        .map(|(i, e)| e.unwrap_or_else(|| globals[i].clone()))
532        .collect();
533    Ok(Some(resolved))
534}
535
/// Removes at most one leading UTF-8 byte-order mark (`U+FEFF`) and all trailing carriage
/// returns from `s`, so that lines from files saved with a BOM or with CRLF line endings can
/// be parsed like any other line.  Carriage returns elsewhere in the line are left untouched.
fn strip_bom_and_cr(s: &str) -> &str {
    let without_bom = match s.strip_prefix('\u{FEFF}') {
        Some(rest) => rest,
        None => s,
    };
    without_bom.trim_end_matches('\r')
}
541
542/// Returns the offset of the first Template segment in a read structure.  If the read structure
543/// has no Template segments, returns the sum of all (fixed-length) segment lengths.
544///
545/// # Errors
546///   - Returns an error if a non-Template segment lacks a fixed length.
547fn pre_template_fixed_len(rs: &ReadStructure) -> Result<usize> {
548    let mut len = 0;
549    for seg in rs.iter() {
550        if seg.kind == SegmentType::Template {
551            return Ok(len);
552        }
553        len += seg.length.ok_or_else(|| {
554            anyhow!("non-template segment {seg} in read structure {rs} must have a fixed length")
555        })?;
556    }
557    Ok(len)
558}
559
560#[cfg(test)]
561mod tests {
562    use super::*;
563    use fgoxide::io::Io;
564    use std::str::FromStr;
565    use tempfile::TempDir;
566
567    // ############################################################################################
568    // Test [`SampleGroup::from_file`] - Expected to pass
569    // ############################################################################################
570    #[test]
571    fn test_reading_from_tsv_file() {
572        let lines = vec![
573            Sample::deserialize_header_line(),
574            "sample1\tGATTACA".to_owned(),
575            "sample2\tCATGCTA".to_owned(),
576        ];
577        let tempdir = TempDir::new().unwrap();
578        let f1 = tempdir.path().join("sample_metadata.tsv");
579
580        let io = Io::default();
581        io.write_lines(&f1, &lines).unwrap();
582        let samples_metadata = SampleGroup::from_file(&f1, &[]).unwrap();
583
584        assert!(samples_metadata.samples[0].sample_id == "sample1");
585        assert!(samples_metadata.samples[1].sample_id == "sample2");
586        assert!(samples_metadata.samples[0].barcode == "GATTACA");
587        assert!(samples_metadata.samples[1].barcode == "CATGCTA");
588        assert!(!samples_metadata.has_per_sample_read_structures());
589    }
590
591    #[test]
592    fn test_reading_from_file_with_empty_lines_at_end() {
593        let lines = vec![
594            Sample::deserialize_header_line(),
595            "sample1\tGATTACA".to_owned(),
596            "sample2\tCATGCTA".to_owned(),
597            String::new(),
598            String::new(),
599        ];
600        let tempdir = TempDir::new().unwrap();
601        let f1 = tempdir.path().join("sample_metadata.tsv");
602
603        let io = Io::default();
604        io.write_lines(&f1, &lines).unwrap();
605        let samples_metadata = SampleGroup::from_file(&f1, &[]).unwrap();
606
607        assert!(samples_metadata.samples[0].sample_id == "sample1");
608        assert!(samples_metadata.samples[1].sample_id == "sample2");
609        assert!(samples_metadata.samples[0].barcode == "GATTACA");
610        assert!(samples_metadata.samples[1].barcode == "CATGCTA");
611    }
612
613    #[test]
614    fn test_new_sample_non_agct_bases_in_barcode_allowed() {
615        let name = "s_1_example_name".to_owned();
616        let barcode = "GATTANN".to_owned();
617        let ordinal = 0;
618        let _sample = Sample::new(ordinal, name, barcode);
619    }
620
621    #[test]
622    fn test_tsv_file_delim_error() {
623        let lines: Vec<String> = ["sample_id,barcode", "sample1,GATTACA", "sample2,CATGCTA"]
624            .iter()
625            .map(|&s| s.into())
626            .collect();
627        let tempdir = TempDir::new().unwrap();
628        let f1 = tempdir.path().join("sample_metadata.tsv");
629
630        let io = Io::default();
631        io.write_lines(&f1, &lines).unwrap();
632        let err = SampleGroup::from_file(&f1, &[]).unwrap_err();
633        let msg = format!("{err:#}");
634        assert!(msg.contains("missing column `sample_id`"), "got: {msg}");
635    }
636
637    // ############################################################################################
638    // Test [`SampleGroup::from_file`] - Expected to error or panic
639    // ############################################################################################
640    #[test]
641    fn test_reading_from_file_with_no_header() {
642        let lines = vec!["sample1\tGATTACA", "sample2\tCATGCTA"];
643        let tempdir = TempDir::new().unwrap();
644        let f1 = tempdir.path().join("sample_metadata.tsv");
645
646        let io = Io::default();
647        io.write_lines(&f1, &lines).unwrap();
648        let err = SampleGroup::from_file(&f1, &[]).unwrap_err();
649        let msg = format!("{err:#}");
650        assert!(msg.contains("missing column `sample_id`"), "got: {msg}");
651    }
652
653    #[test]
654    fn test_reading_header_only_file() {
655        let lines = vec![Sample::deserialize_header_line()];
656        let tempdir = TempDir::new().unwrap();
657        let f1 = tempdir.path().join("sample_metadata.tsv");
658
659        let io = Io::default();
660        io.write_lines(&f1, &lines).unwrap();
661        let err = SampleGroup::from_file(&f1, &[]).unwrap_err();
662        let msg = format!("{err:#}");
663        assert!(msg.contains("contained no sample rows"), "got: {msg}");
664    }
665
666    #[test]
667    fn test_reading_empty_file() {
668        let lines = vec![""];
669        let tempdir = TempDir::new().unwrap();
670        let f1 = tempdir.path().join("sample_metadata.tsv");
671
672        let io = Io::default();
673        io.write_lines(&f1, &lines).unwrap();
674        let err = SampleGroup::from_file(&f1, &[]).unwrap_err();
675        let msg = format!("{err:#}");
676        assert!(msg.contains("is empty") || msg.contains("missing column"), "got: {msg}");
677    }
678
679    #[test]
680    fn test_reading_from_file_with_duplicate_barcodes_errors() {
681        // A malformed user TSV (duplicate barcodes) must return a clean error, not panic.
682        let lines = vec!["sample_id\tbarcode", "sample1\tGATTACA", "sample2\tGATTACA"];
683        let tempdir = TempDir::new().unwrap();
684        let f1 = tempdir.path().join("sample_metadata.tsv");
685
686        let io = Io::default();
687        io.write_lines(&f1, &lines).unwrap();
688        let err = SampleGroup::from_file(&f1, &[]).unwrap_err();
689        let msg = format!("{err:#}");
690        assert!(msg.contains("Each sample barcode must be unique"), "got: {msg}");
691    }
692
693    #[test]
694    fn test_reading_non_existent_file() {
695        let tempdir = TempDir::new().unwrap();
696        let f1 = tempdir.path().join("sample_metadata.tsv");
697        let err = SampleGroup::from_file(&f1, &[]).unwrap_err();
698        let msg = format!("{err:#}");
699        assert!(msg.contains("failed to read sample metadata file"), "got: {msg}");
700    }
701
702    // ############################################################################################
703    // Test [`Sample::new`] - Expected to pass
704    // ############################################################################################
705    #[test]
706    fn test_new_sample_success() {
707        let name = "s_1_example_name".to_owned();
708        let barcode = "GATTACA".to_owned();
709        let ordinal = 0;
710        let sample = Sample::new(ordinal, name.clone(), barcode.clone());
711        assert_eq!(
712            Sample { sample_id: name, barcode, read_structures: None, ordinal },
713            sample,
714            "Sample differed from expectation"
715        );
716    }
717
718    // ############################################################################################
719    // Test [`Sample::new`] - Expected to panic
720    // ############################################################################################
721    #[test]
722    #[should_panic(expected = "Sample name cannot be empty")]
723    fn test_new_sample_fail1_empty_sample_name() {
724        let name = String::new();
725        let barcode = "GATTACA".to_owned();
726        let ordinal = 0;
727        let _sample = Sample::new(ordinal, name, barcode);
728    }
729
730    #[test]
731    #[should_panic(expected = "Sample barcode cannot be empty")]
732    fn test_new_sample_fail2_empty_barcode() {
733        let name = "s_1_example_name".to_owned();
734        let barcode = String::new();
735        let ordinal = 0;
736        let _sample = Sample::new(ordinal, name, barcode);
737    }
738
739    // ############################################################################################
740    // Test [`SampleGroup::from_samples`] - expected to pass
741    // ############################################################################################
742    #[test]
743    fn test_from_samples_sample_group_pass1_single_sample() {
744        let sample1 = Sample::new(0, "sample_1".to_owned(), "GATTACA".to_owned());
745        let samples_vec = vec![sample1.clone()];
746        let sample_group = SampleGroup::from_samples(&samples_vec).unwrap();
747
748        assert_eq!(sample_group, SampleGroup { samples: vec![sample1] });
749    }
750
751    #[test]
752    fn test_from_samples_sample_group_pass2_multi_unique_samples() {
753        let sample1 = Sample::new(0, "sample_1".to_owned(), "GATTACA".to_owned());
754        let sample2 = Sample::new(1, "sample_2".to_owned(), "CATGGAT".to_owned());
755        let samples_vec = vec![sample1.clone(), sample2.clone()];
756        let sample_group = SampleGroup::from_samples(&samples_vec).unwrap();
757
758        assert_eq!(sample_group, SampleGroup { samples: vec![sample1, sample2] });
759    }
760
761    #[test]
762    fn test_from_samples_sample_group_pass3_ordinal_values_will_be_changed_by_new() {
763        let sample1 = Sample::new(0, "sample_1".to_owned(), "GATTACA".to_owned());
764        let sample2_before = Sample::new(2, "sample_2".to_owned(), "CATGGAT".to_owned());
765        let sample2_after = Sample::new(1, "sample_2".to_owned(), "CATGGAT".to_owned());
766        let samples_vec = vec![sample1.clone(), sample2_before];
767        let sample_group = SampleGroup::from_samples(&samples_vec).unwrap();
768
769        assert_eq!(sample_group, SampleGroup { samples: vec![sample1, sample2_after] });
770    }
771
772    // ############################################################################################
773    // Test [`SampleGroup::from_samples`] - expected to error
774    // ############################################################################################
775    #[test]
776    fn test_from_samples_sample_group_fail1_no_samples() {
777        let samples = vec![];
778        let err = SampleGroup::from_samples(&samples).unwrap_err();
779        assert!(err.to_string().contains("Must provide one or more sample"), "got: {err:#}");
780    }
781
782    #[test]
783    fn test_from_samples_sample_group_fail2_duplicate_sample_names() {
784        let samples = vec![
785            Sample::new(0, "sample_1".to_owned(), "GATTACA".to_owned()),
786            Sample::new(0, "sample_1".to_owned(), "CATGGAT".to_owned()),
787        ];
788        let err = SampleGroup::from_samples(&samples).unwrap_err();
789        assert!(
790            err.to_string().contains("Each sample name must be unique, duplicate identified"),
791            "got: {err:#}",
792        );
793    }
794
795    #[test]
796    fn test_from_samples_sample_group_fail3_duplicate_barcodes() {
797        let samples = vec![
798            Sample::new(0, "sample_1".to_owned(), "GATTACA".to_owned()),
799            Sample::new(0, "sample_2".to_owned(), "GATTACA".to_owned()),
800        ];
801        let err = SampleGroup::from_samples(&samples).unwrap_err();
802        assert!(
803            err.to_string().contains("Each sample barcode must be unique, duplicate identified"),
804            "got: {err:#}",
805        );
806    }
807
808    #[test]
809    fn test_from_samples_sample_group_fail4_barcodes_of_different_lengths() {
810        let samples = vec![
811            Sample::new(0, "sample_1".to_owned(), "GATTACA".to_owned()),
812            Sample::new(0, "sample_2".to_owned(), "CATGGA".to_owned()),
813        ];
814        let err = SampleGroup::from_samples(&samples).unwrap_err();
815        assert!(err.to_string().contains("All barcodes must have the same length"), "got: {err:#}");
816    }
817
818    // ############################################################################################
819    // Tests for per-sample read structures.
820    // ############################################################################################
821    fn make_rs(s: &str) -> ReadStructure {
822        ReadStructure::from_str(s).unwrap()
823    }
824
825    fn sample_with_rs(name: &str, barcode: &str, structures: &[&str]) -> Sample {
826        let rs = structures.iter().map(|s| make_rs(s)).collect();
827        Sample::with_read_structures(0, name.to_owned(), barcode.to_owned(), Some(rs))
828    }
829
830    fn write_metadata(tempdir: &TempDir, lines: &[String]) -> std::path::PathBuf {
831        let f1 = tempdir.path().join("metadata.tsv");
832        Io::default().write_lines(&f1, lines).unwrap();
833        f1
834    }
835
836    #[test]
837    fn test_per_sample_read_structures_round_trip_via_metadata() {
838        // Two-input case: each input contributes 7B, so the barcode column carries 14 bases.
839        let lines = vec![
840            "sample_id\tbarcode\tread_structure_1\tread_structure_2".to_owned(),
841            "S1\tGATTACAACGTACG\t3M7B1S+T\t3M7B1S+T".to_owned(),
842            "S2\tGGGGGGGTTTTTTT\t3M1S7B1S+T\t3M1S7B1S+T".to_owned(),
843        ];
844        let tempdir = TempDir::new().unwrap();
845        let f1 = write_metadata(&tempdir, &lines);
846        let globals = vec![make_rs("3M9B+T"), make_rs("9B+T")];
847        let group = SampleGroup::from_file(&f1, &globals).unwrap();
848        assert!(group.has_per_sample_read_structures());
849        let s1_rs = group.samples[0].read_structures.as_ref().unwrap();
850        assert_eq!(s1_rs.len(), 2);
851        let s2_rs = group.samples[1].read_structures.as_ref().unwrap();
852        assert_eq!(s2_rs.len(), 2);
853    }
854
855    /// Per-sample read structures may have differing per-input `(T, B, M, C)` segment counts
856    /// across samples, as long as each sample's B-segment lengths sum to its `barcode` column.
857    #[test]
858    fn test_per_sample_read_structures_signatures_may_differ_across_samples() {
859        // S1 has one B-segment of length 14; S2 has two B-segments (7+7) of total length 14.
860        let s1 = sample_with_rs("S1", "GATTACAACGTACG", &["3M14B+T"]);
861        let s2 = sample_with_rs("S2", "TTTTTTTGGGGGGG", &["3M7B7B+T"]);
862        let group = SampleGroup::from_samples(&[s1, s2]).unwrap();
863        assert!(group.has_per_sample_read_structures());
864    }
865
866    /// Mixed mode: some samples have per-sample read structures, others fall back to globals.
867    #[test]
868    fn test_per_sample_read_structures_mixed_with_global_only_samples() {
869        let s1 = sample_with_rs("S1", "GATTACA", &["3M7B1S+T"]);
870        let s2 = Sample::new(0, "S2".to_owned(), "CCCCCCC".to_owned());
871        let group = SampleGroup::from_samples(&[s1, s2]).unwrap();
872        assert!(group.has_per_sample_read_structures());
873        assert!(group.samples[0].read_structures.is_some());
874        assert!(group.samples[1].read_structures.is_none());
875    }
876
877    #[test]
878    fn test_per_sample_read_structures_barcode_length_mismatch() {
879        // S1 declares 7B + 7B = 14 bases of barcode but provides only 7 in the column.
880        let s1 = sample_with_rs("S1", "GATTACA", &["3M7B1S+T", "3M7B1S+T"]);
881        let err = SampleGroup::from_samples(&[s1]).unwrap_err();
882        assert!(err.to_string().contains("barcode column has"), "got: {err:#}");
883    }
884
885    #[test]
886    fn test_matching_prefix_lens_uses_max_across_samples() {
887        // S1 prefix per input: 3+7+1 = 11; S2 prefix: 3+1+7+1 = 12
888        let s1 = sample_with_rs("S1", "GATTACA", &["3M7B1S+T"]);
889        let s2 = sample_with_rs("S2", "GGGGGGG", &["3M1S7B1S+T"]);
890        let group = SampleGroup::from_samples(&[s1, s2]).unwrap();
891        let defaults = vec![make_rs("3M9B+T")];
892        let lens = group.matching_prefix_lens(&defaults).unwrap();
893        assert_eq!(lens, vec![12]);
894    }
895
896    #[test]
897    fn test_build_matching_patterns_codec_two_samples() {
898        let s1 = sample_with_rs("S1", "GATTACA", &["3M7B1S+T"]);
899        let s2 = sample_with_rs("S2", "GGGGGGG", &["3M1S7B1S+T"]);
900        let group = SampleGroup::from_samples(&[s1, s2]).unwrap();
901        let defaults = vec![make_rs("3M9B+T")];
902        let lens = group.matching_prefix_lens(&defaults).unwrap();
903        let patterns = group.build_matching_patterns(&defaults, &lens).unwrap();
904        // Total length is 12 for each pattern.
905        // S1: NNN + GATTACA + N (S=skip wildcarded) + N (padding to 12)
906        assert_eq!(patterns[0], b"NNNGATTACANN");
907        // S2: NNN + N (S=stagger wildcarded) + GGGGGGG + N (S=trailing wildcarded)
908        assert_eq!(patterns[1], b"NNNNGGGGGGGN");
909    }
910
911    #[test]
912    fn test_build_matching_patterns_falls_back_to_defaults_when_no_per_sample() {
913        let s1 = Sample::new(0, "S1".to_owned(), "GATTACA".to_owned());
914        let group = SampleGroup::from_samples(&[s1]).unwrap();
915        let defaults = vec![make_rs("3M7B1S+T")];
916        let lens = group.matching_prefix_lens(&defaults).unwrap();
917        assert_eq!(lens, vec![11]);
918        let patterns = group.build_matching_patterns(&defaults, &lens).unwrap();
919        assert_eq!(patterns[0], b"NNNGATTACAN");
920    }
921
922    #[test]
923    fn test_build_matching_patterns_dual_input_concatenated() {
924        // Two inputs, each with its own staggered structure; barcode column concatenates the
925        // two B-segments left-to-right across inputs.
926        let s1 = sample_with_rs("S1", "GATTACAACGTACG", &["3M7B1S+T", "7B+T"]);
927        let s2 = sample_with_rs("S2", "GGGGGGGTTTTTTT", &["3M1S7B1S+T", "1S7B+T"]);
928        let group = SampleGroup::from_samples(&[s1, s2]).unwrap();
929        let defaults = vec![make_rs("3M9B+T"), make_rs("9B+T")];
930        let lens = group.matching_prefix_lens(&defaults).unwrap();
931        // Input 1: max(11, 12) = 12; Input 2: max(7, 8) = 8.
932        assert_eq!(lens, vec![12, 8]);
933        let patterns = group.build_matching_patterns(&defaults, &lens).unwrap();
934        // Total pattern length: 12 + 8 = 20.
935        let mut expected_s1 = b"NNNGATTACANN".to_vec();
936        expected_s1.extend_from_slice(b"ACGTACGN");
937        assert_eq!(patterns[0], expected_s1);
938        let mut expected_s2 = b"NNNNGGGGGGGN".to_vec();
939        expected_s2.extend_from_slice(b"NTTTTTTT");
940        assert_eq!(patterns[1], expected_s2);
941    }
942
943    /// A sample-barcode segment crossing the matching window is rejected with a clear error
944    /// rather than silently truncated.
945    #[test]
946    fn test_build_matching_patterns_b_segment_crossing_window_errors() {
947        let s1 = sample_with_rs("S1", "GATTACA", &["3M7B1S+T"]);
948        let group = SampleGroup::from_samples(&[s1]).unwrap();
949        let defaults = vec![make_rs("3M7B1S+T")];
950        let lens = vec![5usize]; // forced narrow window for the test
951        let err = group.build_matching_patterns(&defaults, &lens).unwrap_err();
952        let msg = format!("{err:#}");
953        assert!(msg.contains("crosses the matching window boundary"), "got: {msg}");
954    }
955
956    /// A `SampleGroup` constructed directly (bypassing `from_samples`' barcode-length
957    /// validation) whose barcode is shorter than its read structure's sample-barcode
958    /// segments surfaces a clean error naming the declared vs. provided barcode lengths.
959    #[test]
960    fn test_build_matching_patterns_barcode_shorter_than_b_segments_errors() {
961        // `3M7B1S+T` requires 7 barcode bases, but the barcode column only has 3.
962        let s1 = sample_with_rs("S1", "GAT", &["3M7B1S+T"]);
963        let group = SampleGroup { samples: vec![s1] };
964        let defaults = vec![make_rs("3M7B1S+T")];
965        let lens = vec![11usize];
966        let err = group.build_matching_patterns(&defaults, &lens).unwrap_err();
967        let msg = format!("{err:#}");
968        assert!(
969            msg.contains("declare 7 sample-barcode (B) base(s) but the barcode column has 3"),
970            "got: {msg}",
971        );
972    }
973
974    /// A sample-barcode (B) segment positioned after the template is rejected with a clear
975    /// message rather than the cryptic "only consumed N of M barcode bases".
976    #[test]
977    fn test_build_matching_patterns_barcode_after_template_errors() {
978        // 4B10T8B: barcode segments straddle the template; the trailing 8B can never match.
979        let s1 = sample_with_rs("S1", "ACGTACGTACGT", &["4B10T8B"]);
980        let group = SampleGroup::from_samples(&[s1]).unwrap();
981        let defaults = vec![make_rs("4B10T8B")];
982        let lens = group.matching_prefix_lens(&defaults).unwrap();
983        let err = group.build_matching_patterns(&defaults, &lens).unwrap_err();
984        let msg = format!("{err:#}");
985        assert!(msg.contains("must precede the template"), "got: {msg}");
986    }
987
988    /// A global-only sample (mixed mode) whose barcode length doesn't match the global
989    /// `--read-structures` B-length is rejected with a message pointing at the fallback.
990    #[test]
991    fn test_build_matching_patterns_global_only_barcode_mismatch_errors() {
992        // S1 (per-sample 6B) and S2 (global-only) both have 6-base barcodes, but the global
993        // structure declares 7 B bases, so S2's effective structure doesn't match its barcode.
994        let s1 = sample_with_rs("S1", "GATTAC", &["3M6B1S+T"]);
995        let s2 = Sample::new(0, "S2".to_owned(), "CCCCCC".to_owned());
996        let group = SampleGroup::from_samples(&[s1, s2]).unwrap();
997        let defaults = vec![make_rs("3M7B1S+T")];
998        let lens = group.matching_prefix_lens(&defaults).unwrap();
999        let err = group.build_matching_patterns(&defaults, &lens).unwrap_err();
1000        let msg = format!("{err:#}");
1001        assert!(msg.contains("--read-structures fallback") && msg.contains("S2"), "got: {msg}",);
1002    }
1003
1004    /// In per-sample mode every input's pre-template segments must be fixed length; a variable
1005    /// pre-template segment yields a message explaining the fixed-window requirement.
1006    #[test]
1007    fn test_matching_prefix_lens_variable_pre_template_errors() {
1008        let s1 = sample_with_rs("S1", "ACGTACGT", &["8B+M"]);
1009        let group = SampleGroup::from_samples(&[s1]).unwrap();
1010        let defaults = vec![make_rs("8B+M")];
1011        let err = group.matching_prefix_lens(&defaults).unwrap_err();
1012        let msg = format!("{err:#}");
1013        assert!(msg.contains("fixed-length matching window"), "got: {msg}");
1014    }
1015
1016    // ############################################################################################
1017    // Per-cell fallback tests for `from_file`.
1018    // ############################################################################################
1019    /// A blank `read_structure_<n>` cell falls back to `globals[n-1]` for that sample.
1020    #[test]
1021    fn test_per_cell_fallback_uses_globals_for_blank_cells() {
1022        let lines = vec![
1023            "sample_id\tbarcode\tread_structure_1\tread_structure_2".to_owned(),
1024            // S1 overrides only input 1; input 2 is blank → uses globals[1].
1025            "S1\tGATTACAGGGGGGG\t3M7B1S+T\t".to_owned(),
1026            // S2 overrides only input 2; input 1 is blank → uses globals[0].
1027            "S2\tCCCCCCCAAAAAAA\t\t1S7B+T".to_owned(),
1028        ];
1029        let tempdir = TempDir::new().unwrap();
1030        let f1 = write_metadata(&tempdir, &lines);
1031        let globals = vec![make_rs("3M7B+T"), make_rs("7B+T")];
1032        let group = SampleGroup::from_file(&f1, &globals).unwrap();
1033        let s1_rs = group.samples[0].read_structures.as_ref().unwrap();
1034        assert_eq!(s1_rs.len(), 2);
1035        assert_eq!(s1_rs[0].to_string(), "3M7B1S+T");
1036        assert_eq!(s1_rs[1].to_string(), "7B+T");
1037        let s2_rs = group.samples[1].read_structures.as_ref().unwrap();
1038        assert_eq!(s2_rs[0].to_string(), "3M7B+T");
1039        assert_eq!(s2_rs[1].to_string(), "1S7B+T");
1040    }
1041
1042    /// A row whose `read_structure_<n>` cells are all blank uses globals entirely (the
1043    /// sample's stored `read_structures` is `None`).
1044    #[test]
1045    fn test_per_cell_all_blank_row_falls_back_to_globals_entirely() {
1046        let lines = vec![
1047            "sample_id\tbarcode\tread_structure_1\tread_structure_2".to_owned(),
1048            "S1\tGATTACAGGGGGGG\t3M7B1S+T\t3M7B1S+T".to_owned(),
1049            "S2\tCCCCCCCAAAAAAA\t\t".to_owned(),
1050        ];
1051        let tempdir = TempDir::new().unwrap();
1052        let f1 = write_metadata(&tempdir, &lines);
1053        let globals = vec![make_rs("3M7B+T"), make_rs("3M7B+T")];
1054        let group = SampleGroup::from_file(&f1, &globals).unwrap();
1055        assert!(group.samples[0].read_structures.is_some());
1056        assert!(group.samples[1].read_structures.is_none());
1057    }
1058
1059    /// `read_structure_<n>` column count must match `globals.len()` when columns are present.
1060    #[test]
1061    fn test_per_sample_column_count_must_match_globals() {
1062        let lines = vec![
1063            "sample_id\tbarcode\tread_structure_1".to_owned(),
1064            "S1\tGATTACA\t3M7B1S+T".to_owned(),
1065        ];
1066        let tempdir = TempDir::new().unwrap();
1067        let f1 = write_metadata(&tempdir, &lines);
1068        let globals = vec![make_rs("3M7B+T"), make_rs("100T")];
1069        let err = SampleGroup::from_file(&f1, &globals).unwrap_err();
1070        let msg = format!("{err:#}");
1071        assert!(
1072            msg.contains("`read_structure_<n>` column(s)") && msg.contains("--read-structures"),
1073            "got: {msg}",
1074        );
1075    }
1076
1077    /// A variable-length sample-barcode (`+B`) in a per-sample column must surface as a
1078    /// normal `Result` error rather than panic during downstream validation.
1079    #[test]
1080    fn test_per_sample_variable_length_b_segment_errors() {
1081        let lines =
1082            vec!["sample_id\tbarcode\tread_structure_1".to_owned(), "S1\tGATTACA\t3M+B".to_owned()];
1083        let tempdir = TempDir::new().unwrap();
1084        let f1 = write_metadata(&tempdir, &lines);
1085        let globals = vec![make_rs("3M7B+T")];
1086        let err = SampleGroup::from_file(&f1, &globals).unwrap_err();
1087        let msg = format!("{err:#}");
1088        assert!(msg.contains("must be fixed length"), "got: {msg}");
1089    }
1090
1091    /// A UTF-8 BOM on the first header field should not cause a confusing
1092    /// "missing column `sample_id`" error.
1093    #[test]
1094    fn test_header_with_utf8_bom_is_handled() {
1095        let lines = vec![
1096            format!("\u{FEFF}{}", Sample::deserialize_header_line()),
1097            "sample1\tGATTACA".to_owned(),
1098        ];
1099        let tempdir = TempDir::new().unwrap();
1100        let f1 = write_metadata(&tempdir, &lines);
1101        let group = SampleGroup::from_file(&f1, &[]).unwrap();
1102        assert_eq!(group.samples[0].sample_id, "sample1");
1103        assert_eq!(group.samples[0].barcode, "GATTACA");
1104    }
1105
1106    /// CRLF line endings on a row should not cause the trailing `\r` to bleed into the
1107    /// barcode column and break downstream validation.
1108    #[test]
1109    fn test_rows_with_crlf_endings_are_handled() {
1110        let header = format!("{}\r", Sample::deserialize_header_line());
1111        let lines = vec![header, "sample1\tGATTACA\r".to_owned(), "sample2\tCATGCTA\r".to_owned()];
1112        let tempdir = TempDir::new().unwrap();
1113        let f1 = write_metadata(&tempdir, &lines);
1114        let group = SampleGroup::from_file(&f1, &[]).unwrap();
1115        assert_eq!(group.samples[0].barcode, "GATTACA");
1116        assert_eq!(group.samples[1].barcode, "CATGCTA");
1117    }
1118
1119    /// `matching_prefix_lens` must error (not panic) when a sample's per-sample read
1120    /// structure count differs from `default_structures.len()`.
1121    #[test]
1122    fn test_matching_prefix_lens_errors_on_rs_count_mismatch() {
1123        // Sample declares two read structures; defaults provide only one.
1124        let s1 = sample_with_rs("S1", "GATTACAGGGGGGG", &["3M7B1S+T", "7B+T"]);
1125        let group = SampleGroup::from_samples(&[s1]).unwrap();
1126        let defaults = vec![make_rs("3M7B+T")];
1127        let err = group.matching_prefix_lens(&defaults).unwrap_err();
1128        let msg = format!("{err:#}");
1129        assert!(
1130            msg.contains("number of read structures") && msg.contains("number of inputs"),
1131            "got: {msg}",
1132        );
1133    }
1134
1135    /// Non-contiguous `read_structure_<n>` columns must error.
1136    #[test]
1137    fn test_per_sample_columns_must_be_contiguous() {
1138        let lines = vec![
1139            "sample_id\tbarcode\tread_structure_1\tread_structure_3".to_owned(),
1140            "S1\tGATTACA\t3M7B1S+T\t1S7B+T".to_owned(),
1141        ];
1142        let tempdir = TempDir::new().unwrap();
1143        let f1 = write_metadata(&tempdir, &lines);
1144        let globals = vec![make_rs("3M7B+T"), make_rs("7B+T")];
1145        let err = SampleGroup::from_file(&f1, &globals).unwrap_err();
1146        let msg = format!("{err:#}");
1147        assert!(msg.contains("contiguous"), "got: {msg}");
1148    }
1149
1150    /// A non-integer suffix on a `read_structure_<n>` column must error.
1151    #[test]
1152    fn test_per_sample_columns_must_have_integer_suffix() {
1153        let lines = vec![
1154            "sample_id\tbarcode\tread_structure_abc".to_owned(),
1155            "S1\tGATTACA\t3M7B1S+T".to_owned(),
1156        ];
1157        let tempdir = TempDir::new().unwrap();
1158        let f1 = write_metadata(&tempdir, &lines);
1159        let globals = vec![make_rs("3M7B+T")];
1160        let err = SampleGroup::from_file(&f1, &globals).unwrap_err();
1161        let msg = format!("{err:#}");
1162        assert!(msg.contains("non-integer suffix"), "got: {msg}");
1163    }
1164
1165    /// A zero-indexed `read_structure_0` column must error since indexing is 1-based.
1166    #[test]
1167    fn test_per_sample_columns_must_be_one_indexed() {
1168        let lines = vec![
1169            "sample_id\tbarcode\tread_structure_0".to_owned(),
1170            "S1\tGATTACA\t3M7B1S+T".to_owned(),
1171        ];
1172        let tempdir = TempDir::new().unwrap();
1173        let f1 = write_metadata(&tempdir, &lines);
1174        let globals = vec![make_rs("3M7B+T")];
1175        let err = SampleGroup::from_file(&f1, &globals).unwrap_err();
1176        let msg = format!("{err:#}");
1177        assert!(msg.contains("1-based indexing"), "got: {msg}");
1178    }
1179}