fqtk_lib/
samples.rs

1use super::is_valid_base;
2use anyhow::Result;
3use fgoxide::io::DelimFile;
4use itertools::Itertools;
5use serde::Deserialize;
6use serde_aux::prelude::*;
7use std::collections::hash_map::RandomState;
8use std::collections::HashSet;
9use std::fmt::{self, Display};
10use std::path::Path;
11
/// Field delimiter (a single tab byte) handed to the delimited-file reader when loading sample
/// metadata. The spelling of the identifier is kept as-is because other items refer to it.
const DEFAULT_FILE_DELIMETER: u8 = b'\t';
13
14/// Struct for describing a single sample and metadata associated with that sample.
15#[derive(Clone, Deserialize, Debug, PartialEq, Eq)]
16pub struct Sample {
17    /// ID of the sample or library
18    pub sample_id: String,
19    /// DNA barcode associated with the sample
20    pub barcode: String,
21    /// index of the sample in the [`SampleGroup`] object, used for syncing indices across
22    /// different structs
23    #[serde(skip_deserializing)]
24    ordinal: usize,
25}
26
27impl Display for Sample {
28    /// Implements a nice format display for the [`Sample`] struct.
29    /// E.g. A sample with ordinal 2, name test-sample, and barcode GATTACA would look like:
30    /// Sample(0002) - { name: test-sample    barcode: GATTACA}
31    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
32        write!(
33            f,
34            "Sample({:04}) - {{ name: {}\tbarcode: {} }}",
35            self.ordinal, self.sample_id, self.barcode
36        )
37    }
38}
39
40impl Sample {
41    /// Validates inputs to generate a [`Self`] struct and instantiates the struct if they are
42    /// valid.
43    /// # Panics
44    ///   - Panics if sample name is empty string.
45    ///   - Panics if barcode is empty string.
46    ///   - Panics if barcode has bases other than A, C, G, or T.
47    #[must_use]
48    pub fn new(ordinal: usize, name: String, barcode: String) -> Self {
49        assert!(!name.is_empty(), "Sample name cannot be empty");
50        assert!(!barcode.is_empty(), "Sample barcode cannot be empty");
51        assert!(
52            barcode.as_bytes().iter().all(|&b| is_valid_base(b)),
53            "All sample barcode bases must be one of A, C, G, or T"
54        );
55        Self { sample_id: name, barcode, ordinal }
56    }
57
58    /// Returns the header line expected by serde when deserializing
59    #[must_use]
60    pub fn deserialize_header_line() -> String {
61        let field_names = serde_introspect::<Self>();
62        let skip_deserialize_fields: HashSet<&str, RandomState> = HashSet::from_iter(["ordinal"]);
63        let final_field_names: Vec<String> = field_names
64            .iter()
65            .filter(|&&f| !skip_deserialize_fields.contains(f))
66            .map(|&f| f.to_owned())
67            .collect();
68        final_field_names.join("\t")
69    }
70}
71
72/// Struct for storing information about multiple samples and for defining functions associated
73/// with groups of [`Sample`]s, rather than individual structs.
74#[derive(Clone, Debug, PartialEq, Eq)]
75pub struct SampleGroup {
76    /// A group of samples
77    pub samples: Vec<Sample>,
78}
79
80impl Display for SampleGroup {
81    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
82        writeln!(f, "SampleGroup {{")?;
83        for sample in &self.samples {
84            writeln!(f, "    {}", sample)?;
85        }
86        writeln!(f, "}}")
87    }
88}
89
90impl SampleGroup {
91    /// Validates a group of [`Sample`]s and instantiates a [`Self`] struct if they are
92    /// valid. Will clone the [`Sample`] structs and change the number on the `ordinal` field on
93    /// those cloneto match the order in which they are stored in this [`Self`]
94    /// # Panics
95    ///   - Will panic if sample metadata sheet is improperly formatted
96    ///   - Will panic if there are duplicate sample names provided
97    ///   - Will panic if there are duplicate barcodes provided
98    ///   - Will panic if barcodes don't all have the same length
99    #[must_use]
100    pub fn from_samples(samples: &[Sample]) -> Self {
101        // Validate that we have at least one name
102        assert!(!samples.is_empty(), "Must provide one or more sample");
103
104        // Validate that all the sample names are unique
105        assert!(
106            samples.iter().map(|s| &s.sample_id).all_unique(),
107            "Each sample name must be unique, duplicate identified"
108        );
109
110        // Validate that the barcodes are all unique
111        assert!(
112            samples.iter().map(|s| &s.barcode).all_unique(),
113            "Each sample barcode must be unique, duplicate identified",
114        );
115
116        // Validate that the barcodes are all of the same length
117        let first_barcode_length = samples[0].barcode.len();
118        assert!(
119            samples.iter().map(|s| &s.barcode).all(|b| b.len() == first_barcode_length),
120            "All barcodes must have the same length",
121        );
122
123        Self {
124            samples: samples
125                .iter()
126                .enumerate()
127                .map(|(ordinal, sample)| {
128                    Sample::new(ordinal, sample.sample_id.clone(), sample.barcode.clone())
129                })
130                .collect(),
131        }
132    }
133
134    /// Attempts to load a [`Self`] object from a file. File should be delimeted with
135    /// `delimiter`, should have a header with `name` and `barcode` fields present.
136    /// # Errors
137    ///   - Will error if file cannot be read, either due to not the file not existing or due to
138    ///     the format being different from the format expected.
139    /// # Panics
140    ///   - Will panic if sample metadata sheet is improperly formatted
141    ///   - Will panic if a different number of names and barcodes are provided
142    ///   - Will panic if each
143    pub fn from_file<P: AsRef<Path>>(path: &P) -> Result<SampleGroup, fgoxide::FgError> {
144        let reader = DelimFile::default();
145        Ok(Self::from_samples(&reader.read(path, DEFAULT_FILE_DELIMETER, false)?))
146    }
147}
148
/// Unit tests for [`Sample`] and [`SampleGroup`].
///
/// Tests are grouped by the function under test and, within each function, by whether the call
/// is expected to succeed or to fail.
#[cfg(test)]
mod tests {
    use core::panic;

    use super::*;
    use csv::DeserializeErrorKind as CsvDeserializeErrorEnum;
    use csv::ErrorKind as CsvErrorEnum;
    use fgoxide::{self, io::Io};
    use serde::de::value::Error as SerdeError;
    use serde::de::Error;
    use tempfile::TempDir;

    // ############################################################################################
    // Test [`SampleGroup::from_file`] - Expected to pass
    // ############################################################################################
    #[test]
    fn test_reading_from_tsv_file() {
        let lines = vec![
            Sample::deserialize_header_line(),
            "sample1\tGATTACA".to_owned(),
            "sample2\tCATGCTA".to_owned(),
        ];
        let tempdir = TempDir::new().unwrap();
        let f1 = tempdir.path().join("sample_metadata.tsv");

        let io = Io::default();
        io.write_lines(&f1, &lines).unwrap();
        let samples_metadata = SampleGroup::from_file(&f1).unwrap();

        assert_eq!(samples_metadata.samples[0].sample_id, "sample1");
        assert_eq!(samples_metadata.samples[1].sample_id, "sample2");
        assert_eq!(samples_metadata.samples[0].barcode, "GATTACA");
        assert_eq!(samples_metadata.samples[1].barcode, "CATGCTA");
    }

    #[test]
    fn test_reading_from_file_with_empty_lines_at_end() {
        let lines = vec![
            Sample::deserialize_header_line(),
            "sample1\tGATTACA".to_owned(),
            "sample2\tCATGCTA".to_owned(),
            String::new(),
            String::new(),
        ];
        let tempdir = TempDir::new().unwrap();
        let f1 = tempdir.path().join("sample_metadata.tsv");

        let io = Io::default();
        io.write_lines(&f1, &lines).unwrap();
        let samples_metadata = SampleGroup::from_file(&f1).unwrap();

        assert_eq!(samples_metadata.samples[0].sample_id, "sample1");
        assert_eq!(samples_metadata.samples[1].sample_id, "sample2");
        assert_eq!(samples_metadata.samples[0].barcode, "GATTACA");
        assert_eq!(samples_metadata.samples[1].barcode, "CATGCTA");
    }

    // ############################################################################################
    // Test [`SampleGroup::from_file`] - Expected to fail (returned error or panic)
    // ############################################################################################
    #[test]
    fn test_reading_from_file_with_no_header() {
        let lines = vec!["sample1\tGATTACA", "sample2\tCATGCTA"];
        let tempdir = TempDir::new().unwrap();
        let f1 = tempdir.path().join("sample_metadata.tsv");

        let io = Io::default();
        io.write_lines(&f1, &lines).unwrap();
        // Stays `true` unless the error unwraps all the way down to the expected serde message.
        let mut to_panic = true;
        if let fgoxide::FgError::ConversionError(csv_e) = SampleGroup::from_file(&f1).unwrap_err() {
            if let CsvErrorEnum::Deserialize { pos: _, err: csv_de_err } = csv_e.into_kind() {
                if let CsvDeserializeErrorEnum::Message(s) = csv_de_err.kind() {
                    to_panic = false;
                    assert_eq!(s, &SerdeError::missing_field("sample_id").to_string());
                }
            }
        }
        assert!(!to_panic, "Different error type than expected reading from headerless file.");
    }

    #[test]
    #[should_panic(expected = "Must provide one or more sample")]
    fn test_reading_header_only_file() {
        let lines = vec![Sample::deserialize_header_line()];
        let tempdir = TempDir::new().unwrap();
        let f1 = tempdir.path().join("sample_metadata.tsv");

        let io = Io::default();
        io.write_lines(&f1, &lines).unwrap();
        let _sm = SampleGroup::from_file(&f1).unwrap();
    }

    #[test]
    #[should_panic(expected = "Must provide one or more sample")]
    fn test_reading_empty_file() {
        let lines = vec![""];
        let tempdir = TempDir::new().unwrap();
        let f1 = tempdir.path().join("sample_metadata.tsv");

        let io = Io::default();
        io.write_lines(&f1, &lines).unwrap();
        let _sm = SampleGroup::from_file(&f1).unwrap();
    }

    #[test]
    fn test_reading_non_existent_file() {
        let tempdir = TempDir::new().unwrap();
        let f1 = tempdir.path().join("sample_metadata.tsv");
        if let fgoxide::FgError::IoError(e) = SampleGroup::from_file(&f1).unwrap_err() {
            // Compare the error kind rather than the message text: the wording of the message
            // is supplied by the operating system and differs between platforms.
            assert_eq!(e.kind(), std::io::ErrorKind::NotFound);
        } else {
            panic!("Different error than expected reading non-existent file")
        }
    }

    // ############################################################################################
    // Test [`Sample::new`] - Expected to pass
    // ############################################################################################
    #[test]
    fn test_new_sample_success() {
        let name = "s_1_example_name".to_owned();
        let barcode = "GATTACA".to_owned();
        let ordinal = 0;
        let sample = Sample::new(ordinal, name.clone(), barcode.clone());
        assert_eq!(
            Sample { sample_id: name, barcode, ordinal },
            sample,
            "Sample differed from expectation"
        );
    }

    #[test]
    fn test_new_sample_non_agct_bases_in_barcode_allowed() {
        let name = "s_1_example_name".to_owned();
        let barcode = "GATTANN".to_owned();
        let ordinal = 0;
        let _sample = Sample::new(ordinal, name, barcode);
    }

    // ############################################################################################
    // Test [`Sample::new`] - Expected to panic
    // ############################################################################################

    #[test]
    #[should_panic(expected = "Sample name cannot be empty")]
    fn test_new_sample_fail1_empty_sample_name() {
        let name = String::new();
        let barcode = "GATTACA".to_owned();
        let ordinal = 0;
        let _sample = Sample::new(ordinal, name, barcode);
    }

    #[test]
    #[should_panic(expected = "Sample barcode cannot be empty")]
    fn test_new_sample_fail2_empty_barcode() {
        let name = "s_1_example_name".to_owned();
        let barcode = String::new();
        let ordinal = 0;
        let _sample = Sample::new(ordinal, name, barcode);
    }

    // ############################################################################################
    // Test [`SampleGroup::from_samples`] - expected to pass
    // ############################################################################################
    #[test]
    fn test_from_samples_sample_group_pass1_single_sample() {
        let sample1 = Sample::new(0, "sample_1".to_owned(), "GATTACA".to_owned());
        let samples_vec = vec![sample1.clone()];
        let sample_group = SampleGroup::from_samples(&samples_vec);

        assert_eq!(sample_group, SampleGroup { samples: vec![sample1] });
    }

    #[test]
    fn test_from_samples_sample_group_pass2_multi_unique_samples() {
        let sample1 = Sample::new(0, "sample_1".to_owned(), "GATTACA".to_owned());
        let sample2 = Sample::new(1, "sample_2".to_owned(), "CATGGAT".to_owned());
        let samples_vec = vec![sample1.clone(), sample2.clone()];
        let sample_group = SampleGroup::from_samples(&samples_vec);

        assert_eq!(sample_group, SampleGroup { samples: vec![sample1, sample2] });
    }

    #[test]
    fn test_from_samples_sample_group_pass3_ordinal_values_will_be_changed_by_new() {
        let sample1 = Sample::new(0, "sample_1".to_owned(), "GATTACA".to_owned());
        let sample2_before = Sample::new(2, "sample_2".to_owned(), "CATGGAT".to_owned());
        let sample2_after = Sample::new(1, "sample_2".to_owned(), "CATGGAT".to_owned());
        let samples_vec = vec![sample1.clone(), sample2_before];
        let sample_group = SampleGroup::from_samples(&samples_vec);

        assert_eq!(sample_group, SampleGroup { samples: vec![sample1, sample2_after] });
    }

    // ############################################################################################
    // Test [`SampleGroup::from_samples`] - expected to panic
    // ############################################################################################
    #[test]
    #[should_panic(expected = "Must provide one or more sample")]
    fn test_from_samples_sample_group_fail1_no_samples() {
        let samples = vec![];
        let _sample_group = SampleGroup::from_samples(&samples);
    }

    #[test]
    #[should_panic(expected = "Each sample name must be unique, duplicate identified")]
    fn test_from_samples_sample_group_fail2_duplicate_sample_names() {
        // Same name, different barcodes: only the name-uniqueness check can fire.
        let samples = vec![
            Sample::new(0, "sample_1".to_owned(), "GATTACA".to_owned()),
            Sample::new(0, "sample_1".to_owned(), "CATGGAT".to_owned()),
        ];
        let _sample_group = SampleGroup::from_samples(&samples);
    }

    #[test]
    #[should_panic(expected = "Each sample barcode must be unique, duplicate identified")]
    fn test_from_samples_sample_group_fail3_duplicate_barcodes() {
        // Different names, same barcode: only the barcode-uniqueness check can fire.
        let samples = vec![
            Sample::new(0, "sample_1".to_owned(), "GATTACA".to_owned()),
            Sample::new(0, "sample_2".to_owned(), "GATTACA".to_owned()),
        ];
        let _sample_group = SampleGroup::from_samples(&samples);
    }

    #[test]
    #[should_panic(expected = "All barcodes must have the same length")]
    fn test_from_samples_sample_group_fail4_barcodes_of_different_lengths() {
        let samples = vec![
            Sample::new(0, "sample_1".to_owned(), "GATTACA".to_owned()),
            Sample::new(0, "sample_2".to_owned(), "CATGGA".to_owned()),
        ];
        let _sample_group = SampleGroup::from_samples(&samples);
    }
}