fqtk_lib/
barcode_matching.rs

1use super::byte_is_nocall;
2use ahash::HashMap as AHashMap;
3use ahash::HashMapExt;
4use bstr::{BString, ByteSlice};
5
6const STARTING_CACHE_SIZE: usize = 1_000_000;
7
8/// The struct that contains the info related to the best and next best sample barcode match.
9#[derive(Copy, Clone, Debug, PartialEq, Eq)]
10pub struct BarcodeMatch {
11    /// Index of the best bardcode match in the corresponding BarcodeMatcher struct that generated
12    /// this match.
13    pub best_match: usize,
14    /// The number of mismatches to the best matching barcode for the read described by this match.
15    pub best_mismatches: u8,
16    /// The number of mismatches to the second best matching barcode for the read described by this
17    /// match
18    pub next_best_mismatches: u8,
19}
20
21/// The struct responsible for matching barcodes to a ``Vec`` of sample barcodes.
22#[derive(Clone, Debug)]
23pub struct BarcodeMatcher {
24    /// Vec of the barcodes for each sample
25    /// Note - this is to be replaced by a sample struct in task 3. For now we're keeping things
26    /// very simple.
27    sample_barcodes: Vec<BString>,
28    /// The maxium number of Ns in any barcode in set of sample barcodes
29    max_ns_in_barcodes: usize,
30    /// The maximum mismatches to match a sample barcode.
31    max_mismatches: u8,
32    /// The minimum difference between number of mismatches in the best and second best barcodes
33    /// for a barcode to be considered a match.
34    min_mismatch_delta: u8,
35    /// If true will attempt to use the cache when matching.
36    use_cache: bool,
37    /// Caching struct for storing results of previous matches
38    cache: AHashMap<Vec<u8>, BarcodeMatch>,
39}
40
41impl BarcodeMatcher {
42    /// Instantiates a new ``BarcodeMatcher`` struct. Checks that the sample barcodes vector is not
43    /// empty and that none of the barcodes provided are the empty string.
44    ///
45    /// # Panics
46    /// - Will panic if provided an empty vec of sample barcodes.
47    /// - Will panic if any provided barcode is length zero.
48    #[must_use]
49    pub fn new(
50        sample_barcodes: &[&str],
51        max_mismatches: u8,
52        min_mismatch_delta: u8,
53        use_cache: bool,
54    ) -> Self {
55        assert!(!sample_barcodes.is_empty(), "Must provide at least one sample barcode");
56        assert!(
57            sample_barcodes.iter().all(|b| !b.is_empty()),
58            "Sample barcode cannot be empty string"
59        );
60
61        let mut max_ns_in_barcodes = 0;
62        let modified_sample_barcodes = sample_barcodes
63            .iter()
64            .map(|barcode| {
65                let barcode = BString::from(barcode.to_ascii_uppercase());
66                let num_ns = barcode.iter().filter(|&b| byte_is_nocall(*b)).count();
67                max_ns_in_barcodes = max_ns_in_barcodes.max(num_ns);
68                barcode
69            })
70            .collect::<Vec<_>>();
71
72        Self {
73            sample_barcodes: modified_sample_barcodes,
74            max_ns_in_barcodes,
75            max_mismatches,
76            min_mismatch_delta,
77            use_cache,
78            cache: AHashMap::with_capacity(STARTING_CACHE_SIZE),
79        }
80    }
81
82    /// Counts the number of bases that differ between two byte arrays.
83    fn count_mismatches(observed_bases: &[u8], expected_bases: &[u8]) -> u8 {
84        assert_eq!(
85            observed_bases.len(),
86            expected_bases.len(),
87            "observed_bases: {}, expected_bases: {}",
88            observed_bases.len(),
89            expected_bases.len()
90        );
91        let mut count: usize = 0;
92        for (&expected_base, &observed_base) in expected_bases.iter().zip(observed_bases.iter()) {
93            if !byte_is_nocall(expected_base) && expected_base != observed_base {
94                count += 1;
95            }
96        }
97        u8::try_from(count).expect("Overflow on number of mismatch bases")
98    }
99
100    /// Returns the expected barcode length, assuming a fixed length for all samples.
101    fn expected_barcode_length(&self) -> usize {
102        self.sample_barcodes[0].len()
103    }
104
105    /// Assigns the barcode that best matches the provided ``read_bases``.
106    #[must_use]
107    fn assign_internal(&self, read_bases: &[u8]) -> Option<BarcodeMatch> {
108        let mut best_barcode_index = self.sample_barcodes.len();
109        let mut best_mismatches = 255u8;
110        let mut next_best_mismatches = 255u8;
111        for (index, sample_barcode) in self.sample_barcodes.iter().enumerate() {
112            let mismatches = Self::count_mismatches(read_bases, sample_barcode.as_bstr());
113
114            if mismatches < best_mismatches {
115                next_best_mismatches = best_mismatches;
116                best_mismatches = mismatches;
117                best_barcode_index = index;
118            } else if mismatches < next_best_mismatches {
119                next_best_mismatches = mismatches;
120            }
121        }
122
123        if best_mismatches > self.max_mismatches
124            || (next_best_mismatches - best_mismatches) < self.min_mismatch_delta
125        {
126            None
127        } else {
128            Some(BarcodeMatch {
129                best_match: best_barcode_index,
130                best_mismatches,
131                next_best_mismatches,
132            })
133        }
134    }
135
136    /// Assigns the barcode that best matches the provided ``read_bases``, using internal caching
137    /// if configured to do so and skipping calculation for reads that cannot match any barcode (
138    /// due to having too many no-called bases).
139    pub fn assign(&mut self, read_bases: &[u8]) -> Option<BarcodeMatch> {
140        // do not try matching if there are not enough bases
141        if read_bases.len() < self.expected_barcode_length() {
142            return None;
143        }
144        let num_no_calls = read_bases.iter().filter(|&&b| byte_is_nocall(b)).count();
145        if num_no_calls > (self.max_mismatches as usize) + self.max_ns_in_barcodes {
146            None
147        } else if self.use_cache {
148            if let Some(cached_match) = self.cache.get(read_bases) {
149                Some(*cached_match)
150            } else {
151                let maybe_match = self.assign_internal(read_bases);
152                if let Some(internal_val) = maybe_match {
153                    self.cache.insert(read_bases.to_vec(), internal_val);
154                };
155                maybe_match
156            }
157        } else {
158            self.assign_internal(read_bases)
159        }
160    }
161}
162
163#[cfg(test)]
164mod tests {
165    use super::*;
166    use rstest::rstest;
167
168    // ############################################################################################
169    // Test ``BarcodeMatcher`` instantiation panics.
170    // ############################################################################################
171    #[rstest]
172    #[case(true)]
173    #[case(false)]
174    fn test_barcode_matcher_instantiation_can_succeed(#[case] use_cache: bool) {
175        let sample_barcodes = vec!["AGCT"];
176        let _matcher = BarcodeMatcher::new(&sample_barcodes, 2, 1, use_cache);
177    }
178
179    #[rstest]
180    #[case(true)]
181    #[case(false)]
182    #[should_panic(expected = "Must provide at least one sample barcode")]
183    fn test_barcode_matcher_fails_if_no_sample_barcodes_provided(#[case] use_cache: bool) {
184        let sample_barcodes: Vec<&str> = vec![];
185        let _matcher = BarcodeMatcher::new(&sample_barcodes, 2, 1, use_cache);
186    }
187
188    #[rstest]
189    #[case(true)]
190    #[case(false)]
191    #[should_panic(expected = "Sample barcode cannot be empty string")]
192    fn test_barcode_matcher_fails_if_empty_sample_barcode_provided(#[case] use_cache: bool) {
193        let sample_barcodes = vec!["AGCT", ""];
194        let _matcher = BarcodeMatcher::new(&sample_barcodes, 2, 1, use_cache);
195    }
196
197    // ############################################################################################
198    // Test BarcodeMatcher::count_mismatches
199    // ############################################################################################
200
201    // Thought process behind not panicking on empty string is:
202    //   1. sample barcodes are checked to see if they are empty and sample vs read barcodes are
203    //      compared for length, so empty string will fail anyway
204    //   2. the fewer operations / better optimization in this core matching function the better.
205    #[test]
206    fn empty_string_can_run_in_count_mismatches() {
207        assert_eq!(BarcodeMatcher::count_mismatches("".as_bytes(), "".as_bytes()), 0);
208    }
209
210    #[test]
211    fn find_no_mismatches() {
212        assert_eq!(BarcodeMatcher::count_mismatches("GATTACA".as_bytes(), "GATTACA".as_bytes()), 0,);
213    }
214
215    #[test]
216    fn ns_in_expected_barcode_dont_contribute_to_mismatch_counter() {
217        assert_eq!(BarcodeMatcher::count_mismatches("GATTACA".as_bytes(), "GANNACA".as_bytes()), 0,);
218    }
219
220    #[test]
221    fn all_ns_barcode_have_no_mismatches() {
222        assert_eq!(BarcodeMatcher::count_mismatches("GANNACA".as_bytes(), "NNNNNNN".as_bytes()), 0,);
223    }
224
225    #[test]
226    fn find_two_mismatches() {
227        assert_eq!(BarcodeMatcher::count_mismatches("GATTACA".as_bytes(), "GACCACA".as_bytes()), 2,);
228    }
229
230    #[test]
231    fn not_count_no_calls() {
232        assert_eq!(BarcodeMatcher::count_mismatches("GATTACA".as_bytes(), "GANNACA".as_bytes()), 0,);
233    }
234
235    #[test]
236    fn find_compare_two_sequences_that_have_all_mismatches() {
237        assert_eq!(BarcodeMatcher::count_mismatches("GATTACA".as_bytes(), "CTAATGT".as_bytes()), 7,);
238    }
239
240    #[test]
241    #[should_panic(expected = "observed_bases: 5, expected_bases: 6")]
242    fn find_compare_two_sequences_of_different_length() {
243        let _mismatches = BarcodeMatcher::count_mismatches("GATTA".as_bytes(), "CTATGT".as_bytes());
244    }
245
246    // ############################################################################################
247    // Test BarcodeMatcher::assign
248    // ############################################################################################
249    // Some returns
250    #[rstest]
251    #[case(true)]
252    #[case(false)]
253    fn test_assign_exact_match(#[case] use_cache: bool) {
254        const EXPECTED_BARCODE_INDEX: usize = 0;
255        let sample_barcodes = vec!["ACGT", "AAAG", "CACA"];
256        let mut matcher = BarcodeMatcher::new(&sample_barcodes, 2, 2, use_cache);
257        assert_eq!(
258            matcher.assign(sample_barcodes[EXPECTED_BARCODE_INDEX].as_bytes()),
259            Some(BarcodeMatch {
260                best_match: EXPECTED_BARCODE_INDEX,
261                best_mismatches: 0,
262                next_best_mismatches: 3,
263            }),
264        );
265    }
266
267    #[rstest]
268    #[case(true)]
269    #[case(false)]
270    fn test_assign_imprecise_match(#[case] use_cache: bool) {
271        let sample_barcodes = vec!["AAAT", "AGAG", "CACA"];
272        let mut matcher = BarcodeMatcher::new(&sample_barcodes, 2, 2, use_cache);
273        //                          1 different base
274        //                          |
275        //                          v
276        let test_barcode: &[u8] = b"GAAT";
277        let expected = BarcodeMatch { best_match: 0, best_mismatches: 1, next_best_mismatches: 3 };
278        assert_eq!(matcher.assign(test_barcode), Some(expected));
279    }
280
281    #[rstest]
282    #[case(true)]
283    #[case(false)]
284    fn test_assign_precise_match_with_no_call(#[case] use_cache: bool) {
285        let sample_barcodes = vec!["AAAT", "AGAG", "CACA"];
286        let mut matcher = BarcodeMatcher::new(&sample_barcodes, 2, 2, use_cache);
287        //                             1 no-call
288        //                             |
289        //                             v
290        let test_barcode: &[u8; 4] = b"NAAT";
291        let expected = BarcodeMatch { best_match: 0, best_mismatches: 1, next_best_mismatches: 3 };
292        assert_eq!(matcher.assign(test_barcode), Some(expected));
293    }
294
295    #[rstest]
296    #[case(true)]
297    #[case(false)]
298    fn test_assign_imprecise_match_with_no_call(#[case] use_cache: bool) {
299        let sample_barcodes = vec!["AAATTT", "AGAGGG", "CACAGG"];
300        let mut matcher = BarcodeMatcher::new(&sample_barcodes, 2, 2, use_cache);
301        //                             1 no-call
302        //                             |
303        //                             | 1 different base
304        //                             | |
305        //                             v v
306        let test_barcode: &[u8; 6] = b"NAGTTT";
307        let expected = BarcodeMatch { best_match: 0, best_mismatches: 2, next_best_mismatches: 5 };
308        assert_eq!(matcher.assign(test_barcode), Some(expected));
309    }
310
311    #[rstest]
312    #[case(true)]
313    #[case(false)]
314    fn test_sample_no_call_doesnt_contribute_to_mismatch_number(#[case] use_cache: bool) {
315        let sample_barcodes = vec!["NAGTTT", "AGAGGG", "CACAGG"];
316        let mut matcher = BarcodeMatcher::new(&sample_barcodes, 1, 2, use_cache);
317        //                             1 no-call
318        //                             |
319        //                             | 1 different base
320        //                             | |
321        //                             v v
322        let test_barcode: &[u8; 6] = b"AAATTT";
323        let expected = BarcodeMatch { best_match: 0, best_mismatches: 1, next_best_mismatches: 4 };
324        assert_eq!(matcher.assign(test_barcode), Some(expected));
325    }
326
327    // None returns
328    #[rstest]
329    #[case(true)]
330    #[case(false)]
331    fn test_read_no_call_contributes_to_mismatch_number(#[case] use_cache: bool) {
332        let sample_barcodes = vec!["AAATTT", "AGAGGG", "CACAGG"];
333        let mut matcher = BarcodeMatcher::new(&sample_barcodes, 1, 2, use_cache);
334        //                             1 no-call
335        //                             |
336        //                             | 1 different base
337        //                             | |
338        //                             v v
339        let test_barcode: &[u8; 6] = b"NAGTTT";
340        assert_eq!(matcher.assign(test_barcode), None);
341    }
342
343    #[rstest]
344    #[case(true)]
345    #[case(false)]
346    fn test_produce_no_match_if_too_many_mismatches(#[case] use_cache: bool) {
347        let sample_barcodes = vec!["AAGCTAG", "CAGCTAG", "GAGCTAG", "TAGCTAG"];
348        let assignment_barcode: &[u8] = b"ATCGATC";
349        let mut matcher = BarcodeMatcher::new(&sample_barcodes, 0, 100, use_cache);
350        assert_eq!(matcher.assign(assignment_barcode), None);
351    }
352
353    #[rstest]
354    #[case(true)]
355    #[case(false)]
356    fn test_produce_no_match_if_within_mismatch_delta(#[case] use_cache: bool) {
357        let sample_barcodes = vec!["AAAAAAAA", "CCCCCCCC", "GGGGGGGG", "GGGGGGTT"];
358        let assignment_barcode: &[u8] = sample_barcodes[3].as_bytes();
359        let mut matcher = BarcodeMatcher::new(&sample_barcodes, 100, 3, use_cache);
360        assert_eq!(matcher.assign(assignment_barcode), None);
361    }
362
363    #[rstest]
364    #[case(true)]
365    #[case(false)]
366    fn test_produce_no_match_if_too_many_mismatches_via_nocalls(#[case] use_cache: bool) {
367        let sample_barcodes = vec!["AAAAAAAA", "CCCCCCCC", "GGGGGGGG", "GGGGGGTT"];
368        let assignment_barcode: &[u8] = b"GGGGGGTN";
369        let mut matcher = BarcodeMatcher::new(&sample_barcodes, 0, 100, use_cache);
370        assert_eq!(matcher.assign(assignment_barcode), None);
371    }
372}