1use super::byte_is_nocall;
2use ahash::HashMap as AHashMap;
3use ahash::HashMapExt;
4use bstr::{BString, ByteSlice};
5
6const STARTING_CACHE_SIZE: usize = 1_000_000;
7
8#[derive(Copy, Clone, Debug, PartialEq, Eq)]
10pub struct BarcodeMatch {
11 pub best_match: usize,
14 pub best_mismatches: u8,
16 pub next_best_mismatches: u8,
19}
20
21#[derive(Clone, Debug)]
23pub struct BarcodeMatcher {
24 sample_barcodes: Vec<BString>,
28 max_ns_in_barcodes: usize,
30 max_mismatches: u8,
32 min_mismatch_delta: u8,
35 use_cache: bool,
37 cache: AHashMap<Vec<u8>, BarcodeMatch>,
39}
40
41impl BarcodeMatcher {
42 #[must_use]
49 pub fn new(
50 sample_barcodes: &[&str],
51 max_mismatches: u8,
52 min_mismatch_delta: u8,
53 use_cache: bool,
54 ) -> Self {
55 assert!(!sample_barcodes.is_empty(), "Must provide at least one sample barcode");
56 assert!(
57 sample_barcodes.iter().all(|b| !b.is_empty()),
58 "Sample barcode cannot be empty string"
59 );
60
61 let mut max_ns_in_barcodes = 0;
62 let modified_sample_barcodes = sample_barcodes
63 .iter()
64 .map(|barcode| {
65 let barcode = BString::from(barcode.to_ascii_uppercase());
66 let num_ns = barcode.iter().filter(|&b| byte_is_nocall(*b)).count();
67 max_ns_in_barcodes = max_ns_in_barcodes.max(num_ns);
68 barcode
69 })
70 .collect::<Vec<_>>();
71
72 Self {
73 sample_barcodes: modified_sample_barcodes,
74 max_ns_in_barcodes,
75 max_mismatches,
76 min_mismatch_delta,
77 use_cache,
78 cache: AHashMap::with_capacity(STARTING_CACHE_SIZE),
79 }
80 }
81
82 fn count_mismatches(observed_bases: &[u8], expected_bases: &[u8]) -> u8 {
84 assert_eq!(
85 observed_bases.len(),
86 expected_bases.len(),
87 "observed_bases: {}, expected_bases: {}",
88 observed_bases.len(),
89 expected_bases.len()
90 );
91 let mut count: usize = 0;
92 for (&expected_base, &observed_base) in expected_bases.iter().zip(observed_bases.iter()) {
93 if !byte_is_nocall(expected_base) && expected_base != observed_base {
94 count += 1;
95 }
96 }
97 u8::try_from(count).expect("Overflow on number of mismatch bases")
98 }
99
100 fn expected_barcode_length(&self) -> usize {
102 self.sample_barcodes[0].len()
103 }
104
105 #[must_use]
107 fn assign_internal(&self, read_bases: &[u8]) -> Option<BarcodeMatch> {
108 let mut best_barcode_index = self.sample_barcodes.len();
109 let mut best_mismatches = 255u8;
110 let mut next_best_mismatches = 255u8;
111 for (index, sample_barcode) in self.sample_barcodes.iter().enumerate() {
112 let mismatches = Self::count_mismatches(read_bases, sample_barcode.as_bstr());
113
114 if mismatches < best_mismatches {
115 next_best_mismatches = best_mismatches;
116 best_mismatches = mismatches;
117 best_barcode_index = index;
118 } else if mismatches < next_best_mismatches {
119 next_best_mismatches = mismatches;
120 }
121 }
122
123 if best_mismatches > self.max_mismatches
124 || (next_best_mismatches - best_mismatches) < self.min_mismatch_delta
125 {
126 None
127 } else {
128 Some(BarcodeMatch {
129 best_match: best_barcode_index,
130 best_mismatches,
131 next_best_mismatches,
132 })
133 }
134 }
135
136 pub fn assign(&mut self, read_bases: &[u8]) -> Option<BarcodeMatch> {
140 if read_bases.len() < self.expected_barcode_length() {
142 return None;
143 }
144 let num_no_calls = read_bases.iter().filter(|&&b| byte_is_nocall(b)).count();
145 if num_no_calls > (self.max_mismatches as usize) + self.max_ns_in_barcodes {
146 None
147 } else if self.use_cache {
148 if let Some(cached_match) = self.cache.get(read_bases) {
149 Some(*cached_match)
150 } else {
151 let maybe_match = self.assign_internal(read_bases);
152 if let Some(internal_val) = maybe_match {
153 self.cache.insert(read_bases.to_vec(), internal_val);
154 };
155 maybe_match
156 }
157 } else {
158 self.assign_internal(read_bases)
159 }
160 }
161}
162
163#[cfg(test)]
164mod tests {
165 use super::*;
166 use rstest::rstest;
167
168 #[rstest]
172 #[case(true)]
173 #[case(false)]
174 fn test_barcode_matcher_instantiation_can_succeed(#[case] use_cache: bool) {
175 let sample_barcodes = vec!["AGCT"];
176 let _matcher = BarcodeMatcher::new(&sample_barcodes, 2, 1, use_cache);
177 }
178
179 #[rstest]
180 #[case(true)]
181 #[case(false)]
182 #[should_panic(expected = "Must provide at least one sample barcode")]
183 fn test_barcode_matcher_fails_if_no_sample_barcodes_provided(#[case] use_cache: bool) {
184 let sample_barcodes: Vec<&str> = vec![];
185 let _matcher = BarcodeMatcher::new(&sample_barcodes, 2, 1, use_cache);
186 }
187
188 #[rstest]
189 #[case(true)]
190 #[case(false)]
191 #[should_panic(expected = "Sample barcode cannot be empty string")]
192 fn test_barcode_matcher_fails_if_empty_sample_barcode_provided(#[case] use_cache: bool) {
193 let sample_barcodes = vec!["AGCT", ""];
194 let _matcher = BarcodeMatcher::new(&sample_barcodes, 2, 1, use_cache);
195 }
196
197 #[test]
206 fn empty_string_can_run_in_count_mismatches() {
207 assert_eq!(BarcodeMatcher::count_mismatches("".as_bytes(), "".as_bytes()), 0);
208 }
209
210 #[test]
211 fn find_no_mismatches() {
212 assert_eq!(BarcodeMatcher::count_mismatches("GATTACA".as_bytes(), "GATTACA".as_bytes()), 0,);
213 }
214
215 #[test]
216 fn ns_in_expected_barcode_dont_contribute_to_mismatch_counter() {
217 assert_eq!(BarcodeMatcher::count_mismatches("GATTACA".as_bytes(), "GANNACA".as_bytes()), 0,);
218 }
219
220 #[test]
221 fn all_ns_barcode_have_no_mismatches() {
222 assert_eq!(BarcodeMatcher::count_mismatches("GANNACA".as_bytes(), "NNNNNNN".as_bytes()), 0,);
223 }
224
225 #[test]
226 fn find_two_mismatches() {
227 assert_eq!(BarcodeMatcher::count_mismatches("GATTACA".as_bytes(), "GACCACA".as_bytes()), 2,);
228 }
229
230 #[test]
231 fn not_count_no_calls() {
232 assert_eq!(BarcodeMatcher::count_mismatches("GATTACA".as_bytes(), "GANNACA".as_bytes()), 0,);
233 }
234
235 #[test]
236 fn find_compare_two_sequences_that_have_all_mismatches() {
237 assert_eq!(BarcodeMatcher::count_mismatches("GATTACA".as_bytes(), "CTAATGT".as_bytes()), 7,);
238 }
239
240 #[test]
241 #[should_panic(expected = "observed_bases: 5, expected_bases: 6")]
242 fn find_compare_two_sequences_of_different_length() {
243 let _mismatches = BarcodeMatcher::count_mismatches("GATTA".as_bytes(), "CTATGT".as_bytes());
244 }
245
246 #[rstest]
251 #[case(true)]
252 #[case(false)]
253 fn test_assign_exact_match(#[case] use_cache: bool) {
254 const EXPECTED_BARCODE_INDEX: usize = 0;
255 let sample_barcodes = vec!["ACGT", "AAAG", "CACA"];
256 let mut matcher = BarcodeMatcher::new(&sample_barcodes, 2, 2, use_cache);
257 assert_eq!(
258 matcher.assign(sample_barcodes[EXPECTED_BARCODE_INDEX].as_bytes()),
259 Some(BarcodeMatch {
260 best_match: EXPECTED_BARCODE_INDEX,
261 best_mismatches: 0,
262 next_best_mismatches: 3,
263 }),
264 );
265 }
266
267 #[rstest]
268 #[case(true)]
269 #[case(false)]
270 fn test_assign_imprecise_match(#[case] use_cache: bool) {
271 let sample_barcodes = vec!["AAAT", "AGAG", "CACA"];
272 let mut matcher = BarcodeMatcher::new(&sample_barcodes, 2, 2, use_cache);
273 let test_barcode: &[u8] = b"GAAT";
277 let expected = BarcodeMatch { best_match: 0, best_mismatches: 1, next_best_mismatches: 3 };
278 assert_eq!(matcher.assign(test_barcode), Some(expected));
279 }
280
281 #[rstest]
282 #[case(true)]
283 #[case(false)]
284 fn test_assign_precise_match_with_no_call(#[case] use_cache: bool) {
285 let sample_barcodes = vec!["AAAT", "AGAG", "CACA"];
286 let mut matcher = BarcodeMatcher::new(&sample_barcodes, 2, 2, use_cache);
287 let test_barcode: &[u8; 4] = b"NAAT";
291 let expected = BarcodeMatch { best_match: 0, best_mismatches: 1, next_best_mismatches: 3 };
292 assert_eq!(matcher.assign(test_barcode), Some(expected));
293 }
294
295 #[rstest]
296 #[case(true)]
297 #[case(false)]
298 fn test_assign_imprecise_match_with_no_call(#[case] use_cache: bool) {
299 let sample_barcodes = vec!["AAATTT", "AGAGGG", "CACAGG"];
300 let mut matcher = BarcodeMatcher::new(&sample_barcodes, 2, 2, use_cache);
301 let test_barcode: &[u8; 6] = b"NAGTTT";
307 let expected = BarcodeMatch { best_match: 0, best_mismatches: 2, next_best_mismatches: 5 };
308 assert_eq!(matcher.assign(test_barcode), Some(expected));
309 }
310
311 #[rstest]
312 #[case(true)]
313 #[case(false)]
314 fn test_sample_no_call_doesnt_contribute_to_mismatch_number(#[case] use_cache: bool) {
315 let sample_barcodes = vec!["NAGTTT", "AGAGGG", "CACAGG"];
316 let mut matcher = BarcodeMatcher::new(&sample_barcodes, 1, 2, use_cache);
317 let test_barcode: &[u8; 6] = b"AAATTT";
323 let expected = BarcodeMatch { best_match: 0, best_mismatches: 1, next_best_mismatches: 4 };
324 assert_eq!(matcher.assign(test_barcode), Some(expected));
325 }
326
327 #[rstest]
329 #[case(true)]
330 #[case(false)]
331 fn test_read_no_call_contributes_to_mismatch_number(#[case] use_cache: bool) {
332 let sample_barcodes = vec!["AAATTT", "AGAGGG", "CACAGG"];
333 let mut matcher = BarcodeMatcher::new(&sample_barcodes, 1, 2, use_cache);
334 let test_barcode: &[u8; 6] = b"NAGTTT";
340 assert_eq!(matcher.assign(test_barcode), None);
341 }
342
343 #[rstest]
344 #[case(true)]
345 #[case(false)]
346 fn test_produce_no_match_if_too_many_mismatches(#[case] use_cache: bool) {
347 let sample_barcodes = vec!["AAGCTAG", "CAGCTAG", "GAGCTAG", "TAGCTAG"];
348 let assignment_barcode: &[u8] = b"ATCGATC";
349 let mut matcher = BarcodeMatcher::new(&sample_barcodes, 0, 100, use_cache);
350 assert_eq!(matcher.assign(assignment_barcode), None);
351 }
352
353 #[rstest]
354 #[case(true)]
355 #[case(false)]
356 fn test_produce_no_match_if_within_mismatch_delta(#[case] use_cache: bool) {
357 let sample_barcodes = vec!["AAAAAAAA", "CCCCCCCC", "GGGGGGGG", "GGGGGGTT"];
358 let assignment_barcode: &[u8] = sample_barcodes[3].as_bytes();
359 let mut matcher = BarcodeMatcher::new(&sample_barcodes, 100, 3, use_cache);
360 assert_eq!(matcher.assign(assignment_barcode), None);
361 }
362
363 #[rstest]
364 #[case(true)]
365 #[case(false)]
366 fn test_produce_no_match_if_too_many_mismatches_via_nocalls(#[case] use_cache: bool) {
367 let sample_barcodes = vec!["AAAAAAAA", "CCCCCCCC", "GGGGGGGG", "GGGGGGTT"];
368 let assignment_barcode: &[u8] = b"GGGGGGTN";
369 let mut matcher = BarcodeMatcher::new(&sample_barcodes, 0, 100, use_cache);
370 assert_eq!(matcher.assign(assignment_barcode), None);
371 }
372}