1use core::ops::Range;
2use std::convert::{From, TryFrom};
3use std::iter::{FromIterator, IntoIterator};
4
5use rand::rngs::SmallRng;
6use rand::{RngCore, SeedableRng};
7
8#[derive(Copy, Clone, Debug, PartialEq, Eq)]
10pub enum NucleotideConversionError {
11 InvalidBase(u8),
13 AmbiguousBase,
15}
16
17impl std::fmt::Display for NucleotideConversionError {
18 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
19 write!(f, "{:?}", self)
20 }
21}
22
23impl std::error::Error for NucleotideConversionError {}
24
25const NCBI8NA_TO_IUPACNA: [u8; 16] = [
26 u8::MAX,
27 b'A',
28 b'C',
29 b'M',
30 b'G',
31 b'R',
32 b'S',
33 b'V',
34 b'T',
35 b'W',
36 b'Y',
37 b'H',
38 b'K',
39 b'D',
40 b'B',
41 b'N',
42];
43
44const fn generate_iupacna_to_ncbi8na() -> [u8; 256] {
45 let mut table = [u8::MAX; 256];
46 let mut i = 1;
47 while i < NCBI8NA_TO_IUPACNA.len() {
48 table[NCBI8NA_TO_IUPACNA[i] as usize] = i as u8;
49 table[NCBI8NA_TO_IUPACNA[i].to_ascii_lowercase() as usize] = i as u8;
50 i += 1;
51 }
52 table
53}
54const IUPACNA_TO_NCBI8NA: [u8; 256] = generate_iupacna_to_ncbi8na();
55
56const fn generate_ncbi8na_to_ncbi2na() -> [u8; 16] {
57 let mut table = [u8::MAX; 16];
58 let mut i = 0usize;
59 while i < 4 {
60 table[1 << i] = i as u8;
61 i += 1;
62 }
63 table
64}
65const NCBI8NA_TO_NCBI2NA: [u8; 16] = generate_ncbi8na_to_ncbi2na();
69
70#[repr(transparent)]
76#[derive(PartialEq, Eq, Copy, Clone, Debug)]
77pub struct Ncbi8naBase(u8);
78
79impl std::fmt::Display for Ncbi8naBase {
80 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
81 write!(f, "{}", self.0)
82 }
83}
84
85impl TryFrom<u8> for Ncbi8naBase {
87 type Error = NucleotideConversionError;
88
89 #[inline]
90 fn try_from(value: u8) -> Result<Ncbi8naBase, Self::Error> {
91 let b = IUPACNA_TO_NCBI8NA[value as usize];
92 if b != u8::MAX {
93 Ok(Ncbi8naBase(b))
94 } else {
95 Err(NucleotideConversionError::InvalidBase(value))
96 }
97 }
98}
99
100impl From<Ncbi8naBase> for u8 {
102 #[inline]
103 fn from(value: Ncbi8naBase) -> Self {
104 NCBI8NA_TO_IUPACNA[value.0 as usize]
105 }
106}
107
108impl From<Ncbi8naBase> for char {
109 #[inline]
110 fn from(value: Ncbi8naBase) -> Self {
111 char::from(u8::from(value))
112 }
113}
114
115#[repr(transparent)]
119#[derive(PartialEq, Eq, Copy, Clone, Debug)]
120pub struct Ncbi2naBase(u8);
121
122impl TryFrom<Ncbi8naBase> for Ncbi2naBase {
123 type Error = NucleotideConversionError;
124
125 #[inline]
126 fn try_from(value: Ncbi8naBase) -> Result<Self, Self::Error> {
127 let b2 = NCBI8NA_TO_NCBI2NA[value.0 as usize];
128 if b2 != u8::MAX {
129 Ok(Ncbi2naBase(b2))
130 } else {
131 Err(NucleotideConversionError::AmbiguousBase)
132 }
133 }
134}
135
136impl From<Ncbi2naBase> for Ncbi8naBase {
137 #[inline]
138 fn from(value: Ncbi2naBase) -> Self {
139 Ncbi8naBase(1 << value.0)
140 }
141}
142
143impl From<Ncbi2naBase> for u8 {
144 #[inline]
145 fn from(value: Ncbi2naBase) -> Self {
146 u8::from(Ncbi8naBase::from(value))
147 }
148}
149
150impl From<Ncbi2naBase> for char {
151 #[inline]
152 fn from(value: Ncbi2naBase) -> Self {
153 char::from(Ncbi8naBase::from(value))
154 }
155}
156
157#[repr(transparent)]
161#[derive(PartialEq, Eq, Copy, Clone, Debug)]
162pub struct Blast8naBase(u8);
163
164const NCBI8NA_TO_BLAST8NA: [u8; 16] = [
165 15, 0, 1, 6, 2, 4, 9, 13, 3, 8, 5, 12, 7, 11, 10, 14, ];
182const fn generate_blastna8_to_ncbi8na() -> [u8; 16] {
183 let mut table = [0u8; 16];
184 let mut i = 0usize;
185 while i < table.len() {
186 table[NCBI8NA_TO_BLAST8NA[i] as usize] = i as u8;
187 i += 1;
188 }
189 table
190}
191const BLASTNA8_TO_NCBI8NA: [u8; 16] = generate_blastna8_to_ncbi8na();
192
193impl From<Ncbi8naBase> for Blast8naBase {
194 fn from(value: Ncbi8naBase) -> Self {
195 Blast8naBase(NCBI8NA_TO_BLAST8NA[value.0 as usize])
196 }
197}
198
199impl From<Blast8naBase> for Ncbi8naBase {
200 fn from(value: Blast8naBase) -> Self {
201 Ncbi8naBase(BLASTNA8_TO_NCBI8NA[value.0 as usize])
202 }
203}
204
205const OLD_MAX_REGION_LEN: usize = 0xf;
206const OLD_MAX_OFFSET: usize = 0xffffff;
207const NEW_MAX_REGION_LEN: usize = 0xfff;
208
209#[derive(Clone, Debug)]
211struct AmbiguousRegion {
212 range: Range<usize>,
213 base: Ncbi8naBase,
214}
215
216impl AmbiguousRegion {
217 fn to_old_format(&self) -> u32 {
218 debug_assert!(self.range.start <= OLD_MAX_OFFSET);
219 debug_assert!(self.range.len() <= OLD_MAX_REGION_LEN);
220 self.range.start as u32 | ((self.range.len() as u32) << 24) | ((self.base.0 as u32) << 28)
221 }
222
223 fn from_old_format(w: u32) -> Self {
224 let start = w as usize & OLD_MAX_OFFSET;
225 let end = start + ((w as usize >> 24) & OLD_MAX_REGION_LEN);
226 AmbiguousRegion {
227 range: start..end,
228 base: Ncbi8naBase((w >> 28) as u8),
229 }
230 }
231
232 fn to_new_format(&self) -> u64 {
233 debug_assert!(self.range.start <= u32::MAX as usize);
234 debug_assert!(self.range.len() <= NEW_MAX_REGION_LEN);
235 self.range.start as u64 | ((self.range.len() as u64) << 48) | ((self.base.0 as u64) << 60)
236 }
237
238 fn from_new_format(w0: u32, w1: u32) -> Self {
239 let start = w1 as usize;
240 let end = start + ((w0 as usize >> 16) & NEW_MAX_REGION_LEN);
241 AmbiguousRegion {
242 range: start..end,
243 base: Ncbi8naBase((w0 >> 28) as u8),
244 }
245 }
246
247 fn end_region(len: usize) -> Self {
248 AmbiguousRegion {
249 range: len..len,
250 base: Ncbi8naBase(0xf),
251 }
252 }
253}
254
255struct AmbiguityEncoder {
265 regions: Vec<AmbiguousRegion>,
266 offset: usize,
267 longest_region: usize,
268 rng: SmallRng,
269}
270
271impl AmbiguityEncoder {
272 pub fn new(size_hint: usize) -> Self {
273 AmbiguityEncoder {
274 regions: vec![],
275 offset: 0,
276 longest_region: 0,
277 rng: SmallRng::seed_from_u64(size_hint as u64),
278 }
279 }
280
281 pub fn add(&mut self, base: Ncbi8naBase) -> Ncbi2naBase {
282 let mut base2 = NCBI8NA_TO_NCBI2NA[base.0 as usize];
283 if base2 == u8::MAX {
284 base2 = self.add_ambiguous(base);
285 }
286 self.offset += 1;
287 Ncbi2naBase(base2)
288 }
289
290 fn add_ambiguous(&mut self, base: Ncbi8naBase) -> u8 {
291 if let Some(last) = self
292 .regions
293 .last_mut()
294 .filter(|last| last.range.end == self.offset && last.base == base)
295 {
296 last.range.end += 1;
297 self.longest_region = std::cmp::max(self.longest_region, last.range.len());
298 } else {
299 self.regions.push(AmbiguousRegion {
300 range: self.offset..(self.offset + 1),
301 base,
302 })
303 }
304
305 debug_assert_ne!(base.0, 0);
307 if base.0 == 0xf {
308 (self.rng.next_u32() & 0x3) as u8
309 } else {
310 let mut b = base.0;
311 let mut select = self.rng.next_u32() % base.0.count_ones();
312 while select > 0 {
313 b &= (1 << b.trailing_zeros()) - 1;
314 select -= 1;
315 }
316 b.trailing_zeros() as u8
317 }
318 }
319
320 fn use_new_format(&self) -> bool {
321 self.offset > OLD_MAX_OFFSET || self.longest_region > OLD_MAX_REGION_LEN
322 }
323
324 fn into_new_format(self) -> Vec<u8> {
325 let mut encoded =
326 Vec::with_capacity((self.regions.len() * 2 + 1) * std::mem::size_of::<u32>());
327 let hdr = ((self.regions.len() as u32) * 2) | (1u32 << 31);
328 encoded.extend_from_slice(&hdr.to_be_bytes());
329 for mut region in self.regions {
330 while region.range.len() > NEW_MAX_REGION_LEN {
331 let subr = AmbiguousRegion {
332 range: region.range.start..(region.range.start + NEW_MAX_REGION_LEN),
333 base: region.base,
334 };
335 encoded.extend_from_slice(&subr.to_new_format().to_be_bytes());
336 region.range.start += NEW_MAX_REGION_LEN;
337 }
338 encoded.extend_from_slice(®ion.to_new_format().to_be_bytes())
339 }
340 encoded
341 }
342
343 fn into_old_format(self) -> Vec<u8> {
344 let mut encoded = Vec::with_capacity((self.regions.len() + 1) * std::mem::size_of::<u32>());
345 encoded.extend_from_slice(&(self.regions.len() as u32).to_be_bytes());
346 for region in self.regions {
347 encoded.extend_from_slice(®ion.to_old_format().to_be_bytes())
348 }
349 encoded
350 }
351}
352
353impl From<AmbiguityEncoder> for Vec<u8> {
354 fn from(value: AmbiguityEncoder) -> Vec<u8> {
355 if value.regions.is_empty() {
356 vec![]
357 } else if value.use_new_format() {
358 value.into_new_format()
359 } else {
360 value.into_old_format()
361 }
362 }
363}
364
365struct AmbiguityIterator<'a> {
366 amb: &'a [u8],
367 new_format: bool,
368}
369
370impl<'a> AmbiguityIterator<'a> {
371 fn new(mut amb: &'a [u8]) -> Self {
372 amb = &amb[..(amb.len() & !0x3)];
374 let mut it = Self {
375 amb,
376 new_format: false,
377 };
378 let new_format = it.read_u32().filter(|h| h & (1 << 31) > 0).is_some();
379 it.new_format = new_format;
380 it
381 }
382
383 #[inline]
384 fn read_u32(&mut self) -> Option<u32> {
385 if !self.amb.is_empty() {
386 let (u32_bytes, rest) = self.amb.split_at(std::mem::size_of::<u32>());
387 self.amb = rest;
388 Some(u32::from_be_bytes(u32_bytes.try_into().unwrap()))
389 } else {
390 None
391 }
392 }
393}
394
395impl<'a> Iterator for AmbiguityIterator<'a> {
396 type Item = AmbiguousRegion;
397
398 #[inline]
399 fn next(&mut self) -> Option<Self::Item> {
400 let w0 = self.read_u32()?;
401 if self.new_format {
402 let w1 = self.read_u32()?;
403 Some(AmbiguousRegion::from_new_format(w0, w1))
404 } else {
405 Some(AmbiguousRegion::from_old_format(w0))
406 }
407 }
408}
409
410pub struct NucleotideSequence {
424 seq: Vec<u8>,
425 amb: Vec<u8>,
426}
427
428impl NucleotideSequence {
429 pub fn new(seq: Vec<u8>, amb: Vec<u8>) -> NucleotideSequence {
431 Self { seq, amb }
432 }
433
434 pub fn len(&self) -> usize {
436 if let Some(b) = self.seq.last() {
437 (self.seq.len() - 1) * 4 + (*b as usize & 0x3)
438 } else {
439 0
440 }
441 }
442
443 pub fn is_empty(&self) -> bool {
444 self.len() == 0
445 }
446
447 pub fn iter(&self) -> SequenceIter<'_, '_> {
449 SequenceIter::new(&self.seq, &self.amb)
450 }
451
452 pub fn sequence_bytes(&self) -> &[u8] {
454 &self.seq
455 }
456
457 pub fn ambiguity_bytes(&self) -> &[u8] {
459 &self.amb
460 }
461}
462
463const NCBI2NA_SHIFT: [u8; 4] = [6, 4, 2, 0];
464
465impl FromIterator<Ncbi8naBase> for NucleotideSequence {
466 fn from_iter<T>(into_iter: T) -> Self
467 where
468 T: IntoIterator<Item = Ncbi8naBase>,
469 {
470 let iter = into_iter.into_iter();
471 let (lower, upper) = iter.size_hint();
472 let hint = upper.unwrap_or(lower);
473 let mut seq = Vec::with_capacity((hint / 4) + 1);
474 let mut amb = AmbiguityEncoder::new(hint);
475 let (mut buf, mut nbuf) = (0u8, 0usize);
476 for base in iter {
477 let base2 = amb.add(base);
478 buf |= base2.0 << NCBI2NA_SHIFT[nbuf & 0x3];
479 nbuf += 1;
480 if nbuf == 4 {
481 seq.push(buf);
482 buf = 0;
483 nbuf = 0;
484 }
485 }
486 buf |= (nbuf & 0x3) as u8;
487 seq.push(buf);
488 NucleotideSequence {
489 seq,
490 amb: amb.into(),
491 }
492 }
493}
494
495impl std::str::FromStr for NucleotideSequence {
496 type Err = NucleotideConversionError;
497
498 fn from_str(s: &str) -> Result<Self, Self::Err> {
499 s.as_bytes()
500 .iter()
501 .map(|b| Ncbi8naBase::try_from(*b))
502 .collect::<Result<_, _>>()
503 }
504}
505
506struct CompressedSequenceIter<'a> {
507 seq: &'a [u8],
508 len: usize,
509 i: usize,
510}
511
512impl<'a> CompressedSequenceIter<'a> {
513 fn new(seq: &'a [u8]) -> Self {
514 let rem = seq.last().copied().unwrap_or(0) as usize & 0x3;
515 let len = (seq.len() - 1) * 4 + rem;
516 Self { seq, len, i: 0 }
517 }
518}
519
520impl<'a> Iterator for CompressedSequenceIter<'a> {
521 type Item = (usize, Ncbi2naBase);
522
523 #[inline]
524 fn next(&mut self) -> Option<Self::Item> {
525 if self.i < self.len {
526 let i = self.i;
527 let raw_base = (self.seq[i / 4] >> NCBI2NA_SHIFT[i & 3]) & 3;
528 self.i += 1;
529 Some((i, Ncbi2naBase(raw_base)))
530 } else {
531 None
532 }
533 }
534
535 #[inline]
536 fn nth(&mut self, n: usize) -> Option<Self::Item> {
537 self.i += n;
538 self.next()
539 }
540
541 #[inline]
542 fn size_hint(&self) -> (usize, Option<usize>) {
543 (self.len, Some(self.len))
544 }
545}
546
547impl<'a> ExactSizeIterator for CompressedSequenceIter<'a> {}
548
549pub struct SequenceIter<'a, 'b> {
550 seq_it: CompressedSequenceIter<'a>,
551 amb_it: AmbiguityIterator<'b>,
552 amb_region: AmbiguousRegion,
553}
554
555impl<'a, 'b> SequenceIter<'a, 'b> {
556 fn new(seq: &'a [u8], amb: &'b [u8]) -> Self {
557 let seq_it = CompressedSequenceIter::new(seq);
558 let mut amb_it = AmbiguityIterator::new(amb);
559 let amb_region = amb_it
560 .next()
561 .unwrap_or_else(|| AmbiguousRegion::end_region(seq_it.len()));
562 Self {
563 seq_it,
564 amb_it,
565 amb_region,
566 }
567 }
568
569 fn next_region(&mut self) -> AmbiguousRegion {
570 self.amb_it
571 .next()
572 .unwrap_or_else(|| AmbiguousRegion::end_region(self.seq_it.len()))
573 }
574}
575
576impl<'a, 'b> Iterator for SequenceIter<'a, 'b> {
577 type Item = Ncbi8naBase;
578
579 #[inline]
580 fn next(&mut self) -> Option<Self::Item> {
581 #[allow(clippy::iter_nth_zero)]
582 self.nth(0)
583 }
584
585 #[inline]
586 fn nth(&mut self, n: usize) -> Option<Self::Item> {
587 let (i, base2) = self.seq_it.nth(n)?;
588 while i >= self.amb_region.range.end {
589 self.amb_region = self.next_region();
590 }
591 if i < self.amb_region.range.start {
592 Some(base2.into())
593 } else {
594 Some(self.amb_region.base)
595 }
596 }
597
598 #[inline]
599 fn size_hint(&self) -> (usize, Option<usize>) {
600 self.seq_it.size_hint()
601 }
602}
603
604impl<'a, 'b> ExactSizeIterator for SequenceIter<'a, 'b> {}
605
606impl std::fmt::Display for NucleotideSequence {
608 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
609 write!(f, "{}", self.iter().map(char::from).collect::<String>())
610 }
611}
612
613#[cfg(test)]
614mod test {
615 use super::{Ncbi2naBase, Ncbi8naBase, NucleotideConversionError, NucleotideSequence};
616
617 #[test]
618 fn convert_iupacna_ncbi8na() {
619 assert_eq!(u8::from(Ncbi8naBase::try_from(b'A').unwrap()), b'A');
620 assert_eq!(u8::from(Ncbi8naBase::try_from(b'A').unwrap()), b'A');
621 assert_eq!(u8::from(Ncbi8naBase::try_from(b'G').unwrap()), b'G');
622 assert_eq!(u8::from(Ncbi8naBase::try_from(b'N').unwrap()), b'N');
623 assert_eq!(
624 Ncbi8naBase::try_from(b'E'),
625 Err(NucleotideConversionError::InvalidBase(b'E'))
626 );
627 }
628
629 #[test]
630 fn convert_iupacna_ncbi2na() {
631 assert_eq!(
632 u8::from(Ncbi2naBase::try_from(Ncbi8naBase::try_from(b'A').unwrap()).unwrap()),
633 b'A'
634 );
635 assert_eq!(
636 u8::from(Ncbi2naBase::try_from(Ncbi8naBase::try_from(b'C').unwrap()).unwrap()),
637 b'C'
638 );
639 assert_eq!(
640 u8::from(Ncbi2naBase::try_from(Ncbi8naBase::try_from(b'G').unwrap()).unwrap()),
641 b'G'
642 );
643 assert_eq!(
644 u8::from(Ncbi2naBase::try_from(Ncbi8naBase::try_from(b'T').unwrap()).unwrap()),
645 b'T'
646 );
647 assert_eq!(
648 Ncbi2naBase::try_from(Ncbi8naBase::try_from(b'D').unwrap()),
649 Err(NucleotideConversionError::AmbiguousBase)
650 );
651 }
652
653 fn test_sequence_conversion(
654 iupac_seq: &str,
655 expected_seq_bytes: usize,
656 expected_amb_words: usize,
657 ) {
658 let seq = iupac_seq
659 .as_bytes()
660 .into_iter()
661 .map(|b| Ncbi8naBase::try_from(*b))
662 .collect::<Result<NucleotideSequence, _>>()
663 .unwrap();
664 assert_eq!(seq.len(), iupac_seq.len());
665 assert_eq!(seq.sequence_bytes().len(), expected_seq_bytes);
666 assert_eq!(
667 seq.ambiguity_bytes().len() / std::mem::size_of::<u32>(),
668 expected_amb_words
669 );
670 assert_eq!(seq.to_string(), iupac_seq);
671 }
672
673 #[test]
674 fn simple_sequence() {
675 test_sequence_conversion("ACGTACGT", 3, 0);
676 }
677
678 #[test]
679 fn odd_base_sequence() {
680 test_sequence_conversion("ACGTA", 2, 0);
681 }
682
683 #[test]
684 fn completely_ambiguous_old_format() {
685 test_sequence_conversion("NNNNNNNN", 3, 2);
686 }
687
688 #[test]
689 fn completely_ambiguous_new_format() {
690 test_sequence_conversion("NNNNNNNNNNNNNNNN", 5, 3);
691 }
692
693 #[test]
694 fn some_ambiguous() {
695 test_sequence_conversion("AAGNCAGTNNGGCNTA", 5, 4);
696 }
697
698 #[test]
699 fn mixed_ambiguous() {
700 test_sequence_conversion("AAGYCAGTNMGGCNTA", 5, 5);
701 }
702}