blastdb_sequence_util/
nucleotide.rs

1use core::ops::Range;
2use std::convert::{From, TryFrom};
3use std::iter::{FromIterator, IntoIterator};
4
5use rand::rngs::SmallRng;
6use rand::{RngCore, SeedableRng};
7
8/// Base conversion errors for nucleotide sequence conversions.
9#[derive(Copy, Clone, Debug, PartialEq, Eq)]
10pub enum NucleotideConversionError {
11    /// Input is an invalid base. The attached u8 is the IUPACNA coding of the base.
12    InvalidBase(u8),
13    /// Ambiguous base that cannot be represented in the target encoding.
14    AmbiguousBase,
15}
16
17impl std::fmt::Display for NucleotideConversionError {
18    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
19        write!(f, "{:?}", self)
20    }
21}
22
23impl std::error::Error for NucleotideConversionError {}
24
25const NCBI8NA_TO_IUPACNA: [u8; 16] = [
26    u8::MAX,
27    b'A',
28    b'C',
29    b'M',
30    b'G',
31    b'R',
32    b'S',
33    b'V',
34    b'T',
35    b'W',
36    b'Y',
37    b'H',
38    b'K',
39    b'D',
40    b'B',
41    b'N',
42];
43
44const fn generate_iupacna_to_ncbi8na() -> [u8; 256] {
45    let mut table = [u8::MAX; 256];
46    let mut i = 1;
47    while i < NCBI8NA_TO_IUPACNA.len() {
48        table[NCBI8NA_TO_IUPACNA[i] as usize] = i as u8;
49        table[NCBI8NA_TO_IUPACNA[i].to_ascii_lowercase() as usize] = i as u8;
50        i += 1;
51    }
52    table
53}
54const IUPACNA_TO_NCBI8NA: [u8; 256] = generate_iupacna_to_ncbi8na();
55
56const fn generate_ncbi8na_to_ncbi2na() -> [u8; 16] {
57    let mut table = [u8::MAX; 16];
58    let mut i = 0usize;
59    while i < 4 {
60        table[1 << i] = i as u8;
61        i += 1;
62    }
63    table
64}
65/// Table for resolve ncbi8na->ncbi2na mapping. Ambiguous mappings return u8::MAX, unambiguous
66/// mappings may return any other value. This is only intended to be used with Ncbi8naBase values
67/// which guarantees that the values would be within table bounds.
68const NCBI8NA_TO_NCBI2NA: [u8; 16] = generate_ncbi8na_to_ncbi2na();
69
70/// Single byte base representation where each of 4 bits represents an unambiguous base.
71///
72/// `count_zeros()` of each unambiguous base (ACGT) is 1; ambiguous bases are represented by setting
73/// bits corresponding to each of the unambiguous bases that it may be (N = 0xf). Note that in
74/// this representation 0 is invalid.
75#[repr(transparent)]
76#[derive(PartialEq, Eq, Copy, Clone, Debug)]
77pub struct Ncbi8naBase(u8);
78
79impl std::fmt::Display for Ncbi8naBase {
80    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
81        write!(f, "{}", self.0)
82    }
83}
84
85/// Convert IUPACNA bases to Ncbi8na format.
86impl TryFrom<u8> for Ncbi8naBase {
87    type Error = NucleotideConversionError;
88
89    #[inline]
90    fn try_from(value: u8) -> Result<Ncbi8naBase, Self::Error> {
91        let b = IUPACNA_TO_NCBI8NA[value as usize];
92        if b != u8::MAX {
93            Ok(Ncbi8naBase(b))
94        } else {
95            Err(NucleotideConversionError::InvalidBase(value))
96        }
97    }
98}
99
100/// Convert Ncbi8na bases to IUPACNA format.
101impl From<Ncbi8naBase> for u8 {
102    #[inline]
103    fn from(value: Ncbi8naBase) -> Self {
104        NCBI8NA_TO_IUPACNA[value.0 as usize]
105    }
106}
107
108impl From<Ncbi8naBase> for char {
109    #[inline]
110    fn from(value: Ncbi8naBase) -> Self {
111        char::from(u8::from(value))
112    }
113}
114
115/// Single byte base representation that only consumes 2 bits per base.
116///
117/// This may only represent unambiguous bases (A, C, G, T/U).
118#[repr(transparent)]
119#[derive(PartialEq, Eq, Copy, Clone, Debug)]
120pub struct Ncbi2naBase(u8);
121
122impl TryFrom<Ncbi8naBase> for Ncbi2naBase {
123    type Error = NucleotideConversionError;
124
125    #[inline]
126    fn try_from(value: Ncbi8naBase) -> Result<Self, Self::Error> {
127        let b2 = NCBI8NA_TO_NCBI2NA[value.0 as usize];
128        if b2 != u8::MAX {
129            Ok(Ncbi2naBase(b2))
130        } else {
131            Err(NucleotideConversionError::AmbiguousBase)
132        }
133    }
134}
135
136impl From<Ncbi2naBase> for Ncbi8naBase {
137    #[inline]
138    fn from(value: Ncbi2naBase) -> Self {
139        Ncbi8naBase(1 << value.0)
140    }
141}
142
143impl From<Ncbi2naBase> for u8 {
144    #[inline]
145    fn from(value: Ncbi2naBase) -> Self {
146        u8::from(Ncbi8naBase::from(value))
147    }
148}
149
150impl From<Ncbi2naBase> for char {
151    #[inline]
152    fn from(value: Ncbi2naBase) -> Self {
153        char::from(Ncbi8naBase::from(value))
154    }
155}
156
157/// Single byte base representation that packs the unambiguous bases in values 0-3.
158///
159/// For unambiguous bases this is the same as `Ncbi2naBase`.
160#[repr(transparent)]
161#[derive(PartialEq, Eq, Copy, Clone, Debug)]
162pub struct Blast8naBase(u8);
163
164const NCBI8NA_TO_BLAST8NA: [u8; 16] = [
165    15, // invalid
166    0,  // A
167    1,  // C
168    6,  // M
169    2,  // G
170    4,  // R
171    9,  // S
172    13, // V
173    3,  // T
174    8,  // W
175    5,  // Y
176    12, // H
177    7,  // K
178    11, // D
179    10, // B
180    14, // N
181];
182const fn generate_blastna8_to_ncbi8na() -> [u8; 16] {
183    let mut table = [0u8; 16];
184    let mut i = 0usize;
185    while i < table.len() {
186        table[NCBI8NA_TO_BLAST8NA[i] as usize] = i as u8;
187        i += 1;
188    }
189    table
190}
191const BLASTNA8_TO_NCBI8NA: [u8; 16] = generate_blastna8_to_ncbi8na();
192
193impl From<Ncbi8naBase> for Blast8naBase {
194    fn from(value: Ncbi8naBase) -> Self {
195        Blast8naBase(NCBI8NA_TO_BLAST8NA[value.0 as usize])
196    }
197}
198
199impl From<Blast8naBase> for Ncbi8naBase {
200    fn from(value: Blast8naBase) -> Self {
201        Ncbi8naBase(BLASTNA8_TO_NCBI8NA[value.0 as usize])
202    }
203}
204
205const OLD_MAX_REGION_LEN: usize = 0xf;
206const OLD_MAX_OFFSET: usize = 0xffffff;
207const NEW_MAX_REGION_LEN: usize = 0xfff;
208
209/// An ambiguous nucleotide region.
210#[derive(Clone, Debug)]
211struct AmbiguousRegion {
212    range: Range<usize>,
213    base: Ncbi8naBase,
214}
215
216impl AmbiguousRegion {
217    fn to_old_format(&self) -> u32 {
218        debug_assert!(self.range.start <= OLD_MAX_OFFSET);
219        debug_assert!(self.range.len() <= OLD_MAX_REGION_LEN);
220        self.range.start as u32 | ((self.range.len() as u32) << 24) | ((self.base.0 as u32) << 28)
221    }
222
223    fn from_old_format(w: u32) -> Self {
224        let start = w as usize & OLD_MAX_OFFSET;
225        let end = start + ((w as usize >> 24) & OLD_MAX_REGION_LEN);
226        AmbiguousRegion {
227            range: start..end,
228            base: Ncbi8naBase((w >> 28) as u8),
229        }
230    }
231
232    fn to_new_format(&self) -> u64 {
233        debug_assert!(self.range.start <= u32::MAX as usize);
234        debug_assert!(self.range.len() <= NEW_MAX_REGION_LEN);
235        self.range.start as u64 | ((self.range.len() as u64) << 48) | ((self.base.0 as u64) << 60)
236    }
237
238    fn from_new_format(w0: u32, w1: u32) -> Self {
239        let start = w1 as usize;
240        let end = start + ((w0 as usize >> 16) & NEW_MAX_REGION_LEN);
241        AmbiguousRegion {
242            range: start..end,
243            base: Ncbi8naBase((w0 >> 28) as u8),
244        }
245    }
246
247    fn end_region(len: usize) -> Self {
248        AmbiguousRegion {
249            range: len..len,
250            base: Ncbi8naBase(0xf),
251        }
252    }
253}
254
255/// Encodes ambiguous regions of a nucleotide base sequence.
256///
257/// Bases are accepted in `Ncbi8naBase` format and each call to add returns an `Ncbi2naBase` that
258/// can be encoded in a compressed sequence. If the input base is unambiguous this returns the
259/// expected `Ncbi2naBase`; if the input base is ambiguous a random value will be chosen from the
260/// possible bases.
261///
262/// When all bases have been added to the encoder use Vec<u8>::from(AmbiguityEncoder) to
263/// obtain the encoded sequence data.
264struct AmbiguityEncoder {
265    regions: Vec<AmbiguousRegion>,
266    offset: usize,
267    longest_region: usize,
268    rng: SmallRng,
269}
270
271impl AmbiguityEncoder {
272    pub fn new(size_hint: usize) -> Self {
273        AmbiguityEncoder {
274            regions: vec![],
275            offset: 0,
276            longest_region: 0,
277            rng: SmallRng::seed_from_u64(size_hint as u64),
278        }
279    }
280
281    pub fn add(&mut self, base: Ncbi8naBase) -> Ncbi2naBase {
282        let mut base2 = NCBI8NA_TO_NCBI2NA[base.0 as usize];
283        if base2 == u8::MAX {
284            base2 = self.add_ambiguous(base);
285        }
286        self.offset += 1;
287        Ncbi2naBase(base2)
288    }
289
290    fn add_ambiguous(&mut self, base: Ncbi8naBase) -> u8 {
291        if let Some(last) = self
292            .regions
293            .last_mut()
294            .filter(|last| last.range.end == self.offset && last.base == base)
295        {
296            last.range.end += 1;
297            self.longest_region = std::cmp::max(self.longest_region, last.range.len());
298        } else {
299            self.regions.push(AmbiguousRegion {
300                range: self.offset..(self.offset + 1),
301                base,
302            })
303        }
304
305        // If the base value is 0 then this isn't a valid Ncbi8naBase value.
306        debug_assert_ne!(base.0, 0);
307        if base.0 == 0xf {
308            (self.rng.next_u32() & 0x3) as u8
309        } else {
310            let mut b = base.0;
311            let mut select = self.rng.next_u32() % base.0.count_ones();
312            while select > 0 {
313                b &= (1 << b.trailing_zeros()) - 1;
314                select -= 1;
315            }
316            b.trailing_zeros() as u8
317        }
318    }
319
320    fn use_new_format(&self) -> bool {
321        self.offset > OLD_MAX_OFFSET || self.longest_region > OLD_MAX_REGION_LEN
322    }
323
324    fn into_new_format(self) -> Vec<u8> {
325        let mut encoded =
326            Vec::with_capacity((self.regions.len() * 2 + 1) * std::mem::size_of::<u32>());
327        let hdr = ((self.regions.len() as u32) * 2) | (1u32 << 31);
328        encoded.extend_from_slice(&hdr.to_be_bytes());
329        for mut region in self.regions {
330            while region.range.len() > NEW_MAX_REGION_LEN {
331                let subr = AmbiguousRegion {
332                    range: region.range.start..(region.range.start + NEW_MAX_REGION_LEN),
333                    base: region.base,
334                };
335                encoded.extend_from_slice(&subr.to_new_format().to_be_bytes());
336                region.range.start += NEW_MAX_REGION_LEN;
337            }
338            encoded.extend_from_slice(&region.to_new_format().to_be_bytes())
339        }
340        encoded
341    }
342
343    fn into_old_format(self) -> Vec<u8> {
344        let mut encoded = Vec::with_capacity((self.regions.len() + 1) * std::mem::size_of::<u32>());
345        encoded.extend_from_slice(&(self.regions.len() as u32).to_be_bytes());
346        for region in self.regions {
347            encoded.extend_from_slice(&region.to_old_format().to_be_bytes())
348        }
349        encoded
350    }
351}
352
353impl From<AmbiguityEncoder> for Vec<u8> {
354    fn from(value: AmbiguityEncoder) -> Vec<u8> {
355        if value.regions.is_empty() {
356            vec![]
357        } else if value.use_new_format() {
358            value.into_new_format()
359        } else {
360            value.into_old_format()
361        }
362    }
363}
364
365struct AmbiguityIterator<'a> {
366    amb: &'a [u8],
367    new_format: bool,
368}
369
370impl<'a> AmbiguityIterator<'a> {
371    fn new(mut amb: &'a [u8]) -> Self {
372        // round down len of amb to the nearest value divisible by size_of::<u32>()
373        amb = &amb[..(amb.len() & !0x3)];
374        let mut it = Self {
375            amb,
376            new_format: false,
377        };
378        let new_format = it.read_u32().filter(|h| h & (1 << 31) > 0).is_some();
379        it.new_format = new_format;
380        it
381    }
382
383    #[inline]
384    fn read_u32(&mut self) -> Option<u32> {
385        if !self.amb.is_empty() {
386            let (u32_bytes, rest) = self.amb.split_at(std::mem::size_of::<u32>());
387            self.amb = rest;
388            Some(u32::from_be_bytes(u32_bytes.try_into().unwrap()))
389        } else {
390            None
391        }
392    }
393}
394
395impl<'a> Iterator for AmbiguityIterator<'a> {
396    type Item = AmbiguousRegion;
397
398    #[inline]
399    fn next(&mut self) -> Option<Self::Item> {
400        let w0 = self.read_u32()?;
401        if self.new_format {
402            let w1 = self.read_u32()?;
403            Some(AmbiguousRegion::from_new_format(w0, w1))
404        } else {
405            Some(AmbiguousRegion::from_old_format(w0))
406        }
407    }
408}
409
410/// BLAST database representation of a nucleotide sequence.
411///
412/// This representation is divided into two different parts:
413/// * Sequence encoding packed down to 2 bits per base. Ambiguous bases are resolved at random.
414/// * Description of the ranges containing ambiguous bases.
415///
416/// This can easily converted to and from a FASTA string:
417/// ```
418/// use blastdb_sequence_util::{Ncbi8naBase, NucleotideSequence};
419/// let fasta = "acgtACGT";
420/// let seq: NucleotideSequence = fasta.as_bytes().iter().map(|b| Ncbi8naBase::try_from(*b)).collect::<Result<_, _>>().unwrap();
421/// let normalized_fasta: String = seq.iter().map(|b| char::from(b)).collect();  // ACGTACGT
422/// ```
423pub struct NucleotideSequence {
424    seq: Vec<u8>,
425    amb: Vec<u8>,
426}
427
428impl NucleotideSequence {
429    /// Create a `NucleotideSequence` from sequence and ambiguity streams.
430    pub fn new(seq: Vec<u8>, amb: Vec<u8>) -> NucleotideSequence {
431        Self { seq, amb }
432    }
433
434    /// Return the number of bases in the sequence.
435    pub fn len(&self) -> usize {
436        if let Some(b) = self.seq.last() {
437            (self.seq.len() - 1) * 4 + (*b as usize & 0x3)
438        } else {
439            0
440        }
441    }
442
443    pub fn is_empty(&self) -> bool {
444        self.len() == 0
445    }
446
447    /// Iterate over Ncbi8na representation of sequence data.
448    pub fn iter(&self) -> SequenceIter<'_, '_> {
449        SequenceIter::new(&self.seq, &self.amb)
450    }
451
452    /// Return the raw sequence bytes for storage.
453    pub fn sequence_bytes(&self) -> &[u8] {
454        &self.seq
455    }
456
457    /// Return the raw ambiguity bytes for storage.
458    pub fn ambiguity_bytes(&self) -> &[u8] {
459        &self.amb
460    }
461}
462
463const NCBI2NA_SHIFT: [u8; 4] = [6, 4, 2, 0];
464
465impl FromIterator<Ncbi8naBase> for NucleotideSequence {
466    fn from_iter<T>(into_iter: T) -> Self
467    where
468        T: IntoIterator<Item = Ncbi8naBase>,
469    {
470        let iter = into_iter.into_iter();
471        let (lower, upper) = iter.size_hint();
472        let hint = upper.unwrap_or(lower);
473        let mut seq = Vec::with_capacity((hint / 4) + 1);
474        let mut amb = AmbiguityEncoder::new(hint);
475        let (mut buf, mut nbuf) = (0u8, 0usize);
476        for base in iter {
477            let base2 = amb.add(base);
478            buf |= base2.0 << NCBI2NA_SHIFT[nbuf & 0x3];
479            nbuf += 1;
480            if nbuf == 4 {
481                seq.push(buf);
482                buf = 0;
483                nbuf = 0;
484            }
485        }
486        buf |= (nbuf & 0x3) as u8;
487        seq.push(buf);
488        NucleotideSequence {
489            seq,
490            amb: amb.into(),
491        }
492    }
493}
494
495impl std::str::FromStr for NucleotideSequence {
496    type Err = NucleotideConversionError;
497
498    fn from_str(s: &str) -> Result<Self, Self::Err> {
499        s.as_bytes()
500            .iter()
501            .map(|b| Ncbi8naBase::try_from(*b))
502            .collect::<Result<_, _>>()
503    }
504}
505
506struct CompressedSequenceIter<'a> {
507    seq: &'a [u8],
508    len: usize,
509    i: usize,
510}
511
512impl<'a> CompressedSequenceIter<'a> {
513    fn new(seq: &'a [u8]) -> Self {
514        let rem = seq.last().copied().unwrap_or(0) as usize & 0x3;
515        let len = (seq.len() - 1) * 4 + rem;
516        Self { seq, len, i: 0 }
517    }
518}
519
520impl<'a> Iterator for CompressedSequenceIter<'a> {
521    type Item = (usize, Ncbi2naBase);
522
523    #[inline]
524    fn next(&mut self) -> Option<Self::Item> {
525        if self.i < self.len {
526            let i = self.i;
527            let raw_base = (self.seq[i / 4] >> NCBI2NA_SHIFT[i & 3]) & 3;
528            self.i += 1;
529            Some((i, Ncbi2naBase(raw_base)))
530        } else {
531            None
532        }
533    }
534
535    #[inline]
536    fn nth(&mut self, n: usize) -> Option<Self::Item> {
537        self.i += n;
538        self.next()
539    }
540
541    #[inline]
542    fn size_hint(&self) -> (usize, Option<usize>) {
543        (self.len, Some(self.len))
544    }
545}
546
547impl<'a> ExactSizeIterator for CompressedSequenceIter<'a> {}
548
549pub struct SequenceIter<'a, 'b> {
550    seq_it: CompressedSequenceIter<'a>,
551    amb_it: AmbiguityIterator<'b>,
552    amb_region: AmbiguousRegion,
553}
554
555impl<'a, 'b> SequenceIter<'a, 'b> {
556    fn new(seq: &'a [u8], amb: &'b [u8]) -> Self {
557        let seq_it = CompressedSequenceIter::new(seq);
558        let mut amb_it = AmbiguityIterator::new(amb);
559        let amb_region = amb_it
560            .next()
561            .unwrap_or_else(|| AmbiguousRegion::end_region(seq_it.len()));
562        Self {
563            seq_it,
564            amb_it,
565            amb_region,
566        }
567    }
568
569    fn next_region(&mut self) -> AmbiguousRegion {
570        self.amb_it
571            .next()
572            .unwrap_or_else(|| AmbiguousRegion::end_region(self.seq_it.len()))
573    }
574}
575
576impl<'a, 'b> Iterator for SequenceIter<'a, 'b> {
577    type Item = Ncbi8naBase;
578
579    #[inline]
580    fn next(&mut self) -> Option<Self::Item> {
581        #[allow(clippy::iter_nth_zero)]
582        self.nth(0)
583    }
584
585    #[inline]
586    fn nth(&mut self, n: usize) -> Option<Self::Item> {
587        let (i, base2) = self.seq_it.nth(n)?;
588        while i >= self.amb_region.range.end {
589            self.amb_region = self.next_region();
590        }
591        if i < self.amb_region.range.start {
592            Some(base2.into())
593        } else {
594            Some(self.amb_region.base)
595        }
596    }
597
598    #[inline]
599    fn size_hint(&self) -> (usize, Option<usize>) {
600        self.seq_it.size_hint()
601    }
602}
603
604impl<'a, 'b> ExactSizeIterator for SequenceIter<'a, 'b> {}
605
606/// Render the sequence as an IUPAC nucleic acide sequence.
607impl std::fmt::Display for NucleotideSequence {
608    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
609        write!(f, "{}", self.iter().map(char::from).collect::<String>())
610    }
611}
612
613#[cfg(test)]
614mod test {
615    use super::{Ncbi2naBase, Ncbi8naBase, NucleotideConversionError, NucleotideSequence};
616
617    #[test]
618    fn convert_iupacna_ncbi8na() {
619        assert_eq!(u8::from(Ncbi8naBase::try_from(b'A').unwrap()), b'A');
620        assert_eq!(u8::from(Ncbi8naBase::try_from(b'A').unwrap()), b'A');
621        assert_eq!(u8::from(Ncbi8naBase::try_from(b'G').unwrap()), b'G');
622        assert_eq!(u8::from(Ncbi8naBase::try_from(b'N').unwrap()), b'N');
623        assert_eq!(
624            Ncbi8naBase::try_from(b'E'),
625            Err(NucleotideConversionError::InvalidBase(b'E'))
626        );
627    }
628
629    #[test]
630    fn convert_iupacna_ncbi2na() {
631        assert_eq!(
632            u8::from(Ncbi2naBase::try_from(Ncbi8naBase::try_from(b'A').unwrap()).unwrap()),
633            b'A'
634        );
635        assert_eq!(
636            u8::from(Ncbi2naBase::try_from(Ncbi8naBase::try_from(b'C').unwrap()).unwrap()),
637            b'C'
638        );
639        assert_eq!(
640            u8::from(Ncbi2naBase::try_from(Ncbi8naBase::try_from(b'G').unwrap()).unwrap()),
641            b'G'
642        );
643        assert_eq!(
644            u8::from(Ncbi2naBase::try_from(Ncbi8naBase::try_from(b'T').unwrap()).unwrap()),
645            b'T'
646        );
647        assert_eq!(
648            Ncbi2naBase::try_from(Ncbi8naBase::try_from(b'D').unwrap()),
649            Err(NucleotideConversionError::AmbiguousBase)
650        );
651    }
652
653    fn test_sequence_conversion(
654        iupac_seq: &str,
655        expected_seq_bytes: usize,
656        expected_amb_words: usize,
657    ) {
658        let seq = iupac_seq
659            .as_bytes()
660            .into_iter()
661            .map(|b| Ncbi8naBase::try_from(*b))
662            .collect::<Result<NucleotideSequence, _>>()
663            .unwrap();
664        assert_eq!(seq.len(), iupac_seq.len());
665        assert_eq!(seq.sequence_bytes().len(), expected_seq_bytes);
666        assert_eq!(
667            seq.ambiguity_bytes().len() / std::mem::size_of::<u32>(),
668            expected_amb_words
669        );
670        assert_eq!(seq.to_string(), iupac_seq);
671    }
672
673    #[test]
674    fn simple_sequence() {
675        test_sequence_conversion("ACGTACGT", 3, 0);
676    }
677
678    #[test]
679    fn odd_base_sequence() {
680        test_sequence_conversion("ACGTA", 2, 0);
681    }
682
683    #[test]
684    fn completely_ambiguous_old_format() {
685        test_sequence_conversion("NNNNNNNN", 3, 2);
686    }
687
688    #[test]
689    fn completely_ambiguous_new_format() {
690        test_sequence_conversion("NNNNNNNNNNNNNNNN", 5, 3);
691    }
692
693    #[test]
694    fn some_ambiguous() {
695        test_sequence_conversion("AAGNCAGTNNGGCNTA", 5, 4);
696    }
697
698    #[test]
699    fn mixed_ambiguous() {
700        test_sequence_conversion("AAGYCAGTNMGGCNTA", 5, 5);
701    }
702}