Skip to main content

cyanea_seq/
restriction.rs

1//! Restriction enzyme recognition, cut-site finding, and in-silico digestion.
2//!
3//! Provides a database of common restriction enzymes and functions to
4//! find cut sites, digest sequences, and compute fragment sizes.
5
6/// A restriction enzyme with its recognition site and cut offsets.
7#[derive(Debug, Clone)]
8pub struct RestrictionEnzyme {
9    /// Enzyme name (e.g., "EcoRI").
10    pub name: String,
11    /// Recognition site in IUPAC DNA (uppercase).
12    pub recognition_site: Vec<u8>,
13    /// Cut offset on the forward (5'→3') strand, from the start of the recognition site.
14    pub cut_forward: isize,
15    /// Cut offset on the reverse (3'→5') strand, from the start of the recognition site.
16    pub cut_reverse: isize,
17    /// Whether the recognition site is palindromic.
18    pub is_palindromic: bool,
19}
20
21/// A located cut site on a sequence.
22#[derive(Debug, Clone)]
23pub struct CutSite {
24    /// Position in the original sequence where the forward strand is cut.
25    pub position: usize,
26    /// Name of the enzyme that cuts here.
27    pub enzyme: String,
28    /// Type of overhang produced.
29    pub overhang: Overhang,
30}
31
32/// Overhang type produced by a restriction enzyme cut.
33#[derive(Debug, Clone, PartialEq, Eq)]
34pub enum Overhang {
35    /// 5' overhang (sticky end).
36    FivePrime(Vec<u8>),
37    /// 3' overhang (sticky end).
38    ThreePrime(Vec<u8>),
39    /// Blunt end (no overhang).
40    Blunt,
41}
42
43/// A fragment produced by restriction digestion.
44#[derive(Debug, Clone)]
45pub struct Fragment {
46    /// Start position in the original sequence (inclusive).
47    pub start: usize,
48    /// End position in the original sequence (exclusive).
49    pub end: usize,
50    /// Fragment length.
51    pub length: usize,
52}
53
54/// Return a curated set of ~20 common restriction enzymes.
55pub fn common_enzymes() -> Vec<RestrictionEnzyme> {
56    vec![
57        enzyme("EcoRI", b"GAATTC", 1, 5, true),
58        enzyme("BamHI", b"GGATCC", 1, 5, true),
59        enzyme("HindIII", b"AAGCTT", 1, 5, true),
60        enzyme("NotI", b"GCGGCCGC", 2, 6, true),
61        enzyme("XhoI", b"CTCGAG", 1, 5, true),
62        enzyme("SalI", b"GTCGAC", 1, 5, true),
63        enzyme("BglII", b"AGATCT", 1, 5, true),
64        enzyme("NcoI", b"CCATGG", 1, 5, true),
65        enzyme("NdeI", b"CATATG", 2, 4, true),
66        enzyme("XbaI", b"TCTAGA", 1, 5, true),
67        enzyme("SpeI", b"ACTAGT", 1, 5, true),
68        enzyme("KpnI", b"GGTACC", 5, 1, true),
69        enzyme("SacI", b"GAGCTC", 5, 1, true),
70        enzyme("PstI", b"CTGCAG", 5, 1, true),
71        enzyme("SphI", b"GCATGC", 5, 1, true),
72        enzyme("ApaI", b"GGGCCC", 5, 1, true),
73        enzyme("EcoRV", b"GATATC", 3, 3, true),
74        enzyme("SmaI", b"CCCGGG", 3, 3, true),
75        enzyme("HpaI", b"GTTAAC", 3, 3, true),
76        enzyme("ScaI", b"AGTACT", 3, 3, true),
77    ]
78}
79
80fn enzyme(name: &str, site: &[u8], fwd: isize, rev: isize, pal: bool) -> RestrictionEnzyme {
81    RestrictionEnzyme {
82        name: name.to_string(),
83        recognition_site: site.to_vec(),
84        cut_forward: fwd,
85        cut_reverse: rev,
86        is_palindromic: pal,
87    }
88}
89
90/// Check if a single base matches an IUPAC degenerate code.
91fn iupac_matches(code: u8, base: u8) -> bool {
92    let base_upper = base.to_ascii_uppercase();
93    match code.to_ascii_uppercase() {
94        b'A' => base_upper == b'A',
95        b'C' => base_upper == b'C',
96        b'G' => base_upper == b'G',
97        b'T' => base_upper == b'T',
98        b'R' => matches!(base_upper, b'A' | b'G'),
99        b'Y' => matches!(base_upper, b'C' | b'T'),
100        b'M' => matches!(base_upper, b'A' | b'C'),
101        b'K' => matches!(base_upper, b'G' | b'T'),
102        b'S' => matches!(base_upper, b'G' | b'C'),
103        b'W' => matches!(base_upper, b'A' | b'T'),
104        b'H' => matches!(base_upper, b'A' | b'C' | b'T'),
105        b'B' => matches!(base_upper, b'C' | b'G' | b'T'),
106        b'V' => matches!(base_upper, b'A' | b'C' | b'G'),
107        b'D' => matches!(base_upper, b'A' | b'G' | b'T'),
108        b'N' => matches!(base_upper, b'A' | b'C' | b'G' | b'T'),
109        _ => false,
110    }
111}
112
113/// Find all cut sites for a single enzyme on the given sequence.
114///
115/// Scans the sequence for IUPAC-aware matches of the recognition site.
116pub fn find_cut_sites(seq: &[u8], enzyme: &RestrictionEnzyme) -> Vec<CutSite> {
117    let site_len = enzyme.recognition_site.len();
118    if seq.len() < site_len {
119        return Vec::new();
120    }
121
122    let mut sites = Vec::new();
123    for i in 0..=seq.len() - site_len {
124        let matches = enzyme
125            .recognition_site
126            .iter()
127            .zip(&seq[i..i + site_len])
128            .all(|(&code, &base)| iupac_matches(code, base));
129
130        if matches {
131            let cut_pos = i as isize + enzyme.cut_forward;
132            let cut_pos = cut_pos.max(0) as usize;
133
134            let overhang = if enzyme.cut_forward == enzyme.cut_reverse {
135                Overhang::Blunt
136            } else if enzyme.cut_forward < enzyme.cut_reverse {
137                // 5' overhang
138                let start = enzyme.cut_forward.max(0) as usize;
139                let end = (enzyme.cut_reverse as usize).min(site_len);
140                let oh = enzyme.recognition_site[start..end].to_vec();
141                Overhang::FivePrime(oh)
142            } else {
143                // 3' overhang
144                let start = enzyme.cut_reverse.max(0) as usize;
145                let end = (enzyme.cut_forward as usize).min(site_len);
146                let oh = enzyme.recognition_site[start..end].to_vec();
147                Overhang::ThreePrime(oh)
148            };
149
150            sites.push(CutSite {
151                position: cut_pos,
152                enzyme: enzyme.name.clone(),
153                overhang,
154            });
155        }
156    }
157    sites
158}
159
160/// Digest a sequence with one or more enzymes, returning the resulting fragments.
161///
162/// Fragments are returned sorted by position. The first fragment starts at
163/// position 0 and the last ends at the sequence length.
164pub fn digest(seq: &[u8], enzymes: &[&RestrictionEnzyme]) -> Vec<Fragment> {
165    let mut cut_positions: Vec<usize> = Vec::new();
166    for enz in enzymes {
167        for site in find_cut_sites(seq, enz) {
168            cut_positions.push(site.position);
169        }
170    }
171    cut_positions.sort_unstable();
172    cut_positions.dedup();
173
174    // Build fragments between consecutive cut positions.
175    let mut fragments = Vec::new();
176    let mut prev = 0usize;
177    for &pos in &cut_positions {
178        if pos > prev && pos <= seq.len() {
179            fragments.push(Fragment {
180                start: prev,
181                end: pos,
182                length: pos - prev,
183            });
184            prev = pos;
185        }
186    }
187    // Final fragment from last cut to end.
188    if prev < seq.len() {
189        fragments.push(Fragment {
190            start: prev,
191            end: seq.len(),
192            length: seq.len() - prev,
193        });
194    }
195    fragments
196}
197
198/// Compute fragment sizes from digestion with one or more enzymes.
199pub fn fragment_sizes(seq: &[u8], enzymes: &[&RestrictionEnzyme]) -> Vec<usize> {
200    digest(seq, enzymes)
201        .into_iter()
202        .map(|f| f.length)
203        .collect()
204}
205
206#[cfg(test)]
207mod tests {
208    use super::*;
209
210    #[test]
211    fn ecori_cuts_correctly() {
212        // EcoRI recognizes GAATTC, cuts between G and AATTC (offset 1).
213        let seq = b"AAAGAATTCAAA";
214        let ecori = &common_enzymes().into_iter().find(|e| e.name == "EcoRI").unwrap();
215        let sites = find_cut_sites(seq, ecori);
216        assert_eq!(sites.len(), 1);
217        assert_eq!(sites[0].position, 4); // 3 + 1
218        assert!(matches!(sites[0].overhang, Overhang::FivePrime(_)));
219    }
220
221    #[test]
222    fn blunt_end_enzyme() {
223        // EcoRV: GATATC, cuts at 3/3 → blunt.
224        let seq = b"AAAGATATCAAA";
225        let ecorv = &common_enzymes().into_iter().find(|e| e.name == "EcoRV").unwrap();
226        let sites = find_cut_sites(seq, ecorv);
227        assert_eq!(sites.len(), 1);
228        assert_eq!(sites[0].overhang, Overhang::Blunt);
229    }
230
231    #[test]
232    fn no_sites_single_fragment() {
233        let seq = b"AAAAAAAAAA";
234        let ecori = &common_enzymes().into_iter().find(|e| e.name == "EcoRI").unwrap();
235        let fragments = digest(seq, &[ecori]);
236        assert_eq!(fragments.len(), 1);
237        assert_eq!(fragments[0].length, 10);
238    }
239
240    #[test]
241    fn iupac_degenerate_match() {
242        // Create an enzyme with degenerate site: GAANTC (N matches any).
243        let enz = RestrictionEnzyme {
244            name: "TestEnz".into(),
245            recognition_site: b"GAANTC".to_vec(),
246            cut_forward: 1,
247            cut_reverse: 5,
248            is_palindromic: true,
249        };
250        // GAATTC should match (N=T), GAACTC should also match (N=C).
251        assert_eq!(find_cut_sites(b"GAATTC", &enz).len(), 1);
252        assert_eq!(find_cut_sites(b"GAACTC", &enz).len(), 1);
253        assert_eq!(find_cut_sites(b"GAAATC", &enz).len(), 1);
254        assert_eq!(find_cut_sites(b"GAAGTC", &enz).len(), 1);
255    }
256
257    #[test]
258    fn common_enzymes_nonempty() {
259        let enzymes = common_enzymes();
260        assert_eq!(enzymes.len(), 20);
261        for e in &enzymes {
262            assert!(!e.name.is_empty());
263            assert!(!e.recognition_site.is_empty());
264        }
265    }
266}