1#[derive(Debug, Clone)]
8pub struct RestrictionEnzyme {
9 pub name: String,
11 pub recognition_site: Vec<u8>,
13 pub cut_forward: isize,
15 pub cut_reverse: isize,
17 pub is_palindromic: bool,
19}
20
21#[derive(Debug, Clone)]
23pub struct CutSite {
24 pub position: usize,
26 pub enzyme: String,
28 pub overhang: Overhang,
30}
31
32#[derive(Debug, Clone, PartialEq, Eq)]
34pub enum Overhang {
35 FivePrime(Vec<u8>),
37 ThreePrime(Vec<u8>),
39 Blunt,
41}
42
43#[derive(Debug, Clone)]
45pub struct Fragment {
46 pub start: usize,
48 pub end: usize,
50 pub length: usize,
52}
53
54pub fn common_enzymes() -> Vec<RestrictionEnzyme> {
56 vec![
57 enzyme("EcoRI", b"GAATTC", 1, 5, true),
58 enzyme("BamHI", b"GGATCC", 1, 5, true),
59 enzyme("HindIII", b"AAGCTT", 1, 5, true),
60 enzyme("NotI", b"GCGGCCGC", 2, 6, true),
61 enzyme("XhoI", b"CTCGAG", 1, 5, true),
62 enzyme("SalI", b"GTCGAC", 1, 5, true),
63 enzyme("BglII", b"AGATCT", 1, 5, true),
64 enzyme("NcoI", b"CCATGG", 1, 5, true),
65 enzyme("NdeI", b"CATATG", 2, 4, true),
66 enzyme("XbaI", b"TCTAGA", 1, 5, true),
67 enzyme("SpeI", b"ACTAGT", 1, 5, true),
68 enzyme("KpnI", b"GGTACC", 5, 1, true),
69 enzyme("SacI", b"GAGCTC", 5, 1, true),
70 enzyme("PstI", b"CTGCAG", 5, 1, true),
71 enzyme("SphI", b"GCATGC", 5, 1, true),
72 enzyme("ApaI", b"GGGCCC", 5, 1, true),
73 enzyme("EcoRV", b"GATATC", 3, 3, true),
74 enzyme("SmaI", b"CCCGGG", 3, 3, true),
75 enzyme("HpaI", b"GTTAAC", 3, 3, true),
76 enzyme("ScaI", b"AGTACT", 3, 3, true),
77 ]
78}
79
80fn enzyme(name: &str, site: &[u8], fwd: isize, rev: isize, pal: bool) -> RestrictionEnzyme {
81 RestrictionEnzyme {
82 name: name.to_string(),
83 recognition_site: site.to_vec(),
84 cut_forward: fwd,
85 cut_reverse: rev,
86 is_palindromic: pal,
87 }
88}
89
90fn iupac_matches(code: u8, base: u8) -> bool {
92 let base_upper = base.to_ascii_uppercase();
93 match code.to_ascii_uppercase() {
94 b'A' => base_upper == b'A',
95 b'C' => base_upper == b'C',
96 b'G' => base_upper == b'G',
97 b'T' => base_upper == b'T',
98 b'R' => matches!(base_upper, b'A' | b'G'),
99 b'Y' => matches!(base_upper, b'C' | b'T'),
100 b'M' => matches!(base_upper, b'A' | b'C'),
101 b'K' => matches!(base_upper, b'G' | b'T'),
102 b'S' => matches!(base_upper, b'G' | b'C'),
103 b'W' => matches!(base_upper, b'A' | b'T'),
104 b'H' => matches!(base_upper, b'A' | b'C' | b'T'),
105 b'B' => matches!(base_upper, b'C' | b'G' | b'T'),
106 b'V' => matches!(base_upper, b'A' | b'C' | b'G'),
107 b'D' => matches!(base_upper, b'A' | b'G' | b'T'),
108 b'N' => matches!(base_upper, b'A' | b'C' | b'G' | b'T'),
109 _ => false,
110 }
111}
112
113pub fn find_cut_sites(seq: &[u8], enzyme: &RestrictionEnzyme) -> Vec<CutSite> {
117 let site_len = enzyme.recognition_site.len();
118 if seq.len() < site_len {
119 return Vec::new();
120 }
121
122 let mut sites = Vec::new();
123 for i in 0..=seq.len() - site_len {
124 let matches = enzyme
125 .recognition_site
126 .iter()
127 .zip(&seq[i..i + site_len])
128 .all(|(&code, &base)| iupac_matches(code, base));
129
130 if matches {
131 let cut_pos = i as isize + enzyme.cut_forward;
132 let cut_pos = cut_pos.max(0) as usize;
133
134 let overhang = if enzyme.cut_forward == enzyme.cut_reverse {
135 Overhang::Blunt
136 } else if enzyme.cut_forward < enzyme.cut_reverse {
137 let start = enzyme.cut_forward.max(0) as usize;
139 let end = (enzyme.cut_reverse as usize).min(site_len);
140 let oh = enzyme.recognition_site[start..end].to_vec();
141 Overhang::FivePrime(oh)
142 } else {
143 let start = enzyme.cut_reverse.max(0) as usize;
145 let end = (enzyme.cut_forward as usize).min(site_len);
146 let oh = enzyme.recognition_site[start..end].to_vec();
147 Overhang::ThreePrime(oh)
148 };
149
150 sites.push(CutSite {
151 position: cut_pos,
152 enzyme: enzyme.name.clone(),
153 overhang,
154 });
155 }
156 }
157 sites
158}
159
160pub fn digest(seq: &[u8], enzymes: &[&RestrictionEnzyme]) -> Vec<Fragment> {
165 let mut cut_positions: Vec<usize> = Vec::new();
166 for enz in enzymes {
167 for site in find_cut_sites(seq, enz) {
168 cut_positions.push(site.position);
169 }
170 }
171 cut_positions.sort_unstable();
172 cut_positions.dedup();
173
174 let mut fragments = Vec::new();
176 let mut prev = 0usize;
177 for &pos in &cut_positions {
178 if pos > prev && pos <= seq.len() {
179 fragments.push(Fragment {
180 start: prev,
181 end: pos,
182 length: pos - prev,
183 });
184 prev = pos;
185 }
186 }
187 if prev < seq.len() {
189 fragments.push(Fragment {
190 start: prev,
191 end: seq.len(),
192 length: seq.len() - prev,
193 });
194 }
195 fragments
196}
197
198pub fn fragment_sizes(seq: &[u8], enzymes: &[&RestrictionEnzyme]) -> Vec<usize> {
200 digest(seq, enzymes)
201 .into_iter()
202 .map(|f| f.length)
203 .collect()
204}
205
206#[cfg(test)]
207mod tests {
208 use super::*;
209
210 #[test]
211 fn ecori_cuts_correctly() {
212 let seq = b"AAAGAATTCAAA";
214 let ecori = &common_enzymes().into_iter().find(|e| e.name == "EcoRI").unwrap();
215 let sites = find_cut_sites(seq, ecori);
216 assert_eq!(sites.len(), 1);
217 assert_eq!(sites[0].position, 4); assert!(matches!(sites[0].overhang, Overhang::FivePrime(_)));
219 }
220
221 #[test]
222 fn blunt_end_enzyme() {
223 let seq = b"AAAGATATCAAA";
225 let ecorv = &common_enzymes().into_iter().find(|e| e.name == "EcoRV").unwrap();
226 let sites = find_cut_sites(seq, ecorv);
227 assert_eq!(sites.len(), 1);
228 assert_eq!(sites[0].overhang, Overhang::Blunt);
229 }
230
231 #[test]
232 fn no_sites_single_fragment() {
233 let seq = b"AAAAAAAAAA";
234 let ecori = &common_enzymes().into_iter().find(|e| e.name == "EcoRI").unwrap();
235 let fragments = digest(seq, &[ecori]);
236 assert_eq!(fragments.len(), 1);
237 assert_eq!(fragments[0].length, 10);
238 }
239
240 #[test]
241 fn iupac_degenerate_match() {
242 let enz = RestrictionEnzyme {
244 name: "TestEnz".into(),
245 recognition_site: b"GAANTC".to_vec(),
246 cut_forward: 1,
247 cut_reverse: 5,
248 is_palindromic: true,
249 };
250 assert_eq!(find_cut_sites(b"GAATTC", &enz).len(), 1);
252 assert_eq!(find_cut_sites(b"GAACTC", &enz).len(), 1);
253 assert_eq!(find_cut_sites(b"GAAATC", &enz).len(), 1);
254 assert_eq!(find_cut_sites(b"GAAGTC", &enz).len(), 1);
255 }
256
257 #[test]
258 fn common_enzymes_nonempty() {
259 let enzymes = common_enzymes();
260 assert_eq!(enzymes.len(), 20);
261 for e in &enzymes {
262 assert!(!e.name.is_empty());
263 assert!(!e.recognition_site.is_empty());
264 }
265 }
266}