Skip to main content

diced_py/
lib.rs

1#![doc = include_str!("../README.md")]
2
3extern crate diced;
4extern crate pyo3;
5
6use pyo3::exceptions::PyIndexError;
7use pyo3::prelude::*;
8use pyo3::pybacked::PyBackedStr;
9use pyo3::types::PySlice;
10use pyo3::types::PyString;
11
/// The actual storage for the sequence data.
///
/// Wraps a `PyBackedStr` so that it can be handed to the generic `diced`
/// types used in this file (`diced::Region<Sequence>`,
/// `diced::Crispr<Sequence>`, `diced::Scanner<Sequence>`).
#[derive(Debug)]
struct Sequence {
    // The sequence text, held as a `PyBackedStr` extracted from the Python
    // caller's `str` argument.
    data: PyBackedStr,
}
17
18impl From<PyBackedStr> for Sequence {
19    fn from(data: PyBackedStr) -> Self {
20        Self { data }
21    }
22}
23
24impl AsRef<str> for Sequence {
25    fn as_ref(&self) -> &str {
26        self.data.as_ref()
27    }
28}
29
30impl Clone for Sequence {
31    fn clone(&self) -> Self {
32        Self {
33            data: Python::attach(|py| self.data.clone_ref(py)),
34        }
35    }
36}
37
38/// A sequence region.
39#[pyclass(module = "diced.lib", frozen, subclass)]
40pub struct Region {
41    region: diced::Region<Sequence>,
42}
43
44#[pymethods]
45impl Region {
46    #[new]
47    pub fn __new__<'py>(
48        py: Python<'py>,
49        sequence: PyBackedStr,
50        start: usize,
51        end: usize,
52    ) -> PyResult<PyClassInitializer<Self>> {
53        if start > end || start > sequence.len() || end > sequence.len() {
54            let s = PySlice::new(py, start as isize, end as isize, 1).unbind();
55            return Err(PyIndexError::new_err((s,)));
56        }
57        Ok(Region {
58            region: diced::Region::new(Sequence::from(sequence), start, end),
59        }
60        .into())
61    }
62
63    /// `int`: The start coordinate of the region (zero-based).
64    #[getter]
65    pub fn start(&self) -> usize {
66        self.region.start()
67    }
68
69    /// `int`: The end coordinate of the region (zero-based, exclusive).
70    #[getter]
71    pub fn end(&self) -> usize {
72        self.region.end()
73    }
74
75    /// Get the sequence region as a string.
76    pub fn __str__<'py>(&self, py: Python<'py>) -> Bound<'py, PyString> {
77        PyString::new(py, self.region.as_str())
78    }
79}
80
81/// A CRISPR repeat.
82#[pyclass(module="diced.lib", extends=Region)]
83pub struct Repeat {}
84
85#[pymethods]
86impl Repeat {
87    #[new]
88    pub fn __new__<'py>(
89        py: Python<'py>,
90        sequence: PyBackedStr,
91        start: usize,
92        end: usize,
93    ) -> PyResult<PyClassInitializer<Self>> {
94        Region::__new__(py, sequence, start, end).map(|r| r.add_subclass(Repeat {}))
95    }
96}
97
98/// A list of repeats inside a CRISPR region.
99#[pyclass(module = "diced.lib", sequence)]
100pub struct Repeats {
101    crispr: Py<Crispr>,
102}
103
104#[pymethods]
105impl Repeats {
106    pub fn __len__<'py>(&self, py: Python<'py>) -> usize {
107        self.crispr.borrow(py).crispr.len()
108    }
109
110    pub fn __getitem__<'py>(&self, py: Python<'py>, index: usize) -> PyResult<Py<Repeat>> {
111        self.crispr
112            .bind(py)
113            .borrow()
114            .crispr
115            .repeats()
116            .nth(index)
117            .ok_or(PyIndexError::new_err(index))
118            .and_then(|region| {
119                Py::new(
120                    py,
121                    PyClassInitializer::from(Region { region }).add_subclass(Repeat {}),
122                )
123            })
124    }
125}
126
127/// A CRISPR spacer.
128#[pyclass(module="diced.lib", extends=Region)]
129pub struct Spacer {}
130
131#[pymethods]
132impl Spacer {
133    #[new]
134    pub fn __new__<'py>(
135        py: Python<'py>,
136        sequence: PyBackedStr,
137        start: usize,
138        end: usize,
139    ) -> PyResult<PyClassInitializer<Self>> {
140        Region::__new__(py, sequence, start, end).map(|r| r.add_subclass(Spacer {}))
141    }
142}
143
144/// A list of spacers inside a CRISPR region.
145#[pyclass(module = "diced.lib", sequence)]
146pub struct Spacers {
147    crispr: Py<Crispr>,
148}
149
150#[pymethods]
151impl Spacers {
152    pub fn __len__<'py>(&self, py: Python<'py>) -> usize {
153        self.crispr.borrow(py).crispr.len().saturating_sub(1)
154    }
155
156    pub fn __getitem__<'py>(&self, py: Python<'py>, index: usize) -> PyResult<Py<Spacer>> {
157        self.crispr
158            .bind(py)
159            .borrow()
160            .crispr
161            .spacers()
162            .nth(index)
163            .ok_or(PyIndexError::new_err(index))
164            .and_then(|region| {
165                Py::new(
166                    py,
167                    PyClassInitializer::from(Region { region }).add_subclass(Spacer {}),
168                )
169            })
170    }
171}
172
173/// A CRISPR region in a nucleotide sequence.
174#[pyclass(module = "diced.lib")]
175pub struct Crispr {
176    crispr: diced::Crispr<Sequence>,
177}
178
179#[pymethods]
180impl Crispr {
181    /// `int`: The start coordinate of the CRISPR region (zero-based).
182    #[getter]
183    pub fn start(&self) -> usize {
184        self.crispr.start()
185    }
186
187    /// `int`: The end coordinate of the CRISPR region (zero-based, exclusive).
188    #[getter]
189    pub fn end(&self) -> usize {
190        self.crispr.end()
191    }
192
193    /// `~diced.Repeats`: The list of repeats inside the CRISPR region.
194    #[getter]
195    pub fn repeats(slf: Py<Self>) -> Repeats {
196        Repeats { crispr: slf }
197    }
198
199    /// `~diced.Spacers`: The list of spacers inside the CRISPR region.
200    #[getter]
201    pub fn spacers(slf: Py<Self>) -> Spacers {
202        Spacers { crispr: slf }
203    }
204
205    pub fn __len__(&self) -> usize {
206        self.crispr.len()
207    }
208
209    pub fn __str__<'py>(&self, py: Python<'py>) -> Bound<'py, PyString> {
210        PyString::new(py, self.crispr.to_region().as_str())
211    }
212}
213
214/// A scanner for iterating on the CRISPR regions of a genome.
215#[pyclass(module = "diced.lib")]
216pub struct Scanner {
217    scanner: diced::Scanner<Sequence>,
218}
219
220#[pymethods]
221impl Scanner {
222    fn __iter__(slf: PyRef<Self>) -> PyRef<Self> {
223        slf
224    }
225
226    /// Return the next CRISPR region, if any.
227    ///
228    /// Returns:
229    ///     `~diced.Crispr`: The next CRISPR region in the sequence.
230    ///
231    /// Raises:
232    ///     `StopIteration`: When the end of the sequence has been reached
233    ///         without finding new CRISPR regions.
234    ///
235    fn __next__<'py>(&mut self, py: Python<'py>) -> PyResult<Option<Crispr>> {
236        match py.detach(move || self.scanner.next()) {
237            Some(crispr) => Ok(Some(Crispr { crispr })),
238            None => Ok(None),
239        }
240    }
241
242    /// `str`: The genomic sequence being scanned.
243    #[getter]
244    fn sequence<'py>(&self, py: Python<'py>) -> PyResult<Bound<'py, PyString>> {
245        self.scanner
246            .sequence()
247            .data
248            .clone_ref(py)
249            .into_pyobject(py)
250            .map_err(PyErr::from)
251    }
252}
253
254/// Scan a genome sequence for CRISPRs repeats.
255///
256/// Arguments:
257///     sequence (`str`): A string containing the genomic sequence to build
258///         a scanner for.
259///
260/// Returns:
261///     `~diced.Scanner`: A scanner yielding CRISPRs in the given contig.
262///
263#[pyfunction]
264pub fn scan(sequence: PyBackedStr) -> PyResult<Scanner> {
265    let builder = diced::ScannerBuilder::new();
266    let scanner = builder.scan(Sequence::from(sequence));
267    Ok(Scanner { scanner })
268}
269
270/// PyO3 bindings to ``diced``, a library for CRISPRs detection.
271///
272/// Diced is re-implementation of MinCED, a method developed by
273/// `Connor T. Skennerton <https://github.com/ctSkennerton>`_ to identify
274/// CRISPRs in isolate and metagenomic-assembled genomes. It was derived
275/// from the CRISPR recognition tool developed by Charles Bland *et al.*.
276///
277/// Example:
278///     Load a genome from a FASTA file using Biopython::
279///
280///         >>> import Bio.SeqIO
281///         >>> record = Bio.SeqIO.read("Aquifex_aeolicus_VF5.fna", "fasta")
282///
283///     Detect CRISPR regions with Diced using the default parameters::
284///
285///         >>> import diced
286///         >>> for crispr in diced.scan(str(record.seq[:300000])):
287///         ...     print(
288///         ...         crispr.start,
289///         ...         crispr.end,
290///         ...         len(crispr.repeats),
291///         ...         crispr.repeats[0],
292///         ...     )
293///         156459 156767 5 GTTCCTAATGTACCGTGTGGAGTTGAAACC
294///         244560 244791 4 GTTTCAACTCCACACGGTACATTAGGAAC
295///         279263 279555 5 GTTTTAACTCCACACGGTACATTAGAAAC
296///
297#[pymodule]
298#[pyo3(name = "lib")]
299pub fn init(_py: Python, m: Bound<PyModule>) -> PyResult<()> {
300    m.add("__package__", "diced")?;
301    m.add("__version__", env!("CARGO_PKG_VERSION"))?;
302    m.add("__author__", env!("CARGO_PKG_AUTHORS").replace(':', "\n"))?;
303
304    m.add_class::<Crispr>()?;
305    m.add_class::<Region>()?;
306    m.add_class::<Scanner>()?;
307    m.add_class::<Repeat>()?;
308    m.add_class::<Repeats>()?;
309    m.add_class::<Spacer>()?;
310    m.add_class::<Spacers>()?;
311
312    m.add_function(wrap_pyfunction!(scan, &m)?)?;
313
314    Ok(())
315}