diced_py/
lib.rs

1#![doc = include_str!("../README.md")]
2
3extern crate diced;
4extern crate pyo3;
5
6use pyo3::exceptions::PyIndexError;
7use pyo3::prelude::*;
8use pyo3::pybacked::PyBackedStr;
9use pyo3::types::PySlice;
10use pyo3::types::PyString;
11
/// The actual storage for the sequence data.
///
/// This wrapper is the sequence type plugged into the `diced` generics used
/// throughout this file (`diced::Region<Sequence>`, `diced::Crispr<Sequence>`
/// and `diced::Scanner<Sequence>`).
#[derive(Debug)]
struct Sequence {
    // String contents borrowed from a Python object (see `pyo3::pybacked`).
    data: PyBackedStr,
}
17
18impl From<PyBackedStr> for Sequence {
19    fn from(data: PyBackedStr) -> Self {
20        Self { data }
21    }
22}
23
24impl AsRef<str> for Sequence {
25    fn as_ref(&self) -> &str {
26        self.data.as_ref()
27    }
28}
29
30impl Clone for Sequence {
31    fn clone(&self) -> Self {
32        Self {
33            data: Python::with_gil(|py| {
34                self.data.to_object(py).extract::<PyBackedStr>(py).unwrap()
35            }),
36        }
37    }
38}
39
40/// A sequence region.
41#[pyclass(module = "diced.lib", frozen, subclass)]
42pub struct Region {
43    region: diced::Region<Sequence>,
44}
45
46#[pymethods]
47impl Region {
48    #[new]
49    pub fn __new__<'py>(
50        py: Python<'py>,
51        sequence: PyBackedStr,
52        start: usize,
53        end: usize,
54    ) -> PyResult<PyClassInitializer<Self>> {
55        if start > end || start > sequence.len() || end > sequence.len() {
56            let s = PySlice::new_bound(py, start as isize, end as isize, 1);
57            return Err(PyIndexError::new_err(s.to_object(py)));
58        }
59        Ok(Region {
60            region: diced::Region::new(Sequence::from(sequence), start, end),
61        }
62        .into())
63    }
64
65    /// `int`: The start coordinate of the region (zero-based).
66    #[getter]
67    pub fn start(&self) -> usize {
68        self.region.start()
69    }
70
71    /// `int`: The end coordinate of the region (zero-based, exclusive).
72    #[getter]
73    pub fn end(&self) -> usize {
74        self.region.end()
75    }
76
77    /// Get the sequence region as a string.
78    pub fn __str__<'py>(&self, py: Python<'py>) -> Bound<'py, PyString> {
79        PyString::new_bound(py, self.region.as_str())
80    }
81}
82
83/// A CRISPR repeat.
84#[pyclass(module="diced.lib", extends=Region)]
85pub struct Repeat {}
86
87#[pymethods]
88impl Repeat {
89    #[new]
90    pub fn __new__<'py>(
91        py: Python<'py>,
92        sequence: PyBackedStr,
93        start: usize,
94        end: usize,
95    ) -> PyResult<PyClassInitializer<Self>> {
96        Region::__new__(py, sequence, start, end).map(|r| r.add_subclass(Repeat {}))
97    }
98}
99
100/// A list of repeats inside a CRISPR region.
101#[pyclass(module = "diced.lib", sequence)]
102pub struct Repeats {
103    crispr: Py<Crispr>,
104}
105
106#[pymethods]
107impl Repeats {
108    pub fn __len__<'py>(&self, py: Python<'py>) -> usize {
109        self.crispr.borrow(py).crispr.len()
110    }
111
112    pub fn __getitem__<'py>(&self, py: Python<'py>, index: usize) -> PyResult<Py<Repeat>> {
113        self.crispr
114            .bind(py)
115            .borrow()
116            .crispr
117            .repeats()
118            .nth(index)
119            .ok_or(PyIndexError::new_err(index))
120            .and_then(|region| {
121                Py::new(
122                    py,
123                    PyClassInitializer::from(Region { region }).add_subclass(Repeat {}),
124                )
125            })
126    }
127}
128
129/// A CRISPR spacer.
130#[pyclass(module="diced.lib", extends=Region)]
131pub struct Spacer {}
132
133#[pymethods]
134impl Spacer {
135    #[new]
136    pub fn __new__<'py>(
137        py: Python<'py>,
138        sequence: PyBackedStr,
139        start: usize,
140        end: usize,
141    ) -> PyResult<PyClassInitializer<Self>> {
142        Region::__new__(py, sequence, start, end).map(|r| r.add_subclass(Spacer {}))
143    }
144}
145
146/// A list of spacers inside a CRISPR region.
147#[pyclass(module = "diced.lib", sequence)]
148pub struct Spacers {
149    crispr: Py<Crispr>,
150}
151
152#[pymethods]
153impl Spacers {
154    pub fn __len__<'py>(&self, py: Python<'py>) -> usize {
155        self.crispr.borrow(py).crispr.len().saturating_sub(1)
156    }
157
158    pub fn __getitem__<'py>(&self, py: Python<'py>, index: usize) -> PyResult<Py<Spacer>> {
159        self.crispr
160            .bind(py)
161            .borrow()
162            .crispr
163            .spacers()
164            .nth(index)
165            .ok_or(PyIndexError::new_err(index))
166            .and_then(|region| {
167                Py::new(
168                    py,
169                    PyClassInitializer::from(Region { region }).add_subclass(Spacer {}),
170                )
171            })
172    }
173}
174
/// A CRISPR region in a nucleotide sequence.
#[pyclass(module = "diced.lib")]
pub struct Crispr {
    // The wrapped `diced` CRISPR; every method below delegates to it.
    crispr: diced::Crispr<Sequence>,
}

#[pymethods]
impl Crispr {
    /// `int`: The start coordinate of the CRISPR region (zero-based).
    #[getter]
    pub fn start(&self) -> usize {
        self.crispr.start()
    }

    /// `int`: The end coordinate of the CRISPR region (zero-based, exclusive).
    #[getter]
    pub fn end(&self) -> usize {
        self.crispr.end()
    }

    /// `~diced.Repeats`: The list of repeats inside the CRISPR region.
    #[getter]
    pub fn repeats(slf: Py<Self>) -> Repeats {
        // The view stores a handle to this object instead of copying data.
        Repeats { crispr: slf }
    }

    /// `~diced.Spacers`: The list of spacers inside the CRISPR region.
    #[getter]
    pub fn spacers(slf: Py<Self>) -> Spacers {
        // The view stores a handle to this object instead of copying data.
        Spacers { crispr: slf }
    }

    /// Return the length reported by the wrapped `diced::Crispr`.
    // NOTE(review): `Repeats.__len__` returns this same value, so this is
    // presumably the number of repeats -- confirm against the `diced` crate.
    pub fn __len__(&self) -> usize {
        self.crispr.len()
    }

    /// Get the whole CRISPR region as a string.
    pub fn __str__<'py>(&self, py: Python<'py>) -> Bound<'py, PyString> {
        PyString::new_bound(py, self.crispr.to_region().as_str())
    }
}
215
216/// A scanner for iterating on the CRISPR regions of a genome.
217#[pyclass(module = "diced.lib")]
218pub struct Scanner {
219    scanner: diced::Scanner<Sequence>,
220}
221
222#[pymethods]
223impl Scanner {
224    fn __iter__(slf: PyRef<Self>) -> PyRef<Self> {
225        slf
226    }
227
228    /// Return the next CRISPR region, if any.
229    ///
230    /// Returns:
231    ///     `~diced.Crispr`: The next CRISPR region in the sequence.
232    ///
233    /// Raises:
234    ///     `StopIteration`: When the end of the sequence has been reached
235    ///         without finding new CRISPR regions.
236    ///
237    fn __next__<'py>(&mut self, py: Python<'py>) -> PyResult<Option<Crispr>> {
238        match py.allow_threads(move || self.scanner.next()) {
239            Some(crispr) => Ok(Some(Crispr { crispr })),
240            None => Ok(None),
241        }
242    }
243
244    /// `str`: The genomic sequence being scanned.
245    #[getter]
246    fn sequence<'py>(&self, py: Python<'py>) -> Py<PyAny> {
247        self.scanner.sequence().data.to_object(py)
248    }
249}
250
251/// Scan a genome sequence for CRISPRs repeats.
252///
253/// Arguments:
254///     sequence (`str`): A string containing the genomic sequence to build
255///         a scanner for.
256///
257/// Returns:
258///     `~diced.Scanner`: A scanner yielding CRISPRs in the given contig.
259///
260#[pyfunction]
261pub fn scan(sequence: PyBackedStr) -> PyResult<Scanner> {
262    let builder = diced::ScannerBuilder::new();
263    let scanner = builder.scan(Sequence::from(sequence));
264    Ok(Scanner { scanner })
265}
266
/// PyO3 bindings to ``diced``, a library for CRISPR detection.
///
/// Diced is a re-implementation of MinCED, a method developed by
/// `Connor T. Skennerton <https://github.com/ctSkennerton>`_ to identify
/// CRISPRs in isolate and metagenomic-assembled genomes. It was derived
/// from the CRISPR recognition tool developed by Charles Bland *et al.*.
///
/// Example:
///     Load a genome from a FASTA file using Biopython::
///
///         >>> import Bio.SeqIO
///         >>> record = Bio.SeqIO.read("Aquifex_aeolicus_VF5.fna", "fasta")
///
///     Detect CRISPR regions with Diced using the default parameters::
///
///         >>> import diced
///         >>> for crispr in diced.scan(str(record.seq[:300000])):
///         ...     print(
///         ...         crispr.start,
///         ...         crispr.end,
///         ...         len(crispr.repeats),
///         ...         crispr.repeats[0],
///         ...     )
///         156459 156767 5 GTTCCTAATGTACCGTGTGGAGTTGAAACC
///         244560 244791 4 GTTTCAACTCCACACGGTACATTAGGAAC
///         279263 279555 5 GTTTTAACTCCACACGGTACATTAGAAAC
///
#[pymodule]
#[pyo3(name = "lib")]
pub fn init(_py: Python, m: Bound<PyModule>) -> PyResult<()> {
    // Module metadata. The version and authors are read from the Cargo
    // manifest at compile time.
    m.add("__package__", "diced")?;
    m.add("__version__", env!("CARGO_PKG_VERSION"))?;
    // Cargo joins multiple authors with `:`; expose one author per line.
    m.add("__author__", env!("CARGO_PKG_AUTHORS").replace(':', "\n"))?;

    // Classes exposed by the extension module.
    m.add_class::<Crispr>()?;
    m.add_class::<Region>()?;
    m.add_class::<Scanner>()?;
    m.add_class::<Repeat>()?;
    m.add_class::<Repeats>()?;
    m.add_class::<Spacer>()?;
    m.add_class::<Spacers>()?;

    // Module-level functions.
    m.add_function(wrap_pyfunction!(scan, &m)?)?;

    Ok(())
}