nafcodec_py/
lib.rs

1#![doc = include_str!("../README.md")]
2
3#[macro_use]
4extern crate pyo3_built;
5extern crate nafcodec;
6extern crate pyo3;
7
8mod pyfile;
9
10use std::borrow::Cow;
11use std::convert::Infallible;
12use std::io::BufReader;
13use std::ops::DerefMut;
14
15use nafcodec::DecoderBuilder;
16use pyo3::exceptions::PyFileNotFoundError;
17use pyo3::exceptions::PyIsADirectoryError;
18use pyo3::exceptions::PyOSError;
19use pyo3::exceptions::PyRuntimeError;
20use pyo3::exceptions::PyUnicodeError;
21use pyo3::exceptions::PyValueError;
22use pyo3::prelude::*;
23use pyo3::types::PyDict;
24use pyo3::types::PyList;
25use pyo3::types::PyString;
26use pyo3::PyTypeInfo;
27
28use self::pyfile::PyFileRead;
29use self::pyfile::PyFileReadWrapper;
30use self::pyfile::PyFileWrite;
31use self::pyfile::PyFileWriteWrapper;
32
// Build metadata: pulls in the `built.rs` file found in Cargo's `OUT_DIR`
// at compile time. NOTE(review): presumably generated by the build script
// through the `built` crate -- confirm against `build.rs`.
// `dead_code` is allowed because not every included item is referenced here.
#[allow(dead_code)]
mod build {
    include!(concat!(env!("OUT_DIR"), "/built.rs"));
}
37
38/// Convert a `nafcodec::error::Error` into a Python exception.
39fn convert_error(_py: Python, error: nafcodec::error::Error, path: Option<&str>) -> PyErr {
40    use nafcodec::error::Error;
41
42    match error {
43        Error::Utf8(_utf8_error) => PyUnicodeError::new_err("failed to decode UTF-8 data"),
44        Error::Nom(nom_error) => {
45            PyValueError::new_err(format!("parser failed: {:?}", nom_error.code))
46        }
47        Error::MissingField(field) => {
48            PyValueError::new_err(format!("missing record field: {:?}", field))
49        }
50        Error::InvalidLength => PyValueError::new_err("inconsistent sequence length"),
51        Error::InvalidSequence => PyValueError::new_err("invalid characters found in sequence"),
52        Error::Io(io_error) => {
53            let desc = io_error.to_string();
54            if let Some(p) = path.map(str::to_string) {
55                match io_error.raw_os_error() {
56                    Some(2) => PyFileNotFoundError::new_err((p,)),
57                    #[cfg(target_os = "windows")]
58                    Some(3) => PyFileNotFoundError::new_err((p,)),
59                    #[cfg(not(target_os = "windows"))]
60                    Some(21) => PyIsADirectoryError::new_err((p,)),
61                    Some(code) => PyOSError::new_err((code, desc, p)),
62                    None => PyOSError::new_err((desc,)),
63                }
64            } else {
65                match io_error.raw_os_error() {
66                    Some(2) => PyFileNotFoundError::new_err((desc,)),
67                    #[cfg(target_os = "windows")]
68                    Some(3) => PyFileNotFoundError::new_err((desc,)),
69                    #[cfg(not(target_os = "windows"))]
70                    Some(21) => PyIsADirectoryError::new_err((desc,)),
71                    Some(code) => PyOSError::new_err((code, desc)),
72                    None => PyOSError::new_err((desc,)),
73                }
74            }
75        }
76    }
77}
78
79// ---------------------------------------------------------------------------
80
/// Newtype wrapper around `nafcodec::SequenceType`.
///
/// The wrapper exists so that conversion traits to and from Python objects
/// can be implemented for the sequence type in this crate.
pub struct SequenceType(nafcodec::SequenceType);
82
83impl<'py> FromPyObject<'py> for SequenceType {
84    fn extract_bound(ob: &Bound<'py, PyAny>) -> PyResult<Self> {
85        let py = ob.py();
86        match ob.downcast::<PyString>()?.to_string_lossy().as_ref() {
87            "dna" => Ok(SequenceType(nafcodec::SequenceType::Dna)),
88            "rna" => Ok(SequenceType(nafcodec::SequenceType::Rna)),
89            "protein" => Ok(SequenceType(nafcodec::SequenceType::Protein)),
90            "text" => Ok(SequenceType(nafcodec::SequenceType::Text)),
91            other => {
92                let msg = PyString::new(py, "expected 'dna', 'rna', 'protein' or 'text', got {!r}")
93                    .call_method1("format", (other,))?
94                    .unbind()
95                    .into_any();
96                Err(PyValueError::new_err(msg))
97            }
98        }
99    }
100}
101
102impl<'py> IntoPyObject<'py> for SequenceType {
103    type Target = PyString;
104    type Output = Bound<'py, Self::Target>;
105    type Error = Infallible;
106
107    fn into_pyobject(self, py: Python<'py>) -> Result<Self::Output, Self::Error> {
108        let tag = match self.0 {
109            nafcodec::SequenceType::Dna => pyo3::intern!(py, "dna"),
110            nafcodec::SequenceType::Rna => pyo3::intern!(py, "rna"),
111            nafcodec::SequenceType::Protein => pyo3::intern!(py, "protein"),
112            nafcodec::SequenceType::Text => pyo3::intern!(py, "text"),
113        };
114        Ok(tag.clone())
115    }
116}
117
118impl From<nafcodec::SequenceType> for SequenceType {
119    fn from(ty: nafcodec::SequenceType) -> Self {
120        Self(ty)
121    }
122}
123
124impl From<SequenceType> for nafcodec::SequenceType {
125    fn from(ty: SequenceType) -> Self {
126        ty.0
127    }
128}
129
130// ---------------------------------------------------------------------------
131
/// The mode used to open an archive, extracted from the Python strings
/// `'r'` and `'w'`.
#[derive(Clone, Copy, PartialEq)]
pub enum OpenMode {
    /// Read an existing archive (Python string `'r'`).
    Read,
    /// Write a new archive (Python string `'w'`).
    Write,
}
137
138impl<'py> FromPyObject<'py> for OpenMode {
139    fn extract_bound(ob: &Bound<'py, PyAny>) -> PyResult<Self> {
140        let py = ob.py();
141        match ob.downcast::<PyString>()?.to_string_lossy().as_ref() {
142            "r" => Ok(OpenMode::Read),
143            "w" => Ok(OpenMode::Write),
144            other => {
145                let msg = PyString::new(py, "expected 'r' or 'w', got {!r}")
146                    .call_method1("format", (other,))?
147                    .unbind()
148                    .into_any();
149                Err(PyValueError::new_err(msg))
150            }
151        }
152    }
153}
154
155// ---------------------------------------------------------------------------
156
/// A single sequence record stored in a Nucleotide Archive Format file.
///
/// Every field is optional, and each one is exposed to Python as a
/// read/write attribute.
#[pyclass(module = "nafcodec")]
#[derive(Clone, Debug)]
pub struct Record {
    /// `str` or `None`: The record identifier.
    #[pyo3(get, set)]
    id: Option<Py<PyString>>,
    /// `str` or `None`: The record comment.
    #[pyo3(get, set)]
    comment: Option<Py<PyString>>,
    /// `str` or `None`: The record sequence.
    #[pyo3(get, set)]
    sequence: Option<Py<PyString>>,
    /// `str` or `None`: The record quality.
    #[pyo3(get, set)]
    quality: Option<Py<PyString>>,
    /// `int` or `None`: The record sequence length.
    #[pyo3(get, set)]
    length: Option<u64>,
}
177
178impl Record {
179    pub fn from_py<'py>(py: Python<'py>, record: nafcodec::Record) -> Self {
180        let id = record.id.map(|x| PyString::new(py, &x).into());
181        let sequence = record.sequence.map(|x| PyString::new(py, &x).into());
182        let comment = record.comment.map(|x| PyString::new(py, &x).into());
183        let quality = record.quality.map(|x| PyString::new(py, &x).into());
184        let length = record.length;
185        Self {
186            id,
187            sequence,
188            comment,
189            quality,
190            length,
191        }
192    }
193}
194
195#[pymethods]
196impl Record {
197    #[new]
198    #[pyo3(signature = (*, id=None, comment=None, sequence=None, quality=None, length=None))]
199    fn __init__<'py>(
200        py: Python<'py>,
201        id: Option<Py<PyString>>,
202        comment: Option<Py<PyString>>,
203        sequence: Option<Py<PyString>>,
204        quality: Option<Py<PyString>>,
205        mut length: Option<u64>,
206    ) -> PyResult<PyClassInitializer<Self>> {
207        // Check lengths are consistent.
208        if let Some(seq) = sequence.as_ref() {
209            if let Some(qual) = quality.as_ref() {
210                if seq.bind(py).len()? != qual.bind(py).len()? {
211                    return Err(PyValueError::new_err(
212                        "lengths of sequence and quality don't match",
213                    ));
214                }
215            }
216            if let Some(&l) = length.as_ref() {
217                if seq.bind(py).len()? != l as usize {
218                    return Err(PyValueError::new_err(
219                        "length of sequence and record length don't match",
220                    ));
221                }
222            } else {
223                length = Some(seq.bind(py).len()? as u64);
224            }
225        }
226        if let Some(qual) = quality.as_ref() {
227            if let Some(&l) = length.as_ref() {
228                if qual.bind(py).len()? != l as usize {
229                    return Err(PyValueError::new_err(
230                        "length of quality and record length don't match",
231                    ));
232                }
233            } else {
234                length = Some(qual.bind(py).len()? as u64);
235            }
236        }
237
238        Ok(PyClassInitializer::from(Record {
239            id,
240            comment,
241            sequence,
242            quality,
243            length,
244        }))
245    }
246
247    fn __repr__<'py>(slf: &Bound<'py, Self>) -> PyResult<Bound<'py, PyAny>> {
248        let py = slf.py();
249        let format = pyo3::intern!(py, "format");
250        let args = PyList::empty(py);
251        let record = slf.borrow();
252        if let Some(id) = &record.id {
253            args.append(pyo3::intern!(py, "id={!r}").call_method1(format, (id,))?)?;
254        }
255        if let Some(comment) = &record.comment {
256            args.append(pyo3::intern!(py, "comment={!r}").call_method1(format, (comment,))?)?;
257        }
258        if let Some(sequence) = &record.sequence {
259            args.append(pyo3::intern!(py, "sequence={!r}").call_method1(format, (sequence,))?)?;
260        }
261        if let Some(quality) = &record.quality {
262            args.append(pyo3::intern!(py, "quality={!r}").call_method1(format, (quality,))?)?;
263        }
264        if let Some(length) = &record.length {
265            args.append(format!("length={}", length))?;
266        }
267        pyo3::intern!(py, "{}({})").call_method1(
268            format,
269            (
270                slf.get_type().name()?,
271                pyo3::intern!(py, ", ").call_method1("join", (args,))?,
272            ),
273        )
274    }
275}
276
277impl TryFrom<&Record> for nafcodec::Record<'static> {
278    type Error = PyErr;
279    fn try_from(value: &Record) -> Result<Self, PyErr> {
280        Python::with_gil(|py| {
281            let id = value
282                .id
283                .as_ref()
284                .map(|s| s.to_str(py))
285                .transpose()?
286                .map(String::from)
287                .map(Cow::Owned);
288            let comment = value
289                .comment
290                .as_ref()
291                .map(|s| s.to_str(py))
292                .transpose()?
293                .map(String::from)
294                .map(Cow::Owned);
295            let sequence = value
296                .sequence
297                .as_ref()
298                .map(|s| s.to_str(py))
299                .transpose()?
300                .map(String::from)
301                .map(Cow::Owned);
302            let quality = value
303                .quality
304                .as_ref()
305                .map(|s| s.to_str(py))
306                .transpose()?
307                .map(String::from)
308                .map(Cow::Owned);
309            let length = value.length.clone();
310            Ok(nafcodec::Record {
311                id,
312                comment,
313                sequence,
314                quality,
315                length,
316            })
317        })
318    }
319}
320
321// ---------------------------------------------------------------------------
322
/// A streaming decoder to read a Nucleotide Archive Format file.
#[pyclass(module = "nafcodec")]
pub struct Decoder {
    // The wrapped decoder, reading through a `BufReader` from either a
    // Python file-like object or a file opened from a path (see
    // `PyFileReadWrapper`).
    decoder: nafcodec::Decoder<'static, BufReader<PyFileReadWrapper>>,
}
328
329#[pymethods]
330impl Decoder {
331    #[new]
332    #[pyo3(signature = (file, *, id=true, comment=true, sequence=true, quality=true, mask=true, buffer_size=None))]
333    pub fn __init__<'py>(
334        file: Bound<'py, PyAny>,
335        id: bool,
336        comment: bool,
337        sequence: bool,
338        quality: bool,
339        mask: bool,
340        buffer_size: Option<usize>,
341    ) -> PyResult<PyClassInitializer<Self>> {
342        let py = file.py();
343
344        let mut builder = DecoderBuilder::new();
345        builder.id(id);
346        builder.comment(comment);
347        builder.sequence(sequence);
348        builder.quality(quality);
349        builder.mask(mask);
350        builder.buffer_size(buffer_size.map(Ok).unwrap_or_else(|| {
351            py.import(pyo3::intern!(py, "io"))?
352                .getattr(pyo3::intern!(py, "DEFAULT_BUFFER_SIZE"))?
353                .extract::<usize>()
354        })?);
355
356        let decoder = match PyFileRead::from_ref(&file) {
357            Ok(handle) => {
358                let wrapper = PyFileReadWrapper::PyFile(handle);
359                builder
360                    .with_reader(std::io::BufReader::new(wrapper))
361                    .map_err(|e| convert_error(py, e, None))?
362            }
363            Err(_e) => {
364                let path = py
365                    .import("os")?
366                    .call_method1(pyo3::intern!(py, "fspath"), (file,))?
367                    .extract::<Bound<'_, PyString>>()?;
368                let path_str = path.to_str()?;
369                let wrapper = std::fs::File::open(path_str)
370                    .map_err(nafcodec::error::Error::Io)
371                    .map_err(|e| convert_error(py, e, Some(path_str)))
372                    .map(PyFileReadWrapper::File)?;
373                builder
374                    .with_reader(std::io::BufReader::new(wrapper))
375                    .map_err(|e| convert_error(py, e, Some(path_str)))?
376            }
377        };
378
379        Ok(Decoder { decoder }.into())
380    }
381
382    pub fn __iter__(slf: PyRef<'_, Self>) -> PyResult<PyRef<'_, Self>> {
383        Ok(slf)
384    }
385
386    pub fn __len__(slf: PyRef<'_, Self>) -> PyResult<usize> {
387        Ok(slf.decoder.len())
388    }
389
390    pub fn __next__(mut slf: PyRefMut<'_, Self>) -> PyResult<Option<Record>> {
391        let py = slf.py();
392        let result = slf.deref_mut().decoder.next().transpose();
393        match result {
394            Ok(None) => Ok(None),
395            Ok(Some(record)) => Ok(Some(Record::from_py(py, record))),
396            Err(e) => Err(convert_error(py, e, None)),
397        }
398    }
399
400    pub fn __enter__<'py>(slf: PyRef<'py, Self>) -> PyRef<'py, Self> {
401        slf
402    }
403
404    #[allow(unused)]
405    pub fn __exit__<'py>(
406        slf: PyRefMut<'py, Self>,
407        exc_type: Bound<'py, PyAny>,
408        exc_value: Bound<'py, PyAny>,
409        traceback: Bound<'py, PyAny>,
410    ) -> PyResult<bool> {
411        Ok(false)
412    }
413
414    /// `str`: The type of sequence stored in the archive.
415    #[getter]
416    pub fn sequence_type(slf: PyRef<'_, Self>) -> SequenceType {
417        SequenceType(slf.decoder.sequence_type())
418    }
419
420    /// `str`: The length of sequence lines in the original FASTA file.
421    #[getter]
422    pub fn format_version(slf: PyRef<'_, Self>) -> &Bound<'_, PyString> {
423        use nafcodec::FormatVersion;
424        let py = slf.py();
425        match slf.decoder.header().format_version() {
426            FormatVersion::V1 => pyo3::intern!(py, "v1"),
427            FormatVersion::V2 => pyo3::intern!(py, "v2"),
428        }
429    }
430
431    /// `int`: The length of sequence lines in the original FASTA file.
432    #[getter]
433    pub fn line_length(slf: PyRef<'_, Self>) -> u64 {
434        slf.decoder.header().line_length()
435    }
436
437    /// `str`: The separator between sequence identifiers and comments.
438    #[getter]
439    pub fn name_separator(slf: PyRef<'_, Self>) -> char {
440        slf.decoder.header().name_separator()
441    }
442
443    /// `int`: The total number of sequences stored in the archive.
444    #[getter]
445    pub fn number_of_sequences(slf: PyRef<'_, Self>) -> u64 {
446        slf.decoder.header().number_of_sequences()
447    }
448
449    /// Read the next record from the archive.
450    ///
451    /// This method will returns `None` when no more records are available.
452    pub fn read(mut slf: PyRefMut<'_, Self>) -> PyResult<Option<Record>> {
453        let py = slf.py();
454        let result = slf.deref_mut().decoder.next().transpose();
455        match result {
456            Ok(None) => Ok(None),
457            Ok(Some(record)) => Ok(Some(Record::from_py(py, record))),
458            Err(e) => Err(convert_error(py, e, None)),
459        }
460    }
461}
462
463// ---------------------------------------------------------------------------
464
/// An encoder to iteratively write a Nucleotide Archive Format file.
#[pyclass(module = "nafcodec")]
pub struct Encoder {
    // The in-memory encoder receiving the records; set to `None` once the
    // encoder has been closed.
    encoder: Option<nafcodec::Encoder<'static, nafcodec::Memory>>,
    // The destination the archive is written to when the encoder is closed.
    file: PyFileWriteWrapper,
}
471
#[pymethods]
impl Encoder {
    /// Create a new encoder writing to `file`.
    ///
    /// `file` is first tried as a Python file-like object; when that fails
    /// it is converted to a path with `os.fspath` and created on the
    /// filesystem. The remaining arguments are forwarded to the
    /// `nafcodec::EncoderBuilder`, which is asked for an in-memory encoder.
    #[new]
    #[pyo3(signature=(
        file,
        sequence_type=SequenceType(nafcodec::SequenceType::Dna),
        *,
        id = false,
        comment = false,
        sequence = false,
        quality = false,
        compression_level = 0,
    ))]
    pub fn __init__<'py>(
        file: Bound<'py, PyAny>,
        sequence_type: SequenceType,
        id: bool,
        comment: bool,
        sequence: bool,
        quality: bool,
        compression_level: i32,
    ) -> PyResult<PyClassInitializer<Self>> {
        let py = file.py();
        let file = match PyFileWrite::from_ref(&file) {
            // `file` is a Python file-like object: write through it.
            Ok(handle) => PyFileWriteWrapper::PyFile(handle),
            // Otherwise treat `file` as a path and create it natively; the
            // path is attached to any I/O error raised from here.
            Err(_e) => {
                let path = py
                    .import("os")?
                    .call_method1(pyo3::intern!(py, "fspath"), (file,))?
                    .extract::<Bound<'_, PyString>>()?;
                let path_str = path.to_str()?;
                std::fs::File::create(path_str)
                    .map_err(nafcodec::error::Error::Io)
                    .map_err(|e| convert_error(py, e, Some(path_str)))
                    .map(PyFileWriteWrapper::File)?
            }
        };
        let encoder = nafcodec::EncoderBuilder::new(sequence_type.0)
            .id(id)
            .comment(comment)
            .quality(quality)
            .sequence(sequence)
            .compression_level(compression_level)
            .with_memory()
            .map(Some)
            .map_err(|e| convert_error(py, e, None))?;
        Ok(Self { file, encoder }.into())
    }

    /// Enter the context manager, returning the encoder itself.
    pub fn __enter__<'py>(slf: PyRef<'py, Self>) -> PyRef<'py, Self> {
        slf
    }

    /// Exit the context manager, closing the encoder.
    ///
    /// Returns `False` so that exceptions raised inside the `with` block
    /// are not suppressed.
    #[allow(unused)]
    pub fn __exit__<'py>(
        slf: PyRefMut<'py, Self>,
        exc_type: Bound<'py, PyAny>,
        exc_value: Bound<'py, PyAny>,
        traceback: Bound<'py, PyAny>,
    ) -> PyResult<bool> {
        Encoder::close(slf)?;
        Ok(false)
    }

    /// Push a record to the encoder.
    ///
    /// Raises a `RuntimeError` when the encoder has already been closed.
    pub fn write<'py>(mut slf: PyRefMut<'py, Self>, record: &'py Record) -> PyResult<()> {
        let py = slf.py();

        // This macro borrows a field from the Python record and yields a
        // `Cow<'py, str>` instead of copying the data.
        //
        // The problem here is that the borrow needs to live long enough
        // for all fields to be read and for the resulting record to be
        // written to the encoder. However, because every field is optional,
        // the borrows would occur inside `if let` blocks:
        //
        // ```
        // if let Some(x) = record.id {
        //     id = Some(x.bind(py).as_borrowed().to_str());
        // }
        // ```
        //
        // To avoid this, the borrowed reference is stored in an external
        // variable that outlives the `if let` scope.
        //
        macro_rules! borrow_field {
            ($field:ident) => {
                #[allow(unused_assignments)]
                let mut borrowed = None;
                let mut $field = None;
                if let Some(x) = record.$field.as_ref() {
                    let s = x.bind(py);
                    let b = s.as_borrowed();
                    borrowed = Some(b);
                    $field = borrowed.as_ref().map(|b| b.to_cow()).transpose()?;
                }
            };
        }

        if let Some(encoder) = slf.encoder.as_mut() {
            borrow_field!(id);
            borrow_field!(comment);
            borrow_field!(sequence);
            borrow_field!(quality);
            let r = nafcodec::Record {
                id,
                comment,
                sequence,
                quality,
                length: record.length.clone(),
            };
            encoder.push(&r).map_err(|err| convert_error(py, err, None))
        } else {
            Err(PyRuntimeError::new_err("operation on closed encoder."))
        }
    }

    /// Close the encoder, writing the archive to the destination file.
    ///
    /// The inner encoder is taken out on the first call, so calling this
    /// method again does nothing.
    pub fn close<'py>(mut slf: PyRefMut<'py, Self>) -> PyResult<()> {
        let py = slf.py();
        if let Some(encoder) = slf.encoder.take() {
            encoder
                .write(&mut slf.file)
                .map_err(|e| convert_error(py, e, None))?;
        }
        Ok(())
    }
}
598
599/// An encoder/decoder for Nucleotide Archive Format files.
600#[pymodule]
601#[pyo3(name = "lib")]
602pub fn init<'py>(py: Python<'py>, m: &Bound<'py, PyModule>) -> PyResult<()> {
603    m.add("__package__", "nafcodec")?;
604    m.add("__version__", env!("CARGO_PKG_VERSION"))?;
605    m.add("__author__", env!("CARGO_PKG_AUTHORS").replace(':', "\n"))?;
606    m.add("__build__", pyo3_built!(py, build))?;
607
608    m.add_class::<Decoder>()?;
609    m.add_class::<Encoder>()?;
610    m.add_class::<Record>()?;
611
612    /// Open a Nucleotide Archive Format file.
613    ///
614    /// This function acts as a high-level wrapper and returns either
615    /// a `~nafcodec.Decoder` or an `~nafcodec.Encoder` depending on the
616    /// provided mode.
617    ///
618    /// Arguments:
619    ///     file (`str`, `pathlib.Path` or file-like object): The file to
620    ///         read the archive from, or write the archive to.
621    ///     mode (`str`): The mode to open the archive with, either 'r'
622    ///         to read an existing archive, or 'w' to write a new
623    ///         archive.
624    ///     options (`object`): Additional options to pass to the
625    ///         `~nafcodec.Decoder` or `~nafcodec.Encoder` constructors.
626    ///
627    /// Example:
628    ///     Open an archive and read all the records from an existing
629    ///     archive into a `list`::
630    ///
631    ///     >>> with open("LuxC.naf") as decoder:
632    ///     ...     records = list(decoder)
633    ///
634    ///     Create a new archive for recording FASTA records (identifiers
635    ///     and DNA sequences)::
636    ///
637    ///     >>> with tempfile.NamedTemporaryFile() as dst:
638    ///     ...     with open(dst, "w", id=True, sequence=True) as encoder:
639    ///     ...         encoder.write(Record(id="r1", sequence="ATGC"))
640    ///
641    #[pyfn(m)]
642    #[pyo3(signature = (file, mode = OpenMode::Read, **options))]
643    fn open<'py>(
644        file: &Bound<'py, PyAny>,
645        mode: OpenMode,
646        options: Option<&Bound<'py, PyDict>>,
647    ) -> PyResult<Bound<'py, PyAny>> {
648        let py = file.py();
649        match mode {
650            OpenMode::Read => Decoder::type_object(py).call((file,), options),
651            OpenMode::Write => Encoder::type_object(py).call((file,), options),
652        }
653    }
654
655    Ok(())
656}