Skip to main content

cyanea_seq/
seq.rs

//! Generic validated sequence type.
//!
//! [`ValidatedSeq<A>`] is a newtype over `Vec<u8>` parameterized by an
//! [`Alphabet`] marker type. Construction uppercases and validates every byte.
//! The inner data is always uppercase, so `Deref<Target=[u8]>` and
//! `as_bytes()` are zero-cost and safe to pass to downstream `&[u8]` APIs.
7
8use std::fmt;
9use std::hash::{Hash, Hasher};
10use std::marker::PhantomData;
11use std::ops::Deref;
12
13use cyanea_core::{CyaneaError, ContentAddressable, Sequence, Summarizable};
14
15use crate::alphabet::Alphabet;
16
17/// A validated biological sequence parameterized by its alphabet.
18///
19/// `ValidatedSeq<DnaAlphabet>` is a DNA sequence, `ValidatedSeq<RnaAlphabet>`
20/// is RNA, etc. The inner bytes are always uppercase.
21#[derive(Clone)]
22pub struct ValidatedSeq<A: Alphabet> {
23    data: Vec<u8>,
24    _alphabet: PhantomData<A>,
25}
26
27impl<A: Alphabet> ValidatedSeq<A> {
28    /// Create a new validated sequence from raw bytes.
29    ///
30    /// Input is uppercased, then every byte is checked against the alphabet.
31    /// Returns an error if any byte is not in the alphabet after uppercasing.
32    pub fn new(bytes: impl AsRef<[u8]>) -> cyanea_core::Result<Self> {
33        let data: Vec<u8> = bytes.as_ref().iter().map(|b| b.to_ascii_uppercase()).collect();
34        for (i, &b) in data.iter().enumerate() {
35            if !A::is_valid(b) {
36                return Err(CyaneaError::InvalidInput(format!(
37                    "invalid {} byte '{}' (0x{:02X}) at position {}",
38                    A::NAME,
39                    b as char,
40                    b,
41                    i
42                )));
43            }
44        }
45        Ok(Self {
46            data,
47            _alphabet: PhantomData,
48        })
49    }
50
51    /// Create a sequence from pre-validated bytes, skipping validation.
52    ///
53    /// # Safety (logical)
54    ///
55    /// Caller must guarantee all bytes are valid uppercase members of `A`.
56    pub(crate) fn from_validated(data: Vec<u8>) -> Self {
57        Self {
58            data,
59            _alphabet: PhantomData,
60        }
61    }
62
63    /// Consume the sequence and return the inner byte vector.
64    pub fn into_bytes(self) -> Vec<u8> {
65        self.data
66    }
67}
68
69impl<A: Alphabet> Deref for ValidatedSeq<A> {
70    type Target = [u8];
71
72    fn deref(&self) -> &[u8] {
73        &self.data
74    }
75}
76
77impl<A: Alphabet> AsRef<[u8]> for ValidatedSeq<A> {
78    fn as_ref(&self) -> &[u8] {
79        &self.data
80    }
81}
82
83impl<A: Alphabet> Sequence for ValidatedSeq<A> {
84    fn as_bytes(&self) -> &[u8] {
85        &self.data
86    }
87}
88
89impl<A: Alphabet> ContentAddressable for ValidatedSeq<A> {
90    fn content_hash(&self) -> String {
91        cyanea_core::hash::sha256(&self.data)
92    }
93}
94
95impl<A: Alphabet> Summarizable for ValidatedSeq<A> {
96    fn summary(&self) -> String {
97        let preview_len = self.data.len().min(20);
98        let preview = std::str::from_utf8(&self.data[..preview_len]).unwrap_or("???");
99        if self.data.len() > 20 {
100            format!("{} sequence ({} bp): {}...", A::NAME, self.data.len(), preview)
101        } else {
102            format!("{} sequence ({} bp): {}", A::NAME, self.data.len(), preview)
103        }
104    }
105}
106
107impl<A: Alphabet> fmt::Debug for ValidatedSeq<A> {
108    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
109        let s = std::str::from_utf8(&self.data).unwrap_or("???");
110        write!(f, "{}(\"{}\")", A::NAME, s)
111    }
112}
113
114impl<A: Alphabet> fmt::Display for ValidatedSeq<A> {
115    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
116        let s = std::str::from_utf8(&self.data).unwrap_or("???");
117        f.write_str(s)
118    }
119}
120
121impl<A: Alphabet> PartialEq for ValidatedSeq<A> {
122    fn eq(&self, other: &Self) -> bool {
123        self.data == other.data
124    }
125}
126
127impl<A: Alphabet> Eq for ValidatedSeq<A> {}
128
129impl<A: Alphabet> Hash for ValidatedSeq<A> {
130    fn hash<H: Hasher>(&self, state: &mut H) {
131        self.data.hash(state);
132    }
133}
134
/// Serializes the sequence as a string.
///
/// Fails with the serializer's custom error if the stored bytes are not
/// valid UTF-8.
#[cfg(feature = "serde")]
impl<A: Alphabet> serde::Serialize for ValidatedSeq<A> {
    fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        match std::str::from_utf8(&self.data) {
            Ok(text) => serializer.serialize_str(text),
            Err(err) => Err(serde::ser::Error::custom(err)),
        }
    }
}
142
/// Deserializes a sequence from a string, re-running full validation.
///
/// The string's bytes go through `ValidatedSeq::new`, so input is uppercased
/// and checked against the alphabet; a validation failure is reported as the
/// deserializer's custom error.
#[cfg(feature = "serde")]
impl<'de, A: Alphabet> serde::Deserialize<'de> for ValidatedSeq<A> {
    fn deserialize<D: serde::Deserializer<'de>>(deserializer: D) -> std::result::Result<Self, D::Error> {
        // Fully-qualified call: this file never brings the `serde::Deserialize`
        // trait into scope, so `String::deserialize(..)` would not resolve.
        let s = <String as serde::Deserialize<'de>>::deserialize(deserializer)?;
        Self::new(s.as_bytes()).map_err(serde::de::Error::custom)
    }
}
150
#[cfg(test)]
mod tests {
    use super::*;
    use crate::alphabet::DnaAlphabet;

    type DnaSeq = ValidatedSeq<DnaAlphabet>;

    /// Lowercase input is stored uppercased.
    #[test]
    fn stores_uppercase() {
        let parsed = DnaSeq::new(b"acgt").unwrap();
        assert_eq!(parsed.as_bytes(), b"ACGT");
    }

    /// An empty input is a valid (empty) sequence.
    #[test]
    fn empty_sequence_ok() {
        let empty = DnaSeq::new(b"").unwrap();
        assert_eq!(empty.len(), 0);
        assert!(empty.is_empty());
    }

    /// Mixed-case input comes back fully uppercased from `as_bytes`.
    #[test]
    fn as_bytes_uppercase() {
        let parsed = DnaSeq::new(b"AcGtN").unwrap();
        assert_eq!(parsed.as_bytes(), b"ACGTN");
    }

    /// `Deref` exposes the bytes as a slice, including indexing.
    #[test]
    fn deref_to_slice() {
        let parsed = DnaSeq::new(b"ACGT").unwrap();
        assert_eq!(parsed[0], b'A');
        let view: &[u8] = &*parsed;
        assert_eq!(view, b"ACGT");
    }

    /// Case differences in the input do not change the content hash.
    #[test]
    fn content_addressable_deterministic() {
        let upper = DnaSeq::new(b"ACGT").unwrap();
        let lower = DnaSeq::new(b"acgt").unwrap();
        assert_eq!(upper.content_hash(), lower.content_hash());
    }

    /// A byte outside the DNA alphabet makes construction fail.
    #[test]
    fn rejects_invalid_bytes() {
        assert!(DnaSeq::new(b"ACGX").is_err());
    }
}