Skip to main content

dryice/block/
name.rs

1//! Name codec trait and built-in implementations.
2
3use crate::error::DryIceError;
4
5/// A name encoding strategy for `dryice` blocks.
6///
7/// Unlike [`SequenceCodec`](super::sequence::SequenceCodec) and
8/// [`QualityCodec`](super::quality::QualityCodec), the name codec
9/// has an associated [`Decoded`](Self::Decoded) type that can carry
10/// richer parsed structure than raw bytes. This reflects the fact
11/// that sequencing record names are structured text with meaningful
12/// subfields.
13pub trait NameCodec: Sized {
14    /// Stable type tag written into block headers.
15    const TYPE_TAG: [u8; 16];
16
17    /// Whether this encoding is lossy.
18    const LOSSY: bool;
19
20    /// Whether the encoded form is identical to the raw input bytes.
21    const IS_IDENTITY: bool = false;
22
23    /// The decoded representation of a name.
24    type Decoded;
25
26    /// Encode raw name bytes, appending the encoded bytes directly
27    /// into the provided output buffer.
28    ///
29    /// # Errors
30    ///
31    /// Returns an error if the name data is invalid for this encoding.
32    fn encode_into(name: &[u8], output: &mut Vec<u8>) -> Result<(), DryIceError>;
33
34    /// Decode an encoded buffer into the codec's decoded representation.
35    ///
36    /// `original_len` is the number of bytes in the original name.
37    ///
38    /// # Errors
39    ///
40    /// Returns an error if the encoded data is corrupt or inconsistent.
41    fn decode(encoded: &[u8], original_len: usize) -> Result<Self::Decoded, DryIceError>;
42
43    /// View the decoded name as raw bytes for use in `SeqRecordLike`.
44    fn as_bytes(decoded: &Self::Decoded) -> &[u8];
45
46    /// Encode name bytes, returning a new allocated buffer.
47    ///
48    /// # Errors
49    ///
50    /// Returns an error if the name data is invalid for this encoding.
51    fn encode(name: &[u8]) -> Result<Vec<u8>, DryIceError> {
52        let mut out = Vec::new();
53        Self::encode_into(name, &mut out)?;
54        Ok(out)
55    }
56
57    /// Decode an encoded buffer directly to raw bytes, appending into
58    /// the provided output buffer.
59    ///
60    /// This is used internally by the block decoder to populate the
61    /// name buffer without requiring knowledge of the `Decoded` type.
62    /// The default implementation decodes and then copies via `as_bytes`.
63    ///
64    /// # Errors
65    ///
66    /// Returns an error if the encoded data is corrupt or inconsistent.
67    fn decode_to_bytes_into(
68        encoded: &[u8],
69        original_len: usize,
70        output: &mut Vec<u8>,
71    ) -> Result<(), DryIceError> {
72        let decoded = Self::decode(encoded, original_len)?;
73        output.extend_from_slice(Self::as_bytes(&decoded));
74        Ok(())
75    }
76}
77
/// A name kept verbatim: the complete name bytes, with no parsing applied.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct RawName(pub Vec<u8>);

impl RawName {
    /// Borrow the complete name as a byte slice.
    #[must_use]
    pub fn as_bytes(&self) -> &[u8] {
        self.0.as_slice()
    }
}
89
90/// Raw name storage. No transformation.
91#[derive(Debug, Clone, Copy, Default)]
92pub struct RawNameCodec;
93
94impl NameCodec for RawNameCodec {
95    const TYPE_TAG: [u8; 16] = *b"dryi:name:raw!!!";
96    const LOSSY: bool = false;
97    const IS_IDENTITY: bool = true;
98    type Decoded = RawName;
99
100    fn encode_into(name: &[u8], output: &mut Vec<u8>) -> Result<(), DryIceError> {
101        output.extend_from_slice(name);
102        Ok(())
103    }
104
105    fn decode(encoded: &[u8], _original_len: usize) -> Result<RawName, DryIceError> {
106        Ok(RawName(encoded.to_vec()))
107    }
108
109    fn as_bytes(decoded: &RawName) -> &[u8] {
110        &decoded.0
111    }
112}
113
114/// An omitted name — names are dropped entirely.
115#[derive(Debug, Clone, PartialEq, Eq)]
116pub struct OmittedName;
117
118/// Omit names entirely. Encodes to empty, decodes to `OmittedName`.
119#[derive(Debug, Clone, Copy, Default)]
120pub struct OmittedNameCodec;
121
122impl NameCodec for OmittedNameCodec {
123    const TYPE_TAG: [u8; 16] = *b"dryi:name:omittd";
124    const LOSSY: bool = true;
125    type Decoded = OmittedName;
126
127    fn encode_into(_name: &[u8], _output: &mut Vec<u8>) -> Result<(), DryIceError> {
128        Ok(())
129    }
130
131    fn decode(_encoded: &[u8], _original_len: usize) -> Result<OmittedName, DryIceError> {
132        Ok(OmittedName)
133    }
134
135    fn as_bytes(_decoded: &OmittedName) -> &[u8] {
136        &[]
137    }
138}
139
/// A name divided at its first space into an identifier and a description.
///
/// FASTQ/FASTA names typically look like this:
///
/// ```text
/// instrument:run:flowcell:lane:tile:x:y 1:N:0:ATCACG
/// ^------------ identifier -----------^ ^-- desc --^
///                                      ^ first space
/// ```
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct SplitName {
    /// Identifier: the bytes before the first space.
    pub id: Vec<u8>,
    /// Description: the bytes after the first space; empty when there is none.
    pub description: Vec<u8>,
    /// Full reconstructed name, cached so `as_bytes` can hand out a borrow.
    full: Vec<u8>,
}

impl SplitName {
    /// Borrow the identifier portion of the name.
    #[must_use]
    pub fn id(&self) -> &[u8] {
        self.id.as_slice()
    }

    /// Borrow the description portion of the name (possibly empty).
    #[must_use]
    pub fn description(&self) -> &[u8] {
        self.description.as_slice()
    }

    /// Borrow the full reconstructed name.
    #[must_use]
    pub fn as_bytes(&self) -> &[u8] {
        self.full.as_slice()
    }
}
178
179/// Split name codec. Splits on the first space into identifier and
180/// description, storing both with a length prefix for exact
181/// reconstruction.
182///
183/// On-disk layout per name:
184///
185/// ```text
186/// [id_len: u32 le] [id_bytes] [desc_bytes]
187/// ```
188#[derive(Debug, Clone, Copy, Default)]
189pub struct SplitNameCodec;
190
191impl NameCodec for SplitNameCodec {
192    const TYPE_TAG: [u8; 16] = *b"dryi:name:split!";
193    const LOSSY: bool = false;
194    type Decoded = SplitName;
195
196    fn encode_into(name: &[u8], output: &mut Vec<u8>) -> Result<(), DryIceError> {
197        let split_pos = name.iter().position(|&b| b == b' ');
198
199        let (id, desc) = match split_pos {
200            Some(pos) => (&name[..pos], &name[pos + 1..]),
201            None => (name, &[] as &[u8]),
202        };
203
204        let id_len = u32::try_from(id.len()).map_err(|_| DryIceError::SectionOverflow {
205            field: "name identifier length",
206        })?;
207
208        output.extend_from_slice(&id_len.to_le_bytes());
209        output.extend_from_slice(id);
210        output.extend_from_slice(desc);
211
212        Ok(())
213    }
214
215    fn decode(encoded: &[u8], _original_len: usize) -> Result<SplitName, DryIceError> {
216        if encoded.len() < 4 {
217            return Err(DryIceError::CorruptBlockLayout {
218                message: "SplitNameCodec encoded buffer too short for id_len",
219            });
220        }
221
222        let id_len = u32::from_le_bytes([encoded[0], encoded[1], encoded[2], encoded[3]]) as usize;
223
224        let id_end = 4 + id_len;
225        if id_end > encoded.len() {
226            return Err(DryIceError::CorruptBlockLayout {
227                message: "SplitNameCodec id_len exceeds buffer",
228            });
229        }
230
231        let id = encoded[4..id_end].to_vec();
232        let description = encoded[id_end..].to_vec();
233
234        let full = if description.is_empty() {
235            id.clone()
236        } else {
237            let mut f = Vec::with_capacity(id.len() + 1 + description.len());
238            f.extend_from_slice(&id);
239            f.push(b' ');
240            f.extend_from_slice(&description);
241            f
242        };
243
244        Ok(SplitName {
245            id,
246            description,
247            full,
248        })
249    }
250
251    fn as_bytes(decoded: &SplitName) -> &[u8] {
252        &decoded.full
253    }
254}
255
#[cfg(test)]
mod tests {
    use super::*;

    /// Encode `name` with [`SplitNameCodec`] and decode the result again,
    /// passing the original length along as the block decoder would.
    fn split_round_trip(name: &[u8]) -> SplitName {
        let encoded = SplitNameCodec::encode(name).expect("encode should succeed");
        SplitNameCodec::decode(&encoded, name.len()).expect("decode should succeed")
    }

    #[test]
    fn raw_round_trip() {
        let name = b"@instrument:run:flowcell 1:N:0:ATCACG";
        let encoded = RawNameCodec::encode(name).expect("encode should succeed");
        let decoded = RawNameCodec::decode(&encoded, name.len()).expect("decode should succeed");
        assert_eq!(decoded.as_bytes(), name);
    }

    #[test]
    fn omitted_produces_empty() {
        let name = b"@some_read_name";
        let encoded = OmittedNameCodec::encode(name).expect("encode should succeed");
        assert!(encoded.is_empty());
        let decoded =
            OmittedNameCodec::decode(&encoded, name.len()).expect("decode should succeed");
        assert_eq!(OmittedNameCodec::as_bytes(&decoded), b"");
    }

    #[test]
    fn split_round_trip_with_space() {
        let name = b"instrument:run:flowcell 1:N:0:ATCACG";
        let decoded = split_round_trip(name);
        assert_eq!(decoded.as_bytes(), name);
        assert_eq!(decoded.id(), b"instrument:run:flowcell");
        assert_eq!(decoded.description(), b"1:N:0:ATCACG");
    }

    #[test]
    fn split_round_trip_without_space() {
        let name = b"simple_read_name";
        let decoded = split_round_trip(name);
        assert_eq!(decoded.as_bytes(), name);
        assert_eq!(decoded.id(), &name[..]);
        assert!(decoded.description().is_empty());
    }

    #[test]
    fn split_round_trip_empty_name() {
        let name = b"";
        let decoded = split_round_trip(name);
        assert_eq!(decoded.as_bytes(), name);
    }

    #[test]
    fn split_round_trip_multiple_spaces() {
        // Only the first space is a delimiter; later ones stay in the description.
        let name = b"id part1 part2 part3";
        let decoded = split_round_trip(name);
        assert_eq!(decoded.as_bytes(), name);
        assert_eq!(decoded.id(), b"id");
        assert_eq!(decoded.description(), b"part1 part2 part3");
    }

    #[test]
    fn split_trailing_space_drops_empty_description() {
        let decoded = split_round_trip(b"id ");
        // The split codec treats the space as a delimiter rather than as
        // content, so a trailing space with nothing after it is deliberately
        // normalized away and only the id remains.
        assert_eq!(decoded.as_bytes(), b"id");
        assert_eq!(decoded.id(), b"id");
        assert!(decoded.description().is_empty());
    }
}