Skip to main content

dryice/block/
sequence.rs

1//! Sequence codec trait and built-in implementations.
2
3use crate::error::DryIceError;
4
5/// A sequence encoding strategy for `dryice` blocks.
6///
7/// Implementors define how raw ASCII nucleotide sequences are encoded
8/// for on-disk storage and decoded back. The crate provides
9/// [`RawAsciiCodec`] and [`TwoBitExactCodec`] as built-in
10/// implementations, but users can implement this trait for custom
11/// encodings.
12pub trait SequenceCodec: Sized {
13    /// Stable type tag written into block headers.
14    const TYPE_TAG: [u8; 16];
15
16    /// Whether this encoding is lossy.
17    const LOSSY: bool;
18
19    /// Whether the encoded form is identical to the raw input bytes.
20    /// When true, the decoder can skip decoding and return slices
21    /// directly into the block's payload buffer.
22    const IS_IDENTITY: bool = false;
23
24    /// Encode a raw ASCII nucleotide sequence, appending the encoded
25    /// bytes directly into the provided output buffer.
26    ///
27    /// # Errors
28    ///
29    /// Returns an error if the sequence contains bytes that are invalid
30    /// for this encoding.
31    fn encode_into(sequence: &[u8], output: &mut Vec<u8>) -> Result<(), DryIceError>;
32
33    /// Decode an encoded buffer, appending the decoded ASCII bytes
34    /// directly into the provided output buffer.
35    ///
36    /// `original_len` is the number of bases in the original sequence,
37    /// needed because some encodings pad or compress.
38    ///
39    /// # Errors
40    ///
41    /// Returns an error if the encoded data is corrupt or inconsistent.
42    fn decode_into(
43        encoded: &[u8],
44        original_len: usize,
45        output: &mut Vec<u8>,
46    ) -> Result<(), DryIceError>;
47
48    /// Encode a sequence, returning a new allocated buffer.
49    ///
50    /// This is a convenience wrapper around [`encode_into`](Self::encode_into).
51    ///
52    /// # Errors
53    ///
54    /// Returns an error if the sequence contains bytes that are invalid
55    /// for this encoding.
56    fn encode(sequence: &[u8]) -> Result<Vec<u8>, DryIceError> {
57        let mut out = Vec::new();
58        Self::encode_into(sequence, &mut out)?;
59        Ok(out)
60    }
61
62    /// Decode an encoded buffer, returning a new allocated buffer.
63    ///
64    /// This is a convenience wrapper around [`decode_into`](Self::decode_into).
65    ///
66    /// # Errors
67    ///
68    /// Returns an error if the encoded data is corrupt or inconsistent.
69    fn decode(encoded: &[u8], original_len: usize) -> Result<Vec<u8>, DryIceError> {
70        let mut out = Vec::new();
71        Self::decode_into(encoded, original_len, &mut out)?;
72        Ok(out)
73    }
74}
75
76/// Raw ASCII sequence storage. No transformation — fastest possible
77/// encode and decode, largest on-disk footprint.
78#[derive(Debug, Clone, Copy, Default)]
79pub struct RawAsciiCodec;
80
81/// Omitted sequence storage.
82///
83/// This codec stores no sequence payload bytes and always decodes to an empty
84/// byte string. It is intended for payload-light temporary files where
85/// sequence data is deliberately absent.
86#[derive(Debug, Clone, Copy, Default)]
87pub struct OmittedSequenceCodec;
88
89impl SequenceCodec for OmittedSequenceCodec {
90    const TYPE_TAG: [u8; 16] = *b"dryi:seq:omittd!";
91    const LOSSY: bool = true;
92
93    fn encode_into(_sequence: &[u8], _output: &mut Vec<u8>) -> Result<(), DryIceError> {
94        Ok(())
95    }
96
97    fn decode_into(
98        _encoded: &[u8],
99        original_len: usize,
100        output: &mut Vec<u8>,
101    ) -> Result<(), DryIceError> {
102        if original_len != 0 {
103            return Err(DryIceError::CorruptBlockLayout {
104                message: "omitted sequence requires zero original length",
105            });
106        }
107        output.clear();
108        Ok(())
109    }
110}
111
112impl SequenceCodec for RawAsciiCodec {
113    const TYPE_TAG: [u8; 16] = *b"dryi:seq:raw-asc";
114    const LOSSY: bool = false;
115    const IS_IDENTITY: bool = true;
116
117    fn encode_into(sequence: &[u8], output: &mut Vec<u8>) -> Result<(), DryIceError> {
118        output.extend_from_slice(sequence);
119        Ok(())
120    }
121
122    fn decode_into(
123        encoded: &[u8],
124        _original_len: usize,
125        output: &mut Vec<u8>,
126    ) -> Result<(), DryIceError> {
127        output.extend_from_slice(encoded);
128        Ok(())
129    }
130}
131
132/// Exact 2-bit sequence encoding with sparse ambiguity sideband.
133///
134/// Canonical bases (A, C, G, T) are packed into 2 bits each via
135/// `bitnuc` with SIMD acceleration. Non-canonical IUPAC bases are
136/// stored in a sparse sideband for exact reconstruction.
137///
138/// On-disk layout per record:
139///
140/// ```text
141/// [2-bit packed bases as le u64s]
142/// [ambiguity_count: u32 le]
143/// [positions: u32 le each]
144/// [IUPAC bytes: u8 each]
145/// ```
146///
147/// Lowercase canonical bases are normalized to uppercase during
148/// encoding. Ambiguous bases preserve their original byte value.
149#[derive(Debug, Clone, Copy, Default)]
150pub struct TwoBitExactCodec;
151
152impl SequenceCodec for TwoBitExactCodec {
153    const TYPE_TAG: [u8; 16] = *b"dryi:seq:2b-exct";
154    const LOSSY: bool = false;
155
156    fn encode_into(sequence: &[u8], output: &mut Vec<u8>) -> Result<(), DryIceError> {
157        if sequence.is_empty() {
158            output.extend_from_slice(&0u32.to_le_bytes());
159            return Ok(());
160        }
161
162        let mut canonical = Vec::with_capacity(sequence.len());
163        let mut ambig_positions: Vec<u32> = Vec::new();
164        let mut ambig_bytes: Vec<u8> = Vec::new();
165
166        for (i, &base) in sequence.iter().enumerate() {
167            if is_canonical(base) {
168                canonical.push(base);
169            } else {
170                canonical.push(b'A');
171                let pos = u32::try_from(i).map_err(|_| DryIceError::SectionOverflow {
172                    field: "ambiguity position",
173                })?;
174                ambig_positions.push(pos);
175                ambig_bytes.push(base);
176            }
177        }
178
179        let mut packed_bases: Vec<u64> = Vec::new();
180        bitnuc::twobit::encode(&canonical, &mut packed_bases).map_err(|_| {
181            DryIceError::InvalidSequenceInput {
182                message: "sequence contains bytes invalid for 2-bit encoding",
183            }
184        })?;
185
186        let ambig_count =
187            u32::try_from(ambig_positions.len()).map_err(|_| DryIceError::SectionOverflow {
188                field: "ambiguity count",
189            })?;
190
191        for word in &packed_bases {
192            output.extend_from_slice(&word.to_le_bytes());
193        }
194
195        output.extend_from_slice(&ambig_count.to_le_bytes());
196        for &pos in &ambig_positions {
197            output.extend_from_slice(&pos.to_le_bytes());
198        }
199        output.extend_from_slice(&ambig_bytes);
200
201        Ok(())
202    }
203
204    fn decode_into(
205        encoded: &[u8],
206        original_len: usize,
207        output: &mut Vec<u8>,
208    ) -> Result<(), DryIceError> {
209        let packed_word_count = original_len.div_ceil(32);
210        let packed_byte_len = packed_word_count * 8;
211
212        if encoded.len() < packed_byte_len + 4 {
213            return Err(DryIceError::CorruptBlockLayout {
214                message: "TwoBitExact encoded buffer too short",
215            });
216        }
217
218        let mut packed_words: Vec<u64> = Vec::with_capacity(packed_word_count);
219        for chunk in encoded[..packed_byte_len].chunks_exact(8) {
220            packed_words.push(u64::from_le_bytes([
221                chunk[0], chunk[1], chunk[2], chunk[3], chunk[4], chunk[5], chunk[6], chunk[7],
222            ]));
223        }
224
225        bitnuc::twobit::decode(&packed_words, original_len, output).map_err(|_| {
226            DryIceError::CorruptBlockLayout {
227                message: "failed to decode 2-bit packed sequence",
228            }
229        })?;
230
231        let sideband = &encoded[packed_byte_len..];
232        if sideband.len() < 4 {
233            return Err(DryIceError::CorruptBlockLayout {
234                message: "TwoBitExact sideband missing ambiguity count",
235            });
236        }
237
238        let ambig_count =
239            u32::from_le_bytes([sideband[0], sideband[1], sideband[2], sideband[3]]) as usize;
240
241        let positions_end = 4 + ambig_count * 4;
242        let iupac_end = positions_end + ambig_count;
243
244        if sideband.len() < iupac_end {
245            return Err(DryIceError::CorruptBlockLayout {
246                message: "TwoBitExact sideband truncated",
247            });
248        }
249
250        for i in 0..ambig_count {
251            let pos_offset = 4 + i * 4;
252            let pos = u32::from_le_bytes([
253                sideband[pos_offset],
254                sideband[pos_offset + 1],
255                sideband[pos_offset + 2],
256                sideband[pos_offset + 3],
257            ]) as usize;
258
259            let iupac_byte = sideband[positions_end + i];
260
261            if pos >= output.len() {
262                return Err(DryIceError::CorruptBlockLayout {
263                    message: "TwoBitExact ambiguity position out of range",
264                });
265            }
266
267            output[pos] = iupac_byte;
268        }
269
270        Ok(())
271    }
272}
273
274/// Lossy 2-bit sequence encoding that collapses all ambiguous bases to `N`.
275///
276/// Like [`TwoBitExactCodec`], canonical bases are packed into 2 bits
277/// via `bitnuc`. However, instead of preserving the exact IUPAC symbol
278/// for each ambiguous position, all non-canonical bases are replaced
279/// with `N` on decode. The sideband stores only positions, not original
280/// symbols.
281///
282/// This is explicitly lossy: `R`, `Y`, `S`, `W`, etc. all become `N`.
283///
284/// On-disk layout per record:
285///
286/// ```text
287/// [2-bit packed bases as le u64s]
288/// [ambiguity_count: u32 le]
289/// [positions: u32 le each]
290/// ```
291#[derive(Debug, Clone, Copy, Default)]
292pub struct TwoBitLossyNCodec;
293
294impl SequenceCodec for TwoBitLossyNCodec {
295    const TYPE_TAG: [u8; 16] = *b"dryi:seq:2b-losN";
296    const LOSSY: bool = true;
297
298    fn encode_into(sequence: &[u8], output: &mut Vec<u8>) -> Result<(), DryIceError> {
299        if sequence.is_empty() {
300            output.extend_from_slice(&0u32.to_le_bytes());
301            return Ok(());
302        }
303
304        let mut canonical = Vec::with_capacity(sequence.len());
305        let mut ambig_positions: Vec<u32> = Vec::new();
306
307        for (i, &base) in sequence.iter().enumerate() {
308            if is_canonical(base) {
309                canonical.push(base);
310            } else {
311                canonical.push(b'A');
312                let pos = u32::try_from(i).map_err(|_| DryIceError::SectionOverflow {
313                    field: "ambiguity position",
314                })?;
315                ambig_positions.push(pos);
316            }
317        }
318
319        let mut packed_bases: Vec<u64> = Vec::new();
320        bitnuc::twobit::encode(&canonical, &mut packed_bases).map_err(|_| {
321            DryIceError::InvalidSequenceInput {
322                message: "sequence contains bytes invalid for 2-bit encoding",
323            }
324        })?;
325
326        let ambig_count =
327            u32::try_from(ambig_positions.len()).map_err(|_| DryIceError::SectionOverflow {
328                field: "ambiguity count",
329            })?;
330
331        for word in &packed_bases {
332            output.extend_from_slice(&word.to_le_bytes());
333        }
334
335        output.extend_from_slice(&ambig_count.to_le_bytes());
336        for &pos in &ambig_positions {
337            output.extend_from_slice(&pos.to_le_bytes());
338        }
339
340        Ok(())
341    }
342
343    fn decode_into(
344        encoded: &[u8],
345        original_len: usize,
346        output: &mut Vec<u8>,
347    ) -> Result<(), DryIceError> {
348        let packed_word_count = original_len.div_ceil(32);
349        let packed_byte_len = packed_word_count * 8;
350
351        if encoded.len() < packed_byte_len + 4 {
352            return Err(DryIceError::CorruptBlockLayout {
353                message: "TwoBitLossyN encoded buffer too short",
354            });
355        }
356
357        let mut packed_words: Vec<u64> = Vec::with_capacity(packed_word_count);
358        for chunk in encoded[..packed_byte_len].chunks_exact(8) {
359            packed_words.push(u64::from_le_bytes([
360                chunk[0], chunk[1], chunk[2], chunk[3], chunk[4], chunk[5], chunk[6], chunk[7],
361            ]));
362        }
363
364        bitnuc::twobit::decode(&packed_words, original_len, output).map_err(|_| {
365            DryIceError::CorruptBlockLayout {
366                message: "failed to decode 2-bit packed sequence",
367            }
368        })?;
369
370        let sideband = &encoded[packed_byte_len..];
371        if sideband.len() < 4 {
372            return Err(DryIceError::CorruptBlockLayout {
373                message: "TwoBitLossyN sideband missing ambiguity count",
374            });
375        }
376
377        let ambig_count =
378            u32::from_le_bytes([sideband[0], sideband[1], sideband[2], sideband[3]]) as usize;
379
380        let positions_end = 4 + ambig_count * 4;
381        if sideband.len() < positions_end {
382            return Err(DryIceError::CorruptBlockLayout {
383                message: "TwoBitLossyN sideband truncated",
384            });
385        }
386
387        for i in 0..ambig_count {
388            let pos_offset = 4 + i * 4;
389            let pos = u32::from_le_bytes([
390                sideband[pos_offset],
391                sideband[pos_offset + 1],
392                sideband[pos_offset + 2],
393                sideband[pos_offset + 3],
394            ]) as usize;
395
396            if pos >= output.len() {
397                return Err(DryIceError::CorruptBlockLayout {
398                    message: "TwoBitLossyN ambiguity position out of range",
399                });
400            }
401
402            output[pos] = b'N';
403        }
404
405        Ok(())
406    }
407}
408
409fn is_canonical(base: u8) -> bool {
410    matches!(base, b'A' | b'a' | b'C' | b'c' | b'G' | b'g' | b'T' | b't')
411}
412
413#[cfg(test)]
414mod tests {
415    use super::*;
416
417    #[test]
418    fn raw_ascii_round_trip() {
419        let seq = b"ACGTACGT";
420        let encoded = RawAsciiCodec::encode(seq).expect("encode should succeed");
421        let decoded = RawAsciiCodec::decode(&encoded, seq.len()).expect("decode should succeed");
422        assert_eq!(&decoded, seq);
423    }
424
425    #[test]
426    fn two_bit_exact_round_trip_canonical_only() {
427        let seq = b"ACGTACGT";
428        let encoded = TwoBitExactCodec::encode(seq).expect("encode should succeed");
429        let decoded = TwoBitExactCodec::decode(&encoded, seq.len()).expect("decode should succeed");
430        assert_eq!(&decoded, seq);
431    }
432
433    #[test]
434    fn two_bit_exact_round_trip_with_ambiguity() {
435        let seq = b"ACNGTRYACGT";
436        let encoded = TwoBitExactCodec::encode(seq).expect("encode should succeed");
437        let decoded = TwoBitExactCodec::decode(&encoded, seq.len()).expect("decode should succeed");
438        assert_eq!(&decoded, seq);
439    }
440
441    #[test]
442    fn two_bit_exact_round_trip_all_ambiguous() {
443        let seq = b"NNNNNN";
444        let encoded = TwoBitExactCodec::encode(seq).expect("encode should succeed");
445        let decoded = TwoBitExactCodec::decode(&encoded, seq.len()).expect("decode should succeed");
446        assert_eq!(&decoded, seq);
447    }
448
449    #[test]
450    fn two_bit_exact_round_trip_single_base() {
451        let seq = b"G";
452        let encoded = TwoBitExactCodec::encode(seq).expect("encode should succeed");
453        let decoded = TwoBitExactCodec::decode(&encoded, seq.len()).expect("decode should succeed");
454        assert_eq!(&decoded, seq);
455    }
456
457    #[test]
458    fn two_bit_exact_round_trip_non_multiple_of_32() {
459        let seq = b"ACGTACGTACGTACGTACGTACGTACGTACGTACG";
460        assert_eq!(seq.len(), 35);
461        let encoded = TwoBitExactCodec::encode(seq).expect("encode should succeed");
462        let decoded = TwoBitExactCodec::decode(&encoded, seq.len()).expect("decode should succeed");
463        assert_eq!(&decoded, seq);
464    }
465
466    #[test]
467    fn two_bit_exact_round_trip_empty() {
468        let seq = b"";
469        let encoded = TwoBitExactCodec::encode(seq).expect("encode should succeed");
470        let decoded = TwoBitExactCodec::decode(&encoded, seq.len()).expect("decode should succeed");
471        assert_eq!(&decoded, seq);
472    }
473
474    #[test]
475    fn two_bit_exact_lowercase_normalizes_to_uppercase() {
476        let seq = b"acgtNacgt";
477        let encoded = TwoBitExactCodec::encode(seq).expect("encode should succeed");
478        let decoded = TwoBitExactCodec::decode(&encoded, seq.len()).expect("decode should succeed");
479        assert_eq!(decoded, b"ACGTNACGT");
480    }
481
482    #[test]
483    fn two_bit_lossy_n_collapses_ambiguity_to_n() {
484        let seq = b"ACNGTRYACGT";
485        let encoded = TwoBitLossyNCodec::encode(seq).expect("encode should succeed");
486        let decoded =
487            TwoBitLossyNCodec::decode(&encoded, seq.len()).expect("decode should succeed");
488        assert_eq!(decoded, b"ACNGTNNACGT");
489    }
490
491    #[test]
492    fn two_bit_lossy_n_canonical_only() {
493        let seq = b"ACGTACGT";
494        let encoded = TwoBitLossyNCodec::encode(seq).expect("encode should succeed");
495        let decoded =
496            TwoBitLossyNCodec::decode(&encoded, seq.len()).expect("decode should succeed");
497        assert_eq!(&decoded, seq);
498    }
499
500    #[test]
501    fn two_bit_lossy_n_all_ambiguous() {
502        let seq = b"NRYSW";
503        let encoded = TwoBitLossyNCodec::encode(seq).expect("encode should succeed");
504        let decoded =
505            TwoBitLossyNCodec::decode(&encoded, seq.len()).expect("decode should succeed");
506        assert_eq!(decoded, b"NNNNN");
507    }
508
509    #[test]
510    fn two_bit_lossy_n_is_more_compact_than_exact() {
511        let seq = b"ACNGTRYACGT";
512        let exact = TwoBitExactCodec::encode(seq).expect("exact encode");
513        let lossy = TwoBitLossyNCodec::encode(seq).expect("lossy encode");
514        assert!(
515            lossy.len() < exact.len(),
516            "lossy should be more compact: lossy={}, exact={}",
517            lossy.len(),
518            exact.len()
519        );
520    }
521}