Skip to main content

dryice/block/
quality.rs

1//! Quality codec trait and built-in implementations.
2
3use crate::error::DryIceError;
4
5/// A quality score encoding strategy for `dryice` blocks.
6///
7/// Implementors define how raw quality score bytes are encoded for
8/// on-disk storage and decoded back. The crate provides
9/// [`RawQualityCodec`] and [`BinnedQualityCodec`] as built-in
10/// implementations, but users can implement this trait for custom
11/// encodings.
12pub trait QualityCodec: Sized {
13    /// Stable type tag written into block headers.
14    const TYPE_TAG: [u8; 16];
15
16    /// Whether this encoding is lossy.
17    const LOSSY: bool;
18
19    /// Whether the encoded form is identical to the raw input bytes.
20    const IS_IDENTITY: bool = false;
21
22    /// Encode raw quality score bytes, appending the encoded bytes
23    /// directly into the provided output buffer.
24    ///
25    /// # Errors
26    ///
27    /// Returns an error if the quality data is invalid for this encoding.
28    fn encode_into(quality: &[u8], output: &mut Vec<u8>) -> Result<(), DryIceError>;
29
30    /// Decode an encoded buffer, appending the decoded quality bytes
31    /// directly into the provided output buffer.
32    ///
33    /// `original_len` is the number of quality scores in the original
34    /// record, needed because some encodings may compress.
35    ///
36    /// # Errors
37    ///
38    /// Returns an error if the encoded data is corrupt or inconsistent.
39    fn decode_into(
40        encoded: &[u8],
41        original_len: usize,
42        output: &mut Vec<u8>,
43    ) -> Result<(), DryIceError>;
44
45    /// Encode quality scores, returning a new allocated buffer.
46    ///
47    /// # Errors
48    ///
49    /// Returns an error if the quality data is invalid for this encoding.
50    fn encode(quality: &[u8]) -> Result<Vec<u8>, DryIceError> {
51        let mut out = Vec::new();
52        Self::encode_into(quality, &mut out)?;
53        Ok(out)
54    }
55
56    /// Decode an encoded buffer, returning a new allocated buffer.
57    ///
58    /// # Errors
59    ///
60    /// Returns an error if the encoded data is corrupt or inconsistent.
61    fn decode(encoded: &[u8], original_len: usize) -> Result<Vec<u8>, DryIceError> {
62        let mut out = Vec::new();
63        Self::decode_into(encoded, original_len, &mut out)?;
64        Ok(out)
65    }
66}
67
68/// Raw quality score storage. No transformation.
69#[derive(Debug, Clone, Copy, Default)]
70pub struct RawQualityCodec;
71
72impl QualityCodec for RawQualityCodec {
73    const TYPE_TAG: [u8; 16] = *b"dryi:qual:raw!!!";
74    const LOSSY: bool = false;
75    const IS_IDENTITY: bool = true;
76
77    fn encode_into(quality: &[u8], output: &mut Vec<u8>) -> Result<(), DryIceError> {
78        output.extend_from_slice(quality);
79        Ok(())
80    }
81
82    fn decode_into(
83        encoded: &[u8],
84        _original_len: usize,
85        output: &mut Vec<u8>,
86    ) -> Result<(), DryIceError> {
87        output.extend_from_slice(encoded);
88        Ok(())
89    }
90}
91
/// Lossy quality codec using Illumina-style 8-level binning.
///
/// Every Phred score is replaced by the representative value of the bin
/// it falls into. Collapsing scores this way lowers their entropy, which
/// is intended to help downstream compression while keeping the most
/// important quality distinctions.
///
/// Bins (inclusive Phred ranges) and the value stored for each:
///
/// ```text
/// Phred  0-1   → 0
/// Phred  2-9   → 6
/// Phred 10-19  → 15
/// Phred 20-24  → 22
/// Phred 25-29  → 27
/// Phred 30-34  → 33
/// Phred 35-39  → 37
/// Phred 40+    → 40
/// ```
///
/// Each representative value lies inside its own bin, so binning data
/// that is already binned leaves it unchanged.
///
/// Input bytes are assumed to be Phred+33 encoded (standard
/// Sanger/Illumina 1.8+ encoding), and the binned output uses the same
/// Phred+33 encoding.
#[derive(Clone, Copy, Debug, Default)]
pub struct BinnedQualityCodec;
115
/// ASCII offset of the Phred+33 encoding: a stored byte is the Phred
/// score plus this value.
const PHRED_OFFSET: u8 = 33;

/// Maps a Phred score (offset already removed) to the representative
/// value of its bin.
///
/// Scores of 40 and above all map to 40; lower scores map to the
/// representative of the first bin whose upper bound they do not exceed.
fn bin_phred(phred: u8) -> u8 {
    // (inclusive upper bound, representative value), in ascending order.
    const BINS: [(u8, u8); 7] = [
        (1, 0),
        (9, 6),
        (19, 15),
        (24, 22),
        (29, 27),
        (34, 33),
        (39, 37),
    ];
    // Representative for everything above the last bounded bin.
    const TOP_BIN: u8 = 40;

    BINS.iter()
        .find(|&&(upper, _)| phred <= upper)
        .map_or(TOP_BIN, |&(_, representative)| representative)
}
130
131impl QualityCodec for BinnedQualityCodec {
132    const TYPE_TAG: [u8; 16] = *b"dryi:qual:binned";
133    const LOSSY: bool = true;
134
135    fn encode_into(quality: &[u8], output: &mut Vec<u8>) -> Result<(), DryIceError> {
136        output.extend(quality.iter().map(|&q| {
137            let phred = q.saturating_sub(PHRED_OFFSET);
138            bin_phred(phred) + PHRED_OFFSET
139        }));
140        Ok(())
141    }
142
143    fn decode_into(
144        encoded: &[u8],
145        _original_len: usize,
146        output: &mut Vec<u8>,
147    ) -> Result<(), DryIceError> {
148        output.extend_from_slice(encoded);
149        Ok(())
150    }
151}
152
153/// An omitted quality codec that produces and expects empty quality sections.
154#[derive(Debug, Clone, Copy, Default)]
155pub struct OmittedQualityCodec;
156
157impl QualityCodec for OmittedQualityCodec {
158    const TYPE_TAG: [u8; 16] = *b"dryi:qual:omittd";
159    const LOSSY: bool = true;
160
161    fn encode_into(_quality: &[u8], _output: &mut Vec<u8>) -> Result<(), DryIceError> {
162        Ok(())
163    }
164
165    fn decode_into(
166        _encoded: &[u8],
167        _original_len: usize,
168        _output: &mut Vec<u8>,
169    ) -> Result<(), DryIceError> {
170        Ok(())
171    }
172}
173
#[cfg(test)]
mod tests {
    use super::*;

    /// Phred+33 bytes of the eight representative bin values.
    fn representative_bytes() -> Vec<u8> {
        [0u8, 6, 15, 22, 27, 33, 37, 40]
            .iter()
            .map(|&phred| phred + PHRED_OFFSET)
            .collect()
    }

    #[test]
    fn raw_round_trip() {
        let qual = b"!!!!####";
        let encoded = RawQualityCodec::encode(qual).expect("encode should succeed");
        let decoded = RawQualityCodec::decode(&encoded, qual.len()).expect("decode should succeed");
        assert_eq!(&decoded, qual);
    }

    #[test]
    fn raw_encode_into_appends_to_existing_output() {
        // The `*_into` methods must append, not overwrite.
        let mut out = vec![b'x'];
        RawQualityCodec::encode_into(b"##", &mut out).expect("encode should succeed");
        assert_eq!(out, b"x##");
    }

    #[test]
    fn binned_produces_valid_phred33() {
        // Checks that every output byte is one of the eight representative
        // values, not merely that it is at or above the offset.
        let allowed = representative_bytes();
        let qual: Vec<u8> = (PHRED_OFFSET..=u8::MAX).collect();
        let encoded = BinnedQualityCodec::encode(&qual).expect("encode should succeed");
        assert_eq!(encoded.len(), qual.len());
        for (&input, &binned) in qual.iter().zip(&encoded) {
            assert!(
                allowed.contains(&binned),
                "input byte {} binned to {}, which is not a representative value",
                input,
                binned
            );
        }
    }

    #[test]
    fn binned_respects_bin_boundaries() {
        // (input Phred, expected representative) at both edges of every bin.
        let cases: [(u8, u8); 16] = [
            (0, 0),
            (1, 0),
            (2, 6),
            (9, 6),
            (10, 15),
            (19, 15),
            (20, 22),
            (24, 22),
            (25, 27),
            (29, 27),
            (30, 33),
            (34, 33),
            (35, 37),
            (39, 37),
            (40, 40),
            (41, 40),
        ];
        for &(phred, expected) in cases.iter() {
            let encoded = BinnedQualityCodec::encode(&[phred + PHRED_OFFSET])
                .expect("encode should succeed");
            assert_eq!(
                encoded,
                [expected + PHRED_OFFSET],
                "Phred {} should bin to {}",
                phred,
                expected
            );
        }
    }

    #[test]
    fn binned_is_idempotent() {
        let qual: Vec<u8> = (33..=73).collect();
        let once = BinnedQualityCodec::encode(&qual).expect("first encode");
        let twice = BinnedQualityCodec::encode(&once).expect("second encode");
        assert_eq!(once, twice, "binning should be idempotent");
    }

    #[test]
    fn binned_preserves_length() {
        let qual = b"!!!!!!!!!!!";
        let encoded = BinnedQualityCodec::encode(qual).expect("encode should succeed");
        assert_eq!(encoded.len(), qual.len());
    }

    #[test]
    fn binned_high_quality_bins_correctly() {
        let q40 = vec![40 + PHRED_OFFSET];
        let encoded = BinnedQualityCodec::encode(&q40).expect("encode should succeed");
        assert_eq!(encoded[0], 40 + PHRED_OFFSET);
    }

    #[test]
    fn binned_decode_returns_stored_bytes() {
        let qual: Vec<u8> = (33..=73).collect();
        let encoded = BinnedQualityCodec::encode(&qual).expect("encode should succeed");
        let decoded =
            BinnedQualityCodec::decode(&encoded, qual.len()).expect("decode should succeed");
        assert_eq!(decoded, encoded);
    }

    #[test]
    fn omitted_produces_empty() {
        let qual = b"!!!!####";
        let encoded = OmittedQualityCodec::encode(qual).expect("encode should succeed");
        assert!(encoded.is_empty());
    }

    #[test]
    fn omitted_decode_produces_empty() {
        let decoded = OmittedQualityCodec::decode(&[], 8).expect("decode should succeed");
        assert!(decoded.is_empty());
    }
}