1use crate::error::DryIceError;
4
5pub trait SequenceCodec: Sized {
13 const TYPE_TAG: [u8; 16];
15
16 const LOSSY: bool;
18
19 const IS_IDENTITY: bool = false;
23
24 fn encode_into(sequence: &[u8], output: &mut Vec<u8>) -> Result<(), DryIceError>;
32
33 fn decode_into(
43 encoded: &[u8],
44 original_len: usize,
45 output: &mut Vec<u8>,
46 ) -> Result<(), DryIceError>;
47
48 fn encode(sequence: &[u8]) -> Result<Vec<u8>, DryIceError> {
57 let mut out = Vec::new();
58 Self::encode_into(sequence, &mut out)?;
59 Ok(out)
60 }
61
62 fn decode(encoded: &[u8], original_len: usize) -> Result<Vec<u8>, DryIceError> {
70 let mut out = Vec::new();
71 Self::decode_into(encoded, original_len, &mut out)?;
72 Ok(out)
73 }
74}
75
76#[derive(Debug, Clone, Copy, Default)]
79pub struct RawAsciiCodec;
80
81#[derive(Debug, Clone, Copy, Default)]
87pub struct OmittedSequenceCodec;
88
89impl SequenceCodec for OmittedSequenceCodec {
90 const TYPE_TAG: [u8; 16] = *b"dryi:seq:omittd!";
91 const LOSSY: bool = true;
92
93 fn encode_into(_sequence: &[u8], _output: &mut Vec<u8>) -> Result<(), DryIceError> {
94 Ok(())
95 }
96
97 fn decode_into(
98 _encoded: &[u8],
99 original_len: usize,
100 output: &mut Vec<u8>,
101 ) -> Result<(), DryIceError> {
102 if original_len != 0 {
103 return Err(DryIceError::CorruptBlockLayout {
104 message: "omitted sequence requires zero original length",
105 });
106 }
107 output.clear();
108 Ok(())
109 }
110}
111
112impl SequenceCodec for RawAsciiCodec {
113 const TYPE_TAG: [u8; 16] = *b"dryi:seq:raw-asc";
114 const LOSSY: bool = false;
115 const IS_IDENTITY: bool = true;
116
117 fn encode_into(sequence: &[u8], output: &mut Vec<u8>) -> Result<(), DryIceError> {
118 output.extend_from_slice(sequence);
119 Ok(())
120 }
121
122 fn decode_into(
123 encoded: &[u8],
124 _original_len: usize,
125 output: &mut Vec<u8>,
126 ) -> Result<(), DryIceError> {
127 output.extend_from_slice(encoded);
128 Ok(())
129 }
130}
131
132#[derive(Debug, Clone, Copy, Default)]
150pub struct TwoBitExactCodec;
151
152impl SequenceCodec for TwoBitExactCodec {
153 const TYPE_TAG: [u8; 16] = *b"dryi:seq:2b-exct";
154 const LOSSY: bool = false;
155
156 fn encode_into(sequence: &[u8], output: &mut Vec<u8>) -> Result<(), DryIceError> {
157 if sequence.is_empty() {
158 output.extend_from_slice(&0u32.to_le_bytes());
159 return Ok(());
160 }
161
162 let mut canonical = Vec::with_capacity(sequence.len());
163 let mut ambig_positions: Vec<u32> = Vec::new();
164 let mut ambig_bytes: Vec<u8> = Vec::new();
165
166 for (i, &base) in sequence.iter().enumerate() {
167 if is_canonical(base) {
168 canonical.push(base);
169 } else {
170 canonical.push(b'A');
171 let pos = u32::try_from(i).map_err(|_| DryIceError::SectionOverflow {
172 field: "ambiguity position",
173 })?;
174 ambig_positions.push(pos);
175 ambig_bytes.push(base);
176 }
177 }
178
179 let mut packed_bases: Vec<u64> = Vec::new();
180 bitnuc::twobit::encode(&canonical, &mut packed_bases).map_err(|_| {
181 DryIceError::InvalidSequenceInput {
182 message: "sequence contains bytes invalid for 2-bit encoding",
183 }
184 })?;
185
186 let ambig_count =
187 u32::try_from(ambig_positions.len()).map_err(|_| DryIceError::SectionOverflow {
188 field: "ambiguity count",
189 })?;
190
191 for word in &packed_bases {
192 output.extend_from_slice(&word.to_le_bytes());
193 }
194
195 output.extend_from_slice(&ambig_count.to_le_bytes());
196 for &pos in &ambig_positions {
197 output.extend_from_slice(&pos.to_le_bytes());
198 }
199 output.extend_from_slice(&ambig_bytes);
200
201 Ok(())
202 }
203
204 fn decode_into(
205 encoded: &[u8],
206 original_len: usize,
207 output: &mut Vec<u8>,
208 ) -> Result<(), DryIceError> {
209 let packed_word_count = original_len.div_ceil(32);
210 let packed_byte_len = packed_word_count * 8;
211
212 if encoded.len() < packed_byte_len + 4 {
213 return Err(DryIceError::CorruptBlockLayout {
214 message: "TwoBitExact encoded buffer too short",
215 });
216 }
217
218 let mut packed_words: Vec<u64> = Vec::with_capacity(packed_word_count);
219 for chunk in encoded[..packed_byte_len].chunks_exact(8) {
220 packed_words.push(u64::from_le_bytes([
221 chunk[0], chunk[1], chunk[2], chunk[3], chunk[4], chunk[5], chunk[6], chunk[7],
222 ]));
223 }
224
225 bitnuc::twobit::decode(&packed_words, original_len, output).map_err(|_| {
226 DryIceError::CorruptBlockLayout {
227 message: "failed to decode 2-bit packed sequence",
228 }
229 })?;
230
231 let sideband = &encoded[packed_byte_len..];
232 if sideband.len() < 4 {
233 return Err(DryIceError::CorruptBlockLayout {
234 message: "TwoBitExact sideband missing ambiguity count",
235 });
236 }
237
238 let ambig_count =
239 u32::from_le_bytes([sideband[0], sideband[1], sideband[2], sideband[3]]) as usize;
240
241 let positions_end = 4 + ambig_count * 4;
242 let iupac_end = positions_end + ambig_count;
243
244 if sideband.len() < iupac_end {
245 return Err(DryIceError::CorruptBlockLayout {
246 message: "TwoBitExact sideband truncated",
247 });
248 }
249
250 for i in 0..ambig_count {
251 let pos_offset = 4 + i * 4;
252 let pos = u32::from_le_bytes([
253 sideband[pos_offset],
254 sideband[pos_offset + 1],
255 sideband[pos_offset + 2],
256 sideband[pos_offset + 3],
257 ]) as usize;
258
259 let iupac_byte = sideband[positions_end + i];
260
261 if pos >= output.len() {
262 return Err(DryIceError::CorruptBlockLayout {
263 message: "TwoBitExact ambiguity position out of range",
264 });
265 }
266
267 output[pos] = iupac_byte;
268 }
269
270 Ok(())
271 }
272}
273
274#[derive(Debug, Clone, Copy, Default)]
292pub struct TwoBitLossyNCodec;
293
294impl SequenceCodec for TwoBitLossyNCodec {
295 const TYPE_TAG: [u8; 16] = *b"dryi:seq:2b-losN";
296 const LOSSY: bool = true;
297
298 fn encode_into(sequence: &[u8], output: &mut Vec<u8>) -> Result<(), DryIceError> {
299 if sequence.is_empty() {
300 output.extend_from_slice(&0u32.to_le_bytes());
301 return Ok(());
302 }
303
304 let mut canonical = Vec::with_capacity(sequence.len());
305 let mut ambig_positions: Vec<u32> = Vec::new();
306
307 for (i, &base) in sequence.iter().enumerate() {
308 if is_canonical(base) {
309 canonical.push(base);
310 } else {
311 canonical.push(b'A');
312 let pos = u32::try_from(i).map_err(|_| DryIceError::SectionOverflow {
313 field: "ambiguity position",
314 })?;
315 ambig_positions.push(pos);
316 }
317 }
318
319 let mut packed_bases: Vec<u64> = Vec::new();
320 bitnuc::twobit::encode(&canonical, &mut packed_bases).map_err(|_| {
321 DryIceError::InvalidSequenceInput {
322 message: "sequence contains bytes invalid for 2-bit encoding",
323 }
324 })?;
325
326 let ambig_count =
327 u32::try_from(ambig_positions.len()).map_err(|_| DryIceError::SectionOverflow {
328 field: "ambiguity count",
329 })?;
330
331 for word in &packed_bases {
332 output.extend_from_slice(&word.to_le_bytes());
333 }
334
335 output.extend_from_slice(&ambig_count.to_le_bytes());
336 for &pos in &ambig_positions {
337 output.extend_from_slice(&pos.to_le_bytes());
338 }
339
340 Ok(())
341 }
342
343 fn decode_into(
344 encoded: &[u8],
345 original_len: usize,
346 output: &mut Vec<u8>,
347 ) -> Result<(), DryIceError> {
348 let packed_word_count = original_len.div_ceil(32);
349 let packed_byte_len = packed_word_count * 8;
350
351 if encoded.len() < packed_byte_len + 4 {
352 return Err(DryIceError::CorruptBlockLayout {
353 message: "TwoBitLossyN encoded buffer too short",
354 });
355 }
356
357 let mut packed_words: Vec<u64> = Vec::with_capacity(packed_word_count);
358 for chunk in encoded[..packed_byte_len].chunks_exact(8) {
359 packed_words.push(u64::from_le_bytes([
360 chunk[0], chunk[1], chunk[2], chunk[3], chunk[4], chunk[5], chunk[6], chunk[7],
361 ]));
362 }
363
364 bitnuc::twobit::decode(&packed_words, original_len, output).map_err(|_| {
365 DryIceError::CorruptBlockLayout {
366 message: "failed to decode 2-bit packed sequence",
367 }
368 })?;
369
370 let sideband = &encoded[packed_byte_len..];
371 if sideband.len() < 4 {
372 return Err(DryIceError::CorruptBlockLayout {
373 message: "TwoBitLossyN sideband missing ambiguity count",
374 });
375 }
376
377 let ambig_count =
378 u32::from_le_bytes([sideband[0], sideband[1], sideband[2], sideband[3]]) as usize;
379
380 let positions_end = 4 + ambig_count * 4;
381 if sideband.len() < positions_end {
382 return Err(DryIceError::CorruptBlockLayout {
383 message: "TwoBitLossyN sideband truncated",
384 });
385 }
386
387 for i in 0..ambig_count {
388 let pos_offset = 4 + i * 4;
389 let pos = u32::from_le_bytes([
390 sideband[pos_offset],
391 sideband[pos_offset + 1],
392 sideband[pos_offset + 2],
393 sideband[pos_offset + 3],
394 ]) as usize;
395
396 if pos >= output.len() {
397 return Err(DryIceError::CorruptBlockLayout {
398 message: "TwoBitLossyN ambiguity position out of range",
399 });
400 }
401
402 output[pos] = b'N';
403 }
404
405 Ok(())
406 }
407}
408
409fn is_canonical(base: u8) -> bool {
410 matches!(base, b'A' | b'a' | b'C' | b'c' | b'G' | b'g' | b'T' | b't')
411}
412
413#[cfg(test)]
414mod tests {
415 use super::*;
416
417 #[test]
418 fn raw_ascii_round_trip() {
419 let seq = b"ACGTACGT";
420 let encoded = RawAsciiCodec::encode(seq).expect("encode should succeed");
421 let decoded = RawAsciiCodec::decode(&encoded, seq.len()).expect("decode should succeed");
422 assert_eq!(&decoded, seq);
423 }
424
425 #[test]
426 fn two_bit_exact_round_trip_canonical_only() {
427 let seq = b"ACGTACGT";
428 let encoded = TwoBitExactCodec::encode(seq).expect("encode should succeed");
429 let decoded = TwoBitExactCodec::decode(&encoded, seq.len()).expect("decode should succeed");
430 assert_eq!(&decoded, seq);
431 }
432
433 #[test]
434 fn two_bit_exact_round_trip_with_ambiguity() {
435 let seq = b"ACNGTRYACGT";
436 let encoded = TwoBitExactCodec::encode(seq).expect("encode should succeed");
437 let decoded = TwoBitExactCodec::decode(&encoded, seq.len()).expect("decode should succeed");
438 assert_eq!(&decoded, seq);
439 }
440
441 #[test]
442 fn two_bit_exact_round_trip_all_ambiguous() {
443 let seq = b"NNNNNN";
444 let encoded = TwoBitExactCodec::encode(seq).expect("encode should succeed");
445 let decoded = TwoBitExactCodec::decode(&encoded, seq.len()).expect("decode should succeed");
446 assert_eq!(&decoded, seq);
447 }
448
449 #[test]
450 fn two_bit_exact_round_trip_single_base() {
451 let seq = b"G";
452 let encoded = TwoBitExactCodec::encode(seq).expect("encode should succeed");
453 let decoded = TwoBitExactCodec::decode(&encoded, seq.len()).expect("decode should succeed");
454 assert_eq!(&decoded, seq);
455 }
456
457 #[test]
458 fn two_bit_exact_round_trip_non_multiple_of_32() {
459 let seq = b"ACGTACGTACGTACGTACGTACGTACGTACGTACG";
460 assert_eq!(seq.len(), 35);
461 let encoded = TwoBitExactCodec::encode(seq).expect("encode should succeed");
462 let decoded = TwoBitExactCodec::decode(&encoded, seq.len()).expect("decode should succeed");
463 assert_eq!(&decoded, seq);
464 }
465
466 #[test]
467 fn two_bit_exact_round_trip_empty() {
468 let seq = b"";
469 let encoded = TwoBitExactCodec::encode(seq).expect("encode should succeed");
470 let decoded = TwoBitExactCodec::decode(&encoded, seq.len()).expect("decode should succeed");
471 assert_eq!(&decoded, seq);
472 }
473
474 #[test]
475 fn two_bit_exact_lowercase_normalizes_to_uppercase() {
476 let seq = b"acgtNacgt";
477 let encoded = TwoBitExactCodec::encode(seq).expect("encode should succeed");
478 let decoded = TwoBitExactCodec::decode(&encoded, seq.len()).expect("decode should succeed");
479 assert_eq!(decoded, b"ACGTNACGT");
480 }
481
482 #[test]
483 fn two_bit_lossy_n_collapses_ambiguity_to_n() {
484 let seq = b"ACNGTRYACGT";
485 let encoded = TwoBitLossyNCodec::encode(seq).expect("encode should succeed");
486 let decoded =
487 TwoBitLossyNCodec::decode(&encoded, seq.len()).expect("decode should succeed");
488 assert_eq!(decoded, b"ACNGTNNACGT");
489 }
490
491 #[test]
492 fn two_bit_lossy_n_canonical_only() {
493 let seq = b"ACGTACGT";
494 let encoded = TwoBitLossyNCodec::encode(seq).expect("encode should succeed");
495 let decoded =
496 TwoBitLossyNCodec::decode(&encoded, seq.len()).expect("decode should succeed");
497 assert_eq!(&decoded, seq);
498 }
499
500 #[test]
501 fn two_bit_lossy_n_all_ambiguous() {
502 let seq = b"NRYSW";
503 let encoded = TwoBitLossyNCodec::encode(seq).expect("encode should succeed");
504 let decoded =
505 TwoBitLossyNCodec::decode(&encoded, seq.len()).expect("decode should succeed");
506 assert_eq!(decoded, b"NNNNN");
507 }
508
509 #[test]
510 fn two_bit_lossy_n_is_more_compact_than_exact() {
511 let seq = b"ACNGTRYACGT";
512 let exact = TwoBitExactCodec::encode(seq).expect("exact encode");
513 let lossy = TwoBitLossyNCodec::encode(seq).expect("lossy encode");
514 assert!(
515 lossy.len() < exact.len(),
516 "lossy should be more compact: lossy={}, exact={}",
517 lossy.len(),
518 exact.len()
519 );
520 }
521}