noodles_cram/
lib.rs

1//! **noodles-cram** handles the reading and writing of the CRAM format.
2
3#[cfg(feature = "async")]
4pub mod r#async;
5
6pub mod codecs;
7pub mod container;
8pub mod crai;
9pub mod file_definition;
10pub mod fs;
11mod huffman;
12pub mod io;
13pub mod record;
14
15use md5::{Digest, Md5};
16
17pub use self::{file_definition::FileDefinition, record::Record};
18
19const MAGIC_NUMBER: [u8; 4] = *b"CRAM";
20const MD5_OUTPUT_SIZE: usize = 16;
21
22// _Sequence Alignment/Map Format Specification_ (2021-06-03) § 1.3.2 "Reference MD5 calculation"
23fn calculate_normalized_sequence_digest(mut sequence: &[u8]) -> [u8; MD5_OUTPUT_SIZE] {
24    const MD5_BLOCK_SIZE: usize = 64;
25    const CHUNK_SIZE: usize = 8 * MD5_BLOCK_SIZE;
26
27    let mut hasher = Md5::new();
28    let mut buf = [0; CHUNK_SIZE];
29
30    while !sequence.is_empty() {
31        let n = sequence
32            .iter()
33            .position(|b| !b.is_ascii_graphic() || b.is_ascii_lowercase())
34            .unwrap_or(sequence.len());
35
36        hasher.update(&sequence[..n]);
37        sequence = &sequence[n..];
38
39        // "All lowercase characters are converted to uppercase."
40        loop {
41            let mut n = 0;
42
43            for (src, dst) in sequence
44                .iter()
45                .take_while(|b| b.is_ascii_lowercase())
46                .zip(&mut buf)
47            {
48                *dst = src.to_ascii_uppercase();
49                n += 1;
50            }
51
52            if n == 0 {
53                break;
54            }
55
56            hasher.update(&buf[..n]);
57            sequence = &sequence[n..];
58        }
59
60        // "All characters outside of the inclusive range 33 ('!') to 126 ('~') are stripped out."
61        let n = sequence
62            .iter()
63            .position(|b| b.is_ascii_graphic())
64            .unwrap_or(sequence.len());
65
66        sequence = &sequence[n..];
67    }
68
69    hasher.finalize().into()
70}
71
72#[cfg(test)]
73mod tests {
74    use super::*;
75
76    #[test]
77    fn test_calculate_normalized_sequence_digest() {
78        assert_eq!(
79            calculate_normalized_sequence_digest(b"ACGT"),
80            [
81                0xf1, 0xf8, 0xf4, 0xbf, 0x41, 0x3b, 0x16, 0xad, 0x13, 0x57, 0x22, 0xaa, 0x45, 0x91,
82                0x04, 0x3e
83            ]
84        );
85
86        assert_eq!(
87            calculate_normalized_sequence_digest(b" AC\tgt\n"),
88            [
89                0xf1, 0xf8, 0xf4, 0xbf, 0x41, 0x3b, 0x16, 0xad, 0x13, 0x57, 0x22, 0xaa, 0x45, 0x91,
90                0x04, 0x3e
91            ]
92        );
93
94        // _Sequence Alignment/Map Format Specification_ (2024-11-06) § 1.3.2 "Reference MD5
95        // calculation"
96        assert_eq!(
97            calculate_normalized_sequence_digest(b"ACGT ACGT ACGT\nacgt acgt acgt\n... 12345 !!!"),
98            [
99                0xdf, 0xab, 0xdb, 0xb3, 0x6e, 0x23, 0x9a, 0x6d, 0xa8, 0x89, 0x57, 0x84, 0x1f, 0x32,
100                0xb8, 0xe4
101            ]
102        );
103    }
104}