// jellyfish_reader/binary.rs

use std::io::{self, Read};

use crate::error::{Error, Result};
use crate::header::FileHeader;
use crate::mer::MerDna;
6
/// Sequential reader for Jellyfish binary/sorted format files.
///
/// Iterates over all (k-mer, count) pairs stored in a binary Jellyfish database.
/// Every record is a fixed-width key followed by a fixed-width little-endian
/// count; the widths are fixed when the reader is constructed.
///
/// # Examples
///
/// ```no_run
/// use std::fs::File;
/// use std::io::BufReader;
/// use jellyfish_reader::{FileHeader, BinaryReader};
///
/// let file = File::open("output.jf").unwrap();
/// let mut reader = BufReader::new(file);
/// let header = FileHeader::read(&mut reader).unwrap();
///
/// let binary_reader = BinaryReader::new(reader, &header).unwrap();
/// for result in binary_reader {
///     let (mer, count) = result.unwrap();
///     println!("{}: {}", mer, count);
/// }
/// ```
#[derive(Debug)]
pub struct BinaryReader<R: Read> {
    /// Underlying byte source the records are read from.
    reader: R,
    /// K-mer length in bases.
    k: usize,
    /// Scratch buffer holding one encoded key; its length is the key width in bytes.
    key_buf: Vec<u8>,
    /// Scratch buffer holding one encoded count; its length is the value width in bytes.
    val_buf: Vec<u8>,
}
34
35impl<R: Read> BinaryReader<R> {
36    /// Create a new binary reader from a reader positioned at the start of data.
37    ///
38    /// The reader should be positioned right after the header (i.e., `FileHeader::read`
39    /// has already been called on it).
40    pub fn new(reader: R, header: &FileHeader) -> Result<Self> {
41        let key_len_bytes = header
42            .key_bytes()
43            .ok_or_else(|| Error::MissingField("key_len".to_string()))?;
44        let val_len_bytes = header
45            .data_val_len()
46            .ok_or_else(|| Error::MissingField("counter_len or val_len".to_string()))?;
47        let k = header
48            .k()
49            .ok_or_else(|| Error::MissingField("key_len".to_string()))?;
50
51        Ok(Self {
52            reader,
53            k,
54            key_buf: vec![0u8; key_len_bytes],
55            val_buf: vec![0u8; val_len_bytes],
56        })
57    }
58
59    /// Create a binary reader with explicit parameters.
60    pub fn with_params(reader: R, k: usize, key_len_bytes: usize, val_len_bytes: usize) -> Self {
61        Self {
62            reader,
63            k,
64            key_buf: vec![0u8; key_len_bytes],
65            val_buf: vec![0u8; val_len_bytes],
66        }
67    }
68
69    /// K-mer length (number of bases).
70    pub fn k(&self) -> usize {
71        self.k
72    }
73
74    /// Read the next (k-mer, count) pair.
75    fn read_next(&mut self) -> Result<Option<(MerDna, u64)>> {
76        // Try to read the key
77        match self.reader.read_exact(&mut self.key_buf) {
78            Ok(()) => {}
79            Err(e) if e.kind() == std::io::ErrorKind::UnexpectedEof => return Ok(None),
80            Err(e) => return Err(Error::Io(e)),
81        }
82
83        // Read the value
84        self.reader.read_exact(&mut self.val_buf).map_err(|e| {
85            if e.kind() == std::io::ErrorKind::UnexpectedEof {
86                Error::UnexpectedEof
87            } else {
88                Error::Io(e)
89            }
90        })?;
91
92        let mer = MerDna::from_bytes(&self.key_buf, self.k);
93
94        // Parse value as little-endian unsigned integer
95        let mut count = 0u64;
96        for (i, &byte) in self.val_buf.iter().enumerate() {
97            count |= (byte as u64) << (i * 8);
98        }
99
100        Ok(Some((mer, count)))
101    }
102}
103
104impl<R: Read> Iterator for BinaryReader<R> {
105    type Item = Result<(MerDna, u64)>;
106
107    fn next(&mut self) -> Option<Self::Item> {
108        match self.read_next() {
109            Ok(Some(pair)) => Some(Ok(pair)),
110            Ok(None) => None,
111            Err(e) => Some(Err(e)),
112        }
113    }
114}
115
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Cursor;

    /// Serialize one record: the packed key bytes of `mer` followed by
    /// `val_len` little-endian bytes of `count`.
    fn encode_record(mer: &MerDna, count: u64, val_len: usize) -> Vec<u8> {
        let key_bytes = (mer.k() * 2 + 7) / 8;

        // Key: bytes of each word, least significant first, cut off once
        // `key_bytes` bytes have been produced.
        let mut record: Vec<u8> = mer
            .words()
            .iter()
            .flat_map(|&word| (0..8).map(move |byte_idx| (word >> (byte_idx * 8)) as u8))
            .take(key_bytes)
            .collect();

        // Value: little-endian, truncated to `val_len` bytes.
        record.extend((0..val_len).map(|i| (count >> (i * 8)) as u8));

        record
    }

    /// Run a reader over `data` and gather every item it yields.
    fn read_all(
        data: Vec<u8>,
        k: usize,
        key_bytes: usize,
        val_len: usize,
    ) -> Vec<Result<(MerDna, u64)>> {
        BinaryReader::with_params(Cursor::new(data), k, key_bytes, val_len).collect()
    }

    #[test]
    fn test_read_single_record() {
        let mer: MerDna = "ACGT".parse().unwrap();
        let val_len = 4;
        // 4 bases * 2 bits = 8 bits = 1 byte
        let key_bytes = 1;

        let results = read_all(encode_record(&mer, 42u64, val_len), 4, key_bytes, val_len);

        assert_eq!(results.len(), 1);
        let (read_mer, read_count) = results[0].as_ref().unwrap();
        assert_eq!(read_mer.to_string(), "ACGT");
        assert_eq!(*read_count, 42);
    }

    #[test]
    fn test_read_multiple_records() {
        let expected = [("AAAA", 10u64), ("ACGT", 42u64), ("TTTT", 100u64)];
        let val_len = 4;
        let key_bytes = 1;

        let data: Vec<u8> = expected
            .iter()
            .flat_map(|(seq, count)| {
                let mer: MerDna = seq.parse().unwrap();
                encode_record(&mer, *count, val_len)
            })
            .collect();

        let results: Vec<_> = read_all(data, 4, key_bytes, val_len)
            .into_iter()
            .map(|r| r.unwrap())
            .collect();

        assert_eq!(results.len(), 3);
        for ((mer, count), (seq, want)) in results.iter().zip(expected.iter()) {
            assert_eq!(mer.to_string(), *seq);
            assert_eq!(count, want);
        }
    }

    #[test]
    fn test_read_empty() {
        let results = read_all(Vec::new(), 4, 1, 4);
        assert_eq!(results.len(), 0);
    }

    #[test]
    fn test_read_truncated_value() {
        // A lone key byte with no value bytes after it must surface as an error.
        let mer: MerDna = "ACGT".parse().unwrap();
        let data = vec![mer.words()[0] as u8];

        let results = read_all(data, 4, 1, 4);

        assert_eq!(results.len(), 1);
        assert!(results[0].is_err());
    }

    #[test]
    fn test_read_large_count() {
        let mer: MerDna = "ACGT".parse().unwrap();
        let val_len = 8;

        let results: Vec<_> = read_all(encode_record(&mer, u64::MAX, val_len), 4, 1, val_len)
            .into_iter()
            .map(|r| r.unwrap())
            .collect();

        assert_eq!(results[0].1, u64::MAX);
    }

    #[test]
    fn test_read_longer_kmer() {
        // k=25 -> 50 bits -> ceil(50/8) = 7 key bytes
        let seq = "ACGTACGTACGTACGTACGTACGTA";
        let mer: MerDna = seq.parse().unwrap();
        let val_len = 4;
        let key_bytes = 7;

        let results: Vec<_> = read_all(encode_record(&mer, 99u64, val_len), 25, key_bytes, val_len)
            .into_iter()
            .map(|r| r.unwrap())
            .collect();

        assert_eq!(results.len(), 1);
        assert_eq!(results[0].0.to_string(), seq);
        assert_eq!(results[0].1, 99);
    }

    #[test]
    fn test_read_small_val_len() {
        let mer: MerDna = "ACGT".parse().unwrap();
        // 255 is the largest count a single value byte can hold.
        let val_len = 1;

        let results: Vec<_> = read_all(encode_record(&mer, 255u64, val_len), 4, 1, val_len)
            .into_iter()
            .map(|r| r.unwrap())
            .collect();

        assert_eq!(results[0].1, 255);
    }

    #[test]
    fn test_read_two_byte_count() {
        let mer: MerDna = "ACGT".parse().unwrap();
        let val_len = 2;

        let results: Vec<_> = read_all(encode_record(&mer, 1000u64, val_len), 4, 1, val_len)
            .into_iter()
            .map(|r| r.unwrap())
            .collect();

        assert_eq!(results[0].1, 1000);
    }
}