Skip to main content

jellyfish_reader/
header.rs

1use std::io::Read;
2
3use crate::error::{Error, Result};
4use crate::matrix::RectangularBinaryMatrix;
5
/// Value of the header's `format` field for binary/sorted Jellyfish files.
pub const FORMAT_BINARY_SORTED: &str = "binary/sorted";

/// Value of the header's `format` field for text/sorted Jellyfish files.
pub const FORMAT_TEXT_SORTED: &str = "text/sorted";

/// Value of the header's `format` field for bloom counter Jellyfish files.
pub const FORMAT_BLOOM_COUNTER: &str = "bloomcounter";

/// Number of ASCII digits used for the header length prefix.
///
/// `FileHeader::read` consumes exactly this many bytes before the JSON body
/// and adds this value to the body length to compute the data offset.
const HEADER_LENGTH_DIGITS: usize = 9;
17
/// Parsed Jellyfish file header.
///
/// Wraps the JSON value found at the start of a Jellyfish output file,
/// together with the number of bytes the header occupies. Typed accessors
/// expose k-mer parameters, hash table configuration, and provenance
/// information; a field that is absent from the JSON (or has an unexpected
/// JSON type) is reported as `None` or a documented default rather than as
/// an error.
///
/// # Examples
///
/// ```no_run
/// use std::fs::File;
/// use std::io::BufReader;
/// use jellyfish_reader::FileHeader;
///
/// let file = File::open("output.jf").unwrap();
/// let mut reader = BufReader::new(file);
/// let header = FileHeader::read(&mut reader).unwrap();
///
/// println!("k = {}", header.k().unwrap());
/// println!("format: {}", header.format());
/// ```
#[derive(Debug, Clone)]
pub struct FileHeader {
    /// The raw JSON root value, exactly as parsed from the header.
    raw: serde_json::Value,
    /// Total header size in bytes: the length prefix plus the JSON body and
    /// any padding counted by that prefix. This is the byte offset of the
    /// data section when the header starts at the beginning of the file.
    offset: usize,
}
45
46impl FileHeader {
47    /// Read and parse a Jellyfish file header from the given reader.
48    ///
49    /// This reads the 9-digit length prefix, then the JSON header content.
50    /// After this call, the reader is positioned at the start of the data section.
51    pub fn read<R: Read>(reader: &mut R) -> Result<Self> {
52        // Read the 9-digit decimal length prefix
53        let mut len_buf = [0u8; HEADER_LENGTH_DIGITS];
54        reader
55            .read_exact(&mut len_buf)
56            .map_err(|_| Error::InvalidHeader("could not read header length prefix".to_string()))?;
57
58        let len_str = std::str::from_utf8(&len_buf)
59            .map_err(|_| Error::InvalidHeader("header length is not valid ASCII".to_string()))?;
60
61        let header_len: usize = len_str
62            .trim()
63            .parse()
64            .map_err(|_| Error::InvalidHeader(format!("invalid header length: {len_str:?}")))?;
65
66        // Read the JSON content
67        let mut json_buf = vec![0u8; header_len];
68        reader
69            .read_exact(&mut json_buf)
70            .map_err(|_| Error::InvalidHeader("could not read header JSON content".to_string()))?;
71
72        // Strip trailing null bytes (alignment padding)
73        let json_end = json_buf
74            .iter()
75            .position(|&b| b == 0)
76            .unwrap_or(json_buf.len());
77        let json_str = std::str::from_utf8(&json_buf[..json_end])
78            .map_err(|_| Error::InvalidHeader("header JSON is not valid UTF-8".to_string()))?;
79
80        let raw: serde_json::Value = serde_json::from_str(json_str)?;
81
82        let offset = HEADER_LENGTH_DIGITS + header_len;
83
84        Ok(Self { raw, offset })
85    }
86
87    /// Byte offset from the start of the file to the data section.
88    pub fn offset(&self) -> usize {
89        self.offset
90    }
91
92    /// The file format string (e.g., "binary/sorted", "text/sorted").
93    pub fn format(&self) -> &str {
94        self.raw
95            .get("format")
96            .and_then(|v| v.as_str())
97            .unwrap_or("")
98    }
99
100    /// Whether this is a binary/sorted format file.
101    pub fn is_binary(&self) -> bool {
102        self.format() == FORMAT_BINARY_SORTED
103    }
104
105    /// Whether this is a text/sorted format file.
106    pub fn is_text(&self) -> bool {
107        self.format() == FORMAT_TEXT_SORTED
108    }
109
110    /// Hash table size (number of slots).
111    pub fn size(&self) -> Option<u64> {
112        self.raw.get("size").and_then(|v| v.as_u64())
113    }
114
115    /// K-mer length in bits (= 2 * k). Divide by 2 to get k.
116    pub fn key_len(&self) -> Option<u64> {
117        self.raw.get("key_len").and_then(|v| v.as_u64())
118    }
119
120    /// K-mer length (number of bases).
121    pub fn k(&self) -> Option<usize> {
122        self.key_len().map(|kl| (kl / 2) as usize)
123    }
124
125    /// Counter value length in bytes.
126    pub fn val_len(&self) -> Option<u64> {
127        self.raw.get("val_len").and_then(|v| v.as_u64())
128    }
129
130    /// Counter length in the data section (may differ from val_len).
131    pub fn counter_len(&self) -> Option<u64> {
132        self.raw.get("counter_len").and_then(|v| v.as_u64())
133    }
134
135    /// Maximum reprobing distance.
136    pub fn max_reprobe(&self) -> Option<u64> {
137        self.raw.get("max_reprobe").and_then(|v| v.as_u64())
138    }
139
140    /// Whether canonical k-mers were used.
141    pub fn canonical(&self) -> bool {
142        self.raw
143            .get("canonical")
144            .and_then(|v| v.as_bool())
145            .unwrap_or(false)
146    }
147
148    /// Number of hash functions (for bloom counter format).
149    pub fn nb_hashes(&self) -> Option<u64> {
150        self.raw.get("nb_hashes").and_then(|v| v.as_u64())
151    }
152
153    /// False positive rate (for bloom counter format).
154    pub fn fpr(&self) -> Option<f64> {
155        self.raw.get("fpr").and_then(|v| v.as_f64())
156    }
157
158    /// The hostname of the machine that created this file.
159    pub fn hostname(&self) -> Option<&str> {
160        self.raw.get("hostname").and_then(|v| v.as_str())
161    }
162
163    /// The working directory when the file was created.
164    pub fn pwd(&self) -> Option<&str> {
165        self.raw.get("pwd").and_then(|v| v.as_str())
166    }
167
168    /// The timestamp when the file was created.
169    pub fn time(&self) -> Option<&str> {
170        self.raw.get("time").and_then(|v| v.as_str())
171    }
172
173    /// The path to the jellyfish executable that created this file.
174    pub fn exe_path(&self) -> Option<&str> {
175        self.raw.get("exe_path").and_then(|v| v.as_str())
176    }
177
178    /// The command line arguments used to create this file.
179    pub fn cmdline(&self) -> Option<Vec<&str>> {
180        self.raw.get("cmdline").and_then(|v| {
181            v.as_array()
182                .map(|arr| arr.iter().filter_map(|s| s.as_str()).collect())
183        })
184    }
185
186    /// Parse the hash matrix at the given index from the header.
187    pub fn matrix(&self, index: usize) -> Result<RectangularBinaryMatrix> {
188        let key = format!("matrix{index}");
189        match self.raw.get(&key) {
190            Some(v) => RectangularBinaryMatrix::from_json(v),
191            None => {
192                // Try without index for the default matrix
193                if index == 0 {
194                    match self.raw.get("matrix") {
195                        Some(v) => RectangularBinaryMatrix::from_json(v),
196                        None => Ok(RectangularBinaryMatrix::identity(64)),
197                    }
198                } else {
199                    Err(Error::MissingField(key))
200                }
201            }
202        }
203    }
204
205    /// Get the reprobes array from the header.
206    pub fn reprobes(&self) -> Option<Vec<u64>> {
207        self.raw.get("reprobes").and_then(|v| {
208            v.as_array().map(|arr| {
209                arr.iter()
210                    .filter_map(|v| v.as_u64().or_else(|| v.as_i64().map(|i| i as u64)))
211                    .collect()
212            })
213        })
214    }
215
216    /// Access the raw JSON header value.
217    pub fn raw_json(&self) -> &serde_json::Value {
218        &self.raw
219    }
220
221    /// Compute key length in bytes for binary format.
222    pub fn key_bytes(&self) -> Option<usize> {
223        self.key_len().map(|bits| {
224            let bits = bits as usize;
225            bits.div_ceil(8)
226        })
227    }
228
229    /// Compute the counter length in bytes for the data section.
230    pub fn data_val_len(&self) -> Option<usize> {
231        self.counter_len()
232            .or_else(|| self.val_len())
233            .map(|v| v as usize)
234    }
235}
236
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Cursor;

    /// Serialize `json` and prepend the zero-padded 9-digit length prefix,
    /// producing the bytes of a fake Jellyfish header.
    fn make_header_bytes(json: &serde_json::Value) -> Vec<u8> {
        let body = serde_json::to_string(json).unwrap();
        let mut bytes = format!("{:09}", body.len()).into_bytes();
        bytes.extend_from_slice(body.as_bytes());
        bytes
    }

    /// Encode `json` as a header and parse it back, panicking on failure.
    fn parse(json: &serde_json::Value) -> FileHeader {
        FileHeader::read(&mut Cursor::new(make_header_bytes(json))).unwrap()
    }

    #[test]
    fn test_parse_minimal_header() {
        let header = parse(&serde_json::json!({
            "format": "binary/sorted",
            "key_len": 50,
            "val_len": 4,
            "size": 1024,
            "canonical": true
        }));

        assert_eq!(header.format(), "binary/sorted");
        assert!(header.is_binary());
        assert!(!header.is_text());
        assert_eq!(header.key_len(), Some(50));
        assert_eq!(header.k(), Some(25));
        assert_eq!(header.val_len(), Some(4));
        assert_eq!(header.size(), Some(1024));
        assert!(header.canonical());
    }

    #[test]
    fn test_parse_text_format_header() {
        let header = parse(&serde_json::json!({
            "format": "text/sorted",
            "key_len": 42,
            "counter_len": 8
        }));

        assert!(header.is_text());
        assert!(!header.is_binary());
        assert_eq!(header.k(), Some(21));
    }

    #[test]
    fn test_header_offset() {
        let json = serde_json::json!({"format": "binary/sorted"});
        let body_len = serde_json::to_string(&json).unwrap().len();

        // Offset is the 9-digit prefix plus the serialized JSON length.
        assert_eq!(parse(&json).offset(), 9 + body_len);
    }

    #[test]
    fn test_header_with_padding() {
        const PADDING: usize = 10;

        let json = serde_json::json!({"format": "binary/sorted", "key_len": 50});
        let body = serde_json::to_string(&json).unwrap();

        // The declared length covers the JSON text plus the NUL padding.
        let mut bytes = format!("{:09}", body.len() + PADDING).into_bytes();
        bytes.extend_from_slice(body.as_bytes());
        bytes.resize(bytes.len() + PADDING, 0);

        let header = FileHeader::read(&mut Cursor::new(bytes)).unwrap();
        assert_eq!(header.format(), "binary/sorted");
        assert_eq!(header.key_len(), Some(50));
    }

    #[test]
    fn test_header_metadata_fields() {
        let header = parse(&serde_json::json!({
            "format": "binary/sorted",
            "hostname": "testhost",
            "pwd": "/tmp",
            "time": "2024-01-01",
            "exe_path": "/usr/bin/jellyfish",
            "cmdline": ["jellyfish", "count", "-m", "25", "input.fa"],
            "key_len": 50,
            "val_len": 4,
            "counter_len": 4,
            "max_reprobe": 126,
            "canonical": false
        }));

        assert_eq!(header.hostname(), Some("testhost"));
        assert_eq!(header.pwd(), Some("/tmp"));
        assert_eq!(header.time(), Some("2024-01-01"));
        assert_eq!(header.exe_path(), Some("/usr/bin/jellyfish"));
        assert_eq!(
            header.cmdline(),
            Some(vec!["jellyfish", "count", "-m", "25", "input.fa"])
        );
        assert_eq!(header.max_reprobe(), Some(126));
        assert!(!header.canonical());
    }

    #[test]
    fn test_header_key_bytes() {
        let header = parse(&serde_json::json!({"format": "binary/sorted", "key_len": 50}));
        // 50 bits -> ceil(50/8) = 7 bytes
        assert_eq!(header.key_bytes(), Some(7));
    }

    #[test]
    fn test_header_key_bytes_aligned() {
        let header = parse(&serde_json::json!({"format": "binary/sorted", "key_len": 64}));
        // 64 bits -> 8 bytes exactly
        assert_eq!(header.key_bytes(), Some(8));
    }

    #[test]
    fn test_invalid_header_too_short() {
        // Fewer bytes than the length prefix requires.
        assert!(FileHeader::read(&mut Cursor::new(b"123")).is_err());
    }

    #[test]
    fn test_invalid_header_bad_length() {
        // Nine bytes, but not a decimal number.
        assert!(FileHeader::read(&mut Cursor::new(b"not_a_num")).is_err());
    }

    #[test]
    fn test_invalid_header_bad_json() {
        let bad_json = b"not json!";
        let mut bytes = format!("{:09}", bad_json.len()).into_bytes();
        bytes.extend_from_slice(bad_json);

        assert!(FileHeader::read(&mut Cursor::new(bytes)).is_err());
    }

    #[test]
    fn test_missing_optional_fields() {
        let header = parse(&serde_json::json!({"format": "binary/sorted"}));

        assert_eq!(header.size(), None);
        assert_eq!(header.key_len(), None);
        assert_eq!(header.val_len(), None);
        assert_eq!(header.hostname(), None);
        assert_eq!(header.pwd(), None);
        assert_eq!(header.time(), None);
        assert_eq!(header.exe_path(), None);
        assert_eq!(header.cmdline(), None);
        assert!(!header.canonical()); // defaults to false
    }

    #[test]
    fn test_data_val_len_prefers_counter_len() {
        let header = parse(&serde_json::json!({
            "format": "binary/sorted",
            "val_len": 4,
            "counter_len": 8
        }));
        assert_eq!(header.data_val_len(), Some(8)); // counter_len takes priority
    }

    #[test]
    fn test_data_val_len_fallback_to_val_len() {
        let header = parse(&serde_json::json!({
            "format": "binary/sorted",
            "val_len": 4
        }));
        assert_eq!(header.data_val_len(), Some(4));
    }
}