jellyfish_reader/
header.rs1use std::io::Read;
2
3use crate::error::{Error, Result};
4use crate::matrix::RectangularBinaryMatrix;
5
/// Value of the header's `format` field that [`FileHeader::is_binary`] compares against.
pub const FORMAT_BINARY_SORTED: &str = "binary/sorted";

/// Value of the header's `format` field that [`FileHeader::is_text`] compares against.
pub const FORMAT_TEXT_SORTED: &str = "text/sorted";

/// Format identifier `"bloomcounter"`.
///
/// NOTE(review): not referenced elsewhere in this file; presumably the `format` value of
/// Bloom-counter files — confirm against the callers that use it.
pub const FORMAT_BLOOM_COUNTER: &str = "bloomcounter";

/// Number of bytes at the very start of the stream that hold the decimal length of the
/// JSON header body (see [`FileHeader::read`]).
const HEADER_LENGTH_DIGITS: usize = 9;
17
/// Parsed file header: a JSON document preceded by a fixed-width decimal length prefix.
///
/// Instances are produced by [`FileHeader::read`]. The JSON is kept as an untyped
/// [`serde_json::Value`]; the accessor methods look fields up by name on demand.
#[derive(Debug, Clone)]
pub struct FileHeader {
    /// The header body exactly as parsed from JSON.
    raw: serde_json::Value,
    /// Bytes consumed by the header: the length prefix plus the declared body length.
    offset: usize,
}
45
46impl FileHeader {
47 pub fn read<R: Read>(reader: &mut R) -> Result<Self> {
52 let mut len_buf = [0u8; HEADER_LENGTH_DIGITS];
54 reader
55 .read_exact(&mut len_buf)
56 .map_err(|_| Error::InvalidHeader("could not read header length prefix".to_string()))?;
57
58 let len_str = std::str::from_utf8(&len_buf)
59 .map_err(|_| Error::InvalidHeader("header length is not valid ASCII".to_string()))?;
60
61 let header_len: usize = len_str
62 .trim()
63 .parse()
64 .map_err(|_| Error::InvalidHeader(format!("invalid header length: {len_str:?}")))?;
65
66 let mut json_buf = vec![0u8; header_len];
68 reader
69 .read_exact(&mut json_buf)
70 .map_err(|_| Error::InvalidHeader("could not read header JSON content".to_string()))?;
71
72 let json_end = json_buf
74 .iter()
75 .position(|&b| b == 0)
76 .unwrap_or(json_buf.len());
77 let json_str = std::str::from_utf8(&json_buf[..json_end])
78 .map_err(|_| Error::InvalidHeader("header JSON is not valid UTF-8".to_string()))?;
79
80 let raw: serde_json::Value = serde_json::from_str(json_str)?;
81
82 let offset = HEADER_LENGTH_DIGITS + header_len;
83
84 Ok(Self { raw, offset })
85 }
86
87 pub fn offset(&self) -> usize {
89 self.offset
90 }
91
92 pub fn format(&self) -> &str {
94 self.raw
95 .get("format")
96 .and_then(|v| v.as_str())
97 .unwrap_or("")
98 }
99
100 pub fn is_binary(&self) -> bool {
102 self.format() == FORMAT_BINARY_SORTED
103 }
104
105 pub fn is_text(&self) -> bool {
107 self.format() == FORMAT_TEXT_SORTED
108 }
109
110 pub fn size(&self) -> Option<u64> {
112 self.raw.get("size").and_then(|v| v.as_u64())
113 }
114
115 pub fn key_len(&self) -> Option<u64> {
117 self.raw.get("key_len").and_then(|v| v.as_u64())
118 }
119
120 pub fn k(&self) -> Option<usize> {
122 self.key_len().map(|kl| (kl / 2) as usize)
123 }
124
125 pub fn val_len(&self) -> Option<u64> {
127 self.raw.get("val_len").and_then(|v| v.as_u64())
128 }
129
130 pub fn counter_len(&self) -> Option<u64> {
132 self.raw.get("counter_len").and_then(|v| v.as_u64())
133 }
134
135 pub fn max_reprobe(&self) -> Option<u64> {
137 self.raw.get("max_reprobe").and_then(|v| v.as_u64())
138 }
139
140 pub fn canonical(&self) -> bool {
142 self.raw
143 .get("canonical")
144 .and_then(|v| v.as_bool())
145 .unwrap_or(false)
146 }
147
148 pub fn nb_hashes(&self) -> Option<u64> {
150 self.raw.get("nb_hashes").and_then(|v| v.as_u64())
151 }
152
153 pub fn fpr(&self) -> Option<f64> {
155 self.raw.get("fpr").and_then(|v| v.as_f64())
156 }
157
158 pub fn hostname(&self) -> Option<&str> {
160 self.raw.get("hostname").and_then(|v| v.as_str())
161 }
162
163 pub fn pwd(&self) -> Option<&str> {
165 self.raw.get("pwd").and_then(|v| v.as_str())
166 }
167
168 pub fn time(&self) -> Option<&str> {
170 self.raw.get("time").and_then(|v| v.as_str())
171 }
172
173 pub fn exe_path(&self) -> Option<&str> {
175 self.raw.get("exe_path").and_then(|v| v.as_str())
176 }
177
178 pub fn cmdline(&self) -> Option<Vec<&str>> {
180 self.raw.get("cmdline").and_then(|v| {
181 v.as_array()
182 .map(|arr| arr.iter().filter_map(|s| s.as_str()).collect())
183 })
184 }
185
186 pub fn matrix(&self, index: usize) -> Result<RectangularBinaryMatrix> {
188 let key = format!("matrix{index}");
189 match self.raw.get(&key) {
190 Some(v) => RectangularBinaryMatrix::from_json(v),
191 None => {
192 if index == 0 {
194 match self.raw.get("matrix") {
195 Some(v) => RectangularBinaryMatrix::from_json(v),
196 None => Ok(RectangularBinaryMatrix::identity(64)),
197 }
198 } else {
199 Err(Error::MissingField(key))
200 }
201 }
202 }
203 }
204
205 pub fn reprobes(&self) -> Option<Vec<u64>> {
207 self.raw.get("reprobes").and_then(|v| {
208 v.as_array().map(|arr| {
209 arr.iter()
210 .filter_map(|v| v.as_u64().or_else(|| v.as_i64().map(|i| i as u64)))
211 .collect()
212 })
213 })
214 }
215
216 pub fn raw_json(&self) -> &serde_json::Value {
218 &self.raw
219 }
220
221 pub fn key_bytes(&self) -> Option<usize> {
223 self.key_len().map(|bits| {
224 let bits = bits as usize;
225 bits.div_ceil(8)
226 })
227 }
228
229 pub fn data_val_len(&self) -> Option<usize> {
231 self.counter_len()
232 .or_else(|| self.val_len())
233 .map(|v| v as usize)
234 }
235}
236
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Cursor;

    /// Serializes `json` and prepends the zero-padded nine-digit length prefix.
    fn make_header_bytes(json: &serde_json::Value) -> Vec<u8> {
        let body = serde_json::to_string(json).unwrap();
        let mut out = format!("{:09}", body.len()).into_bytes();
        out.extend_from_slice(body.as_bytes());
        out
    }

    /// Encodes `json` as header bytes and parses them back, panicking on failure.
    fn parse(json: &serde_json::Value) -> FileHeader {
        FileHeader::read(&mut Cursor::new(make_header_bytes(json))).unwrap()
    }

    #[test]
    fn test_parse_minimal_header() {
        let header = parse(&serde_json::json!({
            "format": "binary/sorted",
            "key_len": 50,
            "val_len": 4,
            "size": 1024,
            "canonical": true
        }));

        assert_eq!(header.format(), "binary/sorted");
        assert!(header.is_binary());
        assert!(!header.is_text());
        assert_eq!(header.key_len(), Some(50));
        assert_eq!(header.k(), Some(25));
        assert_eq!(header.val_len(), Some(4));
        assert_eq!(header.size(), Some(1024));
        assert!(header.canonical());
    }

    #[test]
    fn test_parse_text_format_header() {
        let header = parse(&serde_json::json!({
            "format": "text/sorted",
            "key_len": 42,
            "counter_len": 8
        }));

        assert!(header.is_text());
        assert!(!header.is_binary());
        assert_eq!(header.k(), Some(21));
    }

    #[test]
    fn test_header_offset() {
        let json = serde_json::json!({"format": "binary/sorted"});
        // Prefix width plus the serialized body length.
        let expected_offset = 9 + serde_json::to_string(&json).unwrap().len();

        assert_eq!(parse(&json).offset(), expected_offset);
    }

    #[test]
    fn test_header_with_padding() {
        let json = serde_json::json!({"format": "binary/sorted", "key_len": 50});
        let body = serde_json::to_string(&json).unwrap();

        // The declared length covers the JSON plus ten trailing NUL bytes.
        let padded_len = body.len() + 10;
        let mut buf = format!("{:09}", padded_len).into_bytes();
        buf.extend_from_slice(body.as_bytes());
        buf.extend_from_slice(&[0u8; 10]);

        let header = FileHeader::read(&mut Cursor::new(buf)).unwrap();
        assert_eq!(header.format(), "binary/sorted");
        assert_eq!(header.key_len(), Some(50));
    }

    #[test]
    fn test_header_metadata_fields() {
        let header = parse(&serde_json::json!({
            "format": "binary/sorted",
            "hostname": "testhost",
            "pwd": "/tmp",
            "time": "2024-01-01",
            "exe_path": "/usr/bin/jellyfish",
            "cmdline": ["jellyfish", "count", "-m", "25", "input.fa"],
            "key_len": 50,
            "val_len": 4,
            "counter_len": 4,
            "max_reprobe": 126,
            "canonical": false
        }));

        assert_eq!(header.hostname(), Some("testhost"));
        assert_eq!(header.pwd(), Some("/tmp"));
        assert_eq!(header.time(), Some("2024-01-01"));
        assert_eq!(header.exe_path(), Some("/usr/bin/jellyfish"));
        assert_eq!(
            header.cmdline(),
            Some(vec!["jellyfish", "count", "-m", "25", "input.fa"])
        );
        assert_eq!(header.max_reprobe(), Some(126));
        assert!(!header.canonical());
    }

    #[test]
    fn test_header_key_bytes() {
        let header = parse(&serde_json::json!({"format": "binary/sorted", "key_len": 50}));
        // 50 bits round up to 7 whole bytes.
        assert_eq!(header.key_bytes(), Some(7));
    }

    #[test]
    fn test_header_key_bytes_aligned() {
        let header = parse(&serde_json::json!({"format": "binary/sorted", "key_len": 64}));
        // 64 bits fit exactly into 8 bytes.
        assert_eq!(header.key_bytes(), Some(8));
    }

    #[test]
    fn test_invalid_header_too_short() {
        // Only three bytes: not enough for the length prefix.
        let mut cursor = Cursor::new(b"123");
        assert!(FileHeader::read(&mut cursor).is_err());
    }

    #[test]
    fn test_invalid_header_bad_length() {
        // Nine bytes, but not a number.
        let mut cursor = Cursor::new(b"not_a_num");
        assert!(FileHeader::read(&mut cursor).is_err());
    }

    #[test]
    fn test_invalid_header_bad_json() {
        let bad_json = b"not json!";
        let mut buf = format!("{:09}", bad_json.len()).into_bytes();
        buf.extend_from_slice(bad_json);

        assert!(FileHeader::read(&mut Cursor::new(buf)).is_err());
    }

    #[test]
    fn test_missing_optional_fields() {
        let header = parse(&serde_json::json!({"format": "binary/sorted"}));

        assert_eq!(header.size(), None);
        assert_eq!(header.key_len(), None);
        assert_eq!(header.val_len(), None);
        assert_eq!(header.hostname(), None);
        assert_eq!(header.pwd(), None);
        assert_eq!(header.time(), None);
        assert_eq!(header.exe_path(), None);
        assert_eq!(header.cmdline(), None);
        // An absent `canonical` flag reads as false.
        assert!(!header.canonical());
    }

    #[test]
    fn test_data_val_len_prefers_counter_len() {
        let header = parse(&serde_json::json!({
            "format": "binary/sorted",
            "val_len": 4,
            "counter_len": 8
        }));
        // `counter_len` wins when both fields are present.
        assert_eq!(header.data_val_len(), Some(8));
    }

    #[test]
    fn test_data_val_len_fallback_to_val_len() {
        let header = parse(&serde_json::json!({
            "format": "binary/sorted",
            "val_len": 4
        }));
        assert_eq!(header.data_val_len(), Some(4));
    }
}