Skip to main content

jellyfish_reader/
lib.rs

1//! # jellyfish-reader
2//!
3//! A pure Rust library for reading [Jellyfish](https://github.com/gmarcais/Jellyfish)
4//! k-mer counting output files.
5//!
6//! Jellyfish is a fast, memory-efficient tool for counting k-mers in DNA sequences,
7//! widely used in bioinformatics. This crate provides native Rust readers for
8//! Jellyfish's binary and text output formats, with no C/C++ dependencies.
9//!
10//! ## Features
11//!
12//! - **Sequential reading** of binary/sorted and text/sorted Jellyfish files
13//! - **Random-access queries** via memory-mapped I/O with binary search
14//! - **K-mer representation** (`MerDna`) with canonical form, reverse complement,
15//!   and all standard operations
16//! - **String k-mer extraction** matching Jellyfish's `StringMers` interface
17//! - **Auto-format detection** from file headers
18//!
19//! ## Quick Start
20//!
21//! ```no_run
22//! use jellyfish_reader::{ReadMerFile, MerDna, QueryMerFile};
23//!
24//! // Sequential reading
25//! let reader = ReadMerFile::open("output.jf").unwrap();
26//! for result in reader {
27//!     let (mer, count) = result.unwrap();
28//!     println!("{}: {}", mer, count);
29//! }
30//!
31//! // Random access
32//! let qf = QueryMerFile::open("output.jf").unwrap();
33//! let mer: MerDna = "ACGTACGTACGTACGTACGTACGTA".parse().unwrap();
34//! if let Some(count) = qf.get(&mer) {
35//!     println!("Count: {}", count);
36//! }
37//! ```
38//!
39//! ## K-mer Operations
40//!
41//! ```
42//! use jellyfish_reader::MerDna;
43//!
44//! let mer: MerDna = "ACGT".parse().unwrap();
45//!
46//! // Reverse complement
47//! let rc = mer.get_reverse_complement();
48//! assert_eq!(rc.to_string(), "ACGT"); // ACGT is a palindrome
49//!
50//! // Canonical form (lexicographically smaller of self and RC)
51//! let canonical = mer.get_canonical();
52//!
53//! // Extract k-mers from a sequence
54//! use jellyfish_reader::StringMers;
55//! let kmers: Vec<_> = StringMers::new("ACGTACGT", 4)
56//!     .map(|m| m.to_string())
57//!     .collect();
58//! assert_eq!(kmers, vec!["ACGT", "CGTA", "GTAC", "TACG", "ACGT"]);
59//! ```
60
61pub mod binary;
62pub mod error;
63pub mod header;
64pub mod matrix;
65pub mod mer;
66pub mod query;
67pub mod string_mers;
68pub mod text;
69
70// Re-exports for convenience
71pub use binary::BinaryReader;
72pub use error::{Error, Result};
73pub use header::FileHeader;
74pub use matrix::RectangularBinaryMatrix;
75pub use mer::MerDna;
76pub use query::QueryMerFile;
77pub use string_mers::{StringMers, string_canonicals, string_mers};
78pub use text::TextReader;
79
80use std::fs::File;
81use std::io::BufReader;
82use std::path::Path;
83
84/// Unified sequential reader for Jellyfish output files.
85///
86/// Automatically detects the file format (binary/sorted or text/sorted)
87/// from the header and provides a single iterator interface.
88///
89/// # Examples
90///
91/// ```no_run
92/// use jellyfish_reader::ReadMerFile;
93///
94/// let reader = ReadMerFile::open("output.jf").unwrap();
95/// for result in reader {
96///     let (mer, count) = result.unwrap();
97///     println!("{}: {}", mer, count);
98/// }
99/// ```
100pub enum ReadMerFile {
101    /// Binary format reader.
102    Binary(BinaryReader<BufReader<File>>),
103    /// Text format reader.
104    Text(TextReader<BufReader<File>>),
105}
106
107impl ReadMerFile {
108    /// Open a Jellyfish file for sequential reading.
109    ///
110    /// The format is auto-detected from the file header.
111    pub fn open<P: AsRef<Path>>(path: P) -> Result<Self> {
112        let file = File::open(path.as_ref())?;
113        let mut reader = BufReader::new(file);
114        let header = FileHeader::read(&mut reader)?;
115
116        match header.format() {
117            header::FORMAT_BINARY_SORTED => {
118                let binary = BinaryReader::new(reader, &header)?;
119                Ok(ReadMerFile::Binary(binary))
120            }
121            header::FORMAT_TEXT_SORTED => {
122                let text = TextReader::new(reader);
123                Ok(ReadMerFile::Text(text))
124            }
125            other => Err(Error::UnsupportedFormat(other.to_string())),
126        }
127    }
128}
129
130impl Iterator for ReadMerFile {
131    type Item = Result<(MerDna, u64)>;
132
133    fn next(&mut self) -> Option<Self::Item> {
134        match self {
135            ReadMerFile::Binary(r) => r.next(),
136            ReadMerFile::Text(r) => r.next(),
137        }
138    }
139}
140
141#[cfg(test)]
142mod integration_tests {
143    use super::*;
144    use std::io::Write;
145    use tempfile::NamedTempFile;
146
147    /// Create a test binary/sorted Jellyfish file.
148    fn create_binary_jf(k: usize, val_len: usize, records: &[(&str, u64)]) -> NamedTempFile {
149        let key_len_bits = k * 2;
150        let key_len_bytes = (key_len_bits + 7) / 8;
151
152        let mut sorted_records: Vec<(MerDna, u64)> = records
153            .iter()
154            .map(|(s, c)| (s.parse::<MerDna>().unwrap(), *c))
155            .collect();
156        sorted_records.sort_by(|a, b| a.0.cmp(&b.0));
157
158        let size = sorted_records.len().next_power_of_two().max(2);
159        let json = serde_json::json!({
160            "format": "binary/sorted",
161            "key_len": key_len_bits,
162            "val_len": val_len,
163            "counter_len": val_len,
164            "size": size,
165            "canonical": false
166        });
167        let json_str = serde_json::to_string(&json).unwrap();
168
169        let mut file = NamedTempFile::new().unwrap();
170        write!(file, "{:09}", json_str.len()).unwrap();
171        file.write_all(json_str.as_bytes()).unwrap();
172
173        for (mer, count) in &sorted_records {
174            let words = mer.words();
175            let mut bytes_written = 0;
176            for &word in words {
177                for byte_idx in 0..8 {
178                    if bytes_written >= key_len_bytes {
179                        break;
180                    }
181                    file.write_all(&[(word >> (byte_idx * 8)) as u8]).unwrap();
182                    bytes_written += 1;
183                }
184            }
185            for i in 0..val_len {
186                file.write_all(&[(count >> (i * 8)) as u8]).unwrap();
187            }
188        }
189        file.flush().unwrap();
190        file
191    }
192
193    /// Create a test text/sorted Jellyfish file.
194    fn create_text_jf(k: usize, records: &[(&str, u64)]) -> NamedTempFile {
195        let json = serde_json::json!({
196            "format": "text/sorted",
197            "key_len": k * 2,
198            "counter_len": 8
199        });
200        let json_str = serde_json::to_string(&json).unwrap();
201
202        let mut file = NamedTempFile::new().unwrap();
203        write!(file, "{:09}", json_str.len()).unwrap();
204        file.write_all(json_str.as_bytes()).unwrap();
205
206        for (seq, count) in records {
207            writeln!(file, "{seq} {count}").unwrap();
208        }
209        file.flush().unwrap();
210        file
211    }
212
213    #[test]
214    fn test_read_binary_auto_detect() {
215        let file = create_binary_jf(4, 4, &[("ACGT", 42), ("AAAA", 10)]);
216        let reader = ReadMerFile::open(file.path()).unwrap();
217
218        let results: Vec<_> = reader.map(|r| r.unwrap()).collect();
219        assert_eq!(results.len(), 2);
220
221        let strs: Vec<String> = results.iter().map(|(m, _)| m.to_string()).collect();
222        let counts: Vec<u64> = results.iter().map(|(_, c)| *c).collect();
223
224        assert!(strs.contains(&"ACGT".to_string()));
225        assert!(strs.contains(&"AAAA".to_string()));
226
227        let acgt_idx = strs.iter().position(|s| s == "ACGT").unwrap();
228        let aaaa_idx = strs.iter().position(|s| s == "AAAA").unwrap();
229        assert_eq!(counts[acgt_idx], 42);
230        assert_eq!(counts[aaaa_idx], 10);
231    }
232
233    #[test]
234    fn test_read_text_auto_detect() {
235        let file = create_text_jf(4, &[("ACGT", 42), ("AAAA", 10)]);
236        let reader = ReadMerFile::open(file.path()).unwrap();
237
238        let results: Vec<_> = reader.map(|r| r.unwrap()).collect();
239        assert_eq!(results.len(), 2);
240        assert_eq!(results[0].0.to_string(), "ACGT");
241        assert_eq!(results[0].1, 42);
242        assert_eq!(results[1].0.to_string(), "AAAA");
243        assert_eq!(results[1].1, 10);
244    }
245
246    #[test]
247    fn test_binary_and_query_consistent() {
248        let records = [
249            ("AAAA", 10u64),
250            ("ACGT", 42),
251            ("CCCC", 7),
252            ("GGGG", 33),
253            ("TTTT", 100),
254        ];
255        let file = create_binary_jf(4, 4, &records);
256
257        let reader = ReadMerFile::open(file.path()).unwrap();
258        let seq_results: Vec<_> = reader.map(|r| r.unwrap()).collect();
259
260        let qf = QueryMerFile::open(file.path()).unwrap();
261
262        for (mer, count) in &seq_results {
263            let query_count = qf.get(mer);
264            assert_eq!(
265                query_count,
266                Some(*count),
267                "mismatch for {}: sequential={}, query={:?}",
268                mer,
269                count,
270                query_count
271            );
272        }
273    }
274
275    #[test]
276    fn test_roundtrip_various_k() {
277        for k in [1, 2, 4, 8, 16, 25, 31, 32, 33] {
278            let seq: String = "ACGT".chars().cycle().take(k).collect();
279            let file = create_binary_jf(k, 4, &[(&seq, 1)]);
280            let reader = ReadMerFile::open(file.path()).unwrap();
281            let results: Vec<_> = reader.map(|r| r.unwrap()).collect();
282            assert_eq!(results.len(), 1, "k={k}");
283            assert_eq!(results[0].0.to_string(), seq, "k={k}");
284            assert_eq!(results[0].1, 1, "k={k}");
285        }
286    }
287
288    #[test]
289    fn test_empty_binary_file() {
290        let file = create_binary_jf(4, 4, &[]);
291        let reader = ReadMerFile::open(file.path()).unwrap();
292        let results: Vec<_> = reader.collect();
293        assert_eq!(results.len(), 0);
294    }
295
296    #[test]
297    fn test_empty_text_file() {
298        let file = create_text_jf(4, &[]);
299        let reader = ReadMerFile::open(file.path()).unwrap();
300        let results: Vec<_> = reader.collect();
301        assert_eq!(results.len(), 0);
302    }
303
304    #[test]
305    fn test_large_counts() {
306        let file = create_binary_jf(4, 8, &[("ACGT", u64::MAX)]);
307        let reader = ReadMerFile::open(file.path()).unwrap();
308        let results: Vec<_> = reader.map(|r| r.unwrap()).collect();
309        assert_eq!(results[0].1, u64::MAX);
310    }
311
312    #[test]
313    fn test_many_records() {
314        let bases = ['A', 'C', 'G', 'T'];
315        let mut records = Vec::new();
316        for &a in &bases {
317            for &b in &bases {
318                for &c in &bases {
319                    records.push(format!("{a}{b}{c}"));
320                }
321            }
322        }
323        let records_with_counts: Vec<(&str, u64)> = records
324            .iter()
325            .enumerate()
326            .map(|(i, s)| (s.as_str(), (i + 1) as u64))
327            .collect();
328
329        let file = create_binary_jf(3, 4, &records_with_counts);
330
331        let reader = ReadMerFile::open(file.path()).unwrap();
332        let results: Vec<_> = reader.map(|r| r.unwrap()).collect();
333        assert_eq!(results.len(), 64);
334
335        let qf = QueryMerFile::open(file.path()).unwrap();
336        for (seq, expected_count) in &records_with_counts {
337            let mer: MerDna = seq.parse().unwrap();
338            let count = qf.get(&mer);
339            assert_eq!(count, Some(*expected_count), "query mismatch for {seq}");
340        }
341    }
342}