Skip to main content

jellyfish_reader/
text.rs

1use std::io::BufRead;
2
3use crate::error::{Error, Result};
4use crate::mer::MerDna;
5
/// Sequential reader for Jellyfish text/sorted format files.
///
/// Every data line has the form `<kmer_string> <count>`, where the two fields are
/// separated by whitespace (a space or a tab). The reader implements [`Iterator`]
/// and yields one `Result<(MerDna, u64)>` per parsed line, so malformed input is
/// reported through the iterator's items rather than by panicking.
///
/// The underlying reader must already be positioned at the start of the data,
/// i.e. after any file header has been consumed.
///
/// # Examples
///
/// ```no_run
/// use std::fs::File;
/// use std::io::BufReader;
/// use jellyfish_reader::{FileHeader, TextReader};
///
/// let file = File::open("output.txt").unwrap();
/// let mut reader = BufReader::new(file);
/// let header = FileHeader::read(&mut reader).unwrap();
///
/// let text_reader = TextReader::new(reader);
/// for result in text_reader {
///     let (mer, count) = result.unwrap();
///     println!("{}: {}", mer, count);
/// }
/// ```
#[derive(Debug)]
pub struct TextReader<R: BufRead> {
    /// Buffered source the lines are pulled from.
    reader: R,
    /// Scratch buffer reused for every line so reading does not allocate per record.
    line_buf: String,
}
31
32impl<R: BufRead> TextReader<R> {
33    /// Create a new text reader from a buffered reader positioned at the start of data.
34    pub fn new(reader: R) -> Self {
35        Self {
36            reader,
37            line_buf: String::new(),
38        }
39    }
40
41    fn read_next(&mut self) -> Result<Option<(MerDna, u64)>> {
42        self.line_buf.clear();
43        let bytes_read = self.reader.read_line(&mut self.line_buf)?;
44        if bytes_read == 0 {
45            return Ok(None);
46        }
47
48        let line = self.line_buf.trim();
49        if line.is_empty() {
50            return Ok(None);
51        }
52
53        // Format: "<kmer> <count>" or "<kmer>\t<count>"
54        let (mer_str, count_str) = line
55            .split_once(|c: char| c.is_whitespace())
56            .ok_or_else(|| Error::InvalidHeader(format!("invalid text format line: {line:?}")))?;
57
58        let mer: MerDna = mer_str.parse()?;
59        let count: u64 = count_str
60            .trim()
61            .parse()
62            .map_err(|_| Error::InvalidHeader(format!("invalid count value: {count_str:?}")))?;
63
64        Ok(Some((mer, count)))
65    }
66}
67
68impl<R: BufRead> Iterator for TextReader<R> {
69    type Item = Result<(MerDna, u64)>;
70
71    fn next(&mut self) -> Option<Self::Item> {
72        match self.read_next() {
73            Ok(Some(pair)) => Some(Ok(pair)),
74            Ok(None) => None,
75            Err(e) => Some(Err(e)),
76        }
77    }
78}
79
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::{BufReader, Cursor};

    /// Build a `TextReader` over an in-memory copy of `content`.
    fn make_text_reader(content: &str) -> TextReader<BufReader<Cursor<Vec<u8>>>> {
        let bytes = content.as_bytes().to_vec();
        TextReader::new(BufReader::new(Cursor::new(bytes)))
    }

    /// Collect every record from `content`, panicking on the first error item.
    fn records(content: &str) -> Vec<(MerDna, u64)> {
        make_text_reader(content)
            .map(|item| item.unwrap())
            .collect()
    }

    /// Collect every iterator item from `content`, keeping errors as values.
    fn outcomes(content: &str) -> Vec<Result<(MerDna, u64)>> {
        make_text_reader(content).collect()
    }

    /// Assert that `content` produces exactly one item and that it is an error.
    fn assert_single_error(content: &str) {
        let items = outcomes(content);
        assert_eq!(items.len(), 1);
        assert!(items[0].is_err());
    }

    #[test]
    fn test_read_single_line() {
        let parsed = records("ACGT 42\n");
        assert_eq!(parsed.len(), 1);
        let (mer, count) = &parsed[0];
        assert_eq!(mer.to_string(), "ACGT");
        assert_eq!(*count, 42);
    }

    #[test]
    fn test_read_multiple_lines() {
        let parsed = records("AAAA 10\nACGT 42\nTTTT 100\n");
        assert_eq!(parsed.len(), 3);

        let expected = [("AAAA", 10u64), ("ACGT", 42), ("TTTT", 100)];
        for ((mer, count), (want_mer, want_count)) in parsed.iter().zip(expected) {
            assert_eq!(mer.to_string(), want_mer);
            assert_eq!(*count, want_count);
        }
    }

    #[test]
    fn test_read_tab_separated() {
        let parsed = records("ACGT\t42\n");
        assert_eq!(parsed.len(), 1);
        let (mer, count) = &parsed[0];
        assert_eq!(mer.to_string(), "ACGT");
        assert_eq!(*count, 42);
    }

    #[test]
    fn test_read_empty() {
        assert_eq!(outcomes("").len(), 0);
    }

    #[test]
    fn test_read_no_trailing_newline() {
        let parsed = records("ACGT 42");
        assert_eq!(parsed.len(), 1);
        let (mer, count) = &parsed[0];
        assert_eq!(mer.to_string(), "ACGT");
        assert_eq!(*count, 42);
    }

    #[test]
    fn test_read_large_count() {
        // 18446744073709551615 == u64::MAX
        let parsed = records("ACGT 18446744073709551615\n");
        assert_eq!(parsed[0].1, u64::MAX);
    }

    #[test]
    fn test_read_longer_kmer() {
        let parsed = records("ACGTACGTACGTACGTACGTACGTA 99\n");
        let (mer, count) = &parsed[0];
        assert_eq!(mer.to_string(), "ACGTACGTACGTACGTACGTACGTA");
        assert_eq!(mer.k(), 25);
        assert_eq!(*count, 99);
    }

    #[test]
    fn test_invalid_count() {
        assert_single_error("ACGT notanumber\n");
    }

    #[test]
    fn test_invalid_kmer() {
        assert_single_error("ACGN 42\n");
    }

    #[test]
    fn test_malformed_line() {
        assert_single_error("justoneword\n");
    }

    #[test]
    fn test_read_with_extra_whitespace() {
        let parsed = records("ACGT   42\n");
        assert_eq!(parsed[0].1, 42);
    }
}