1pub mod binary;
62pub mod error;
63pub mod header;
64pub mod matrix;
65pub mod mer;
66pub mod query;
67pub mod string_mers;
68pub mod text;
69
70pub use binary::BinaryReader;
72pub use error::{Error, Result};
73pub use header::FileHeader;
74pub use matrix::RectangularBinaryMatrix;
75pub use mer::MerDna;
76pub use query::QueryMerFile;
77pub use string_mers::{StringMers, string_canonicals, string_mers};
78pub use text::TextReader;
79
80use std::fs::File;
81use std::io::BufReader;
82use std::path::Path;
83
/// A mer file opened for sequential reading, in one of the two formats that
/// [`ReadMerFile::open`] recognises.
///
/// Each variant wraps the format-specific reader over a buffered handle to
/// the file; iterating a `ReadMerFile` forwards to the wrapped reader.
pub enum ReadMerFile {
    /// Reader chosen when the header format equals `header::FORMAT_BINARY_SORTED`.
    Binary(BinaryReader<BufReader<File>>),
    /// Reader chosen when the header format equals `header::FORMAT_TEXT_SORTED`.
    Text(TextReader<BufReader<File>>),
}
106
107impl ReadMerFile {
108 pub fn open<P: AsRef<Path>>(path: P) -> Result<Self> {
112 let file = File::open(path.as_ref())?;
113 let mut reader = BufReader::new(file);
114 let header = FileHeader::read(&mut reader)?;
115
116 match header.format() {
117 header::FORMAT_BINARY_SORTED => {
118 let binary = BinaryReader::new(reader, &header)?;
119 Ok(ReadMerFile::Binary(binary))
120 }
121 header::FORMAT_TEXT_SORTED => {
122 let text = TextReader::new(reader);
123 Ok(ReadMerFile::Text(text))
124 }
125 other => Err(Error::UnsupportedFormat(other.to_string())),
126 }
127 }
128}
129
130impl Iterator for ReadMerFile {
131 type Item = Result<(MerDna, u64)>;
132
133 fn next(&mut self) -> Option<Self::Item> {
134 match self {
135 ReadMerFile::Binary(r) => r.next(),
136 ReadMerFile::Text(r) => r.next(),
137 }
138 }
139}
140
141#[cfg(test)]
142mod integration_tests {
143 use super::*;
144 use std::io::Write;
145 use tempfile::NamedTempFile;
146
147 fn create_binary_jf(k: usize, val_len: usize, records: &[(&str, u64)]) -> NamedTempFile {
149 let key_len_bits = k * 2;
150 let key_len_bytes = (key_len_bits + 7) / 8;
151
152 let mut sorted_records: Vec<(MerDna, u64)> = records
153 .iter()
154 .map(|(s, c)| (s.parse::<MerDna>().unwrap(), *c))
155 .collect();
156 sorted_records.sort_by(|a, b| a.0.cmp(&b.0));
157
158 let size = sorted_records.len().next_power_of_two().max(2);
159 let json = serde_json::json!({
160 "format": "binary/sorted",
161 "key_len": key_len_bits,
162 "val_len": val_len,
163 "counter_len": val_len,
164 "size": size,
165 "canonical": false
166 });
167 let json_str = serde_json::to_string(&json).unwrap();
168
169 let mut file = NamedTempFile::new().unwrap();
170 write!(file, "{:09}", json_str.len()).unwrap();
171 file.write_all(json_str.as_bytes()).unwrap();
172
173 for (mer, count) in &sorted_records {
174 let words = mer.words();
175 let mut bytes_written = 0;
176 for &word in words {
177 for byte_idx in 0..8 {
178 if bytes_written >= key_len_bytes {
179 break;
180 }
181 file.write_all(&[(word >> (byte_idx * 8)) as u8]).unwrap();
182 bytes_written += 1;
183 }
184 }
185 for i in 0..val_len {
186 file.write_all(&[(count >> (i * 8)) as u8]).unwrap();
187 }
188 }
189 file.flush().unwrap();
190 file
191 }
192
193 fn create_text_jf(k: usize, records: &[(&str, u64)]) -> NamedTempFile {
195 let json = serde_json::json!({
196 "format": "text/sorted",
197 "key_len": k * 2,
198 "counter_len": 8
199 });
200 let json_str = serde_json::to_string(&json).unwrap();
201
202 let mut file = NamedTempFile::new().unwrap();
203 write!(file, "{:09}", json_str.len()).unwrap();
204 file.write_all(json_str.as_bytes()).unwrap();
205
206 for (seq, count) in records {
207 writeln!(file, "{seq} {count}").unwrap();
208 }
209 file.flush().unwrap();
210 file
211 }
212
/// A binary file is recognised from its header and both records come back
/// with their counts, whatever order the reader yields them in.
#[test]
fn test_read_binary_auto_detect() {
    let file = create_binary_jf(4, 4, &[("ACGT", 42), ("AAAA", 10)]);

    let entries: Vec<(String, u64)> = ReadMerFile::open(file.path())
        .unwrap()
        .map(|record| {
            let (mer, count) = record.unwrap();
            (mer.to_string(), count)
        })
        .collect();
    assert_eq!(entries.len(), 2);

    // Look records up by sequence so the check does not depend on output order.
    let count_of = |wanted: &str| {
        entries
            .iter()
            .find(|(seq, _)| seq == wanted)
            .map(|(_, count)| *count)
    };
    assert_eq!(count_of("ACGT"), Some(42));
    assert_eq!(count_of("AAAA"), Some(10));
}
232
/// A text file is recognised from its header and its records are yielded in
/// the order they appear in the file.
#[test]
fn test_read_text_auto_detect() {
    let file = create_text_jf(4, &[("ACGT", 42), ("AAAA", 10)]);

    let entries: Vec<(String, u64)> = ReadMerFile::open(file.path())
        .unwrap()
        .map(|record| {
            let (mer, count) = record.unwrap();
            (mer.to_string(), count)
        })
        .collect();

    let expected = vec![("ACGT".to_string(), 42u64), ("AAAA".to_string(), 10u64)];
    assert_eq!(entries, expected);
}
245
/// Every record produced by sequential reading must be found, with the same
/// count, by a keyed lookup through `QueryMerFile` on the same file.
#[test]
fn test_binary_and_query_consistent() {
    let file = create_binary_jf(
        4,
        4,
        &[
            ("AAAA", 10),
            ("ACGT", 42),
            ("CCCC", 7),
            ("GGGG", 33),
            ("TTTT", 100),
        ],
    );

    let sequential: Vec<(MerDna, u64)> = ReadMerFile::open(file.path())
        .unwrap()
        .map(|record| record.unwrap())
        .collect();
    let index = QueryMerFile::open(file.path()).unwrap();

    sequential.iter().for_each(|(mer, count)| {
        let looked_up = index.get(mer);
        assert_eq!(
            looked_up,
            Some(*count),
            "mismatch for {}: sequential={}, query={:?}",
            mer,
            count,
            looked_up
        );
    });
}
274
/// A single mer survives a write/read round trip for a range of mer lengths,
/// including lengths whose keys need more than eight bytes (k = 33).
#[test]
fn test_roundtrip_various_k() {
    for k in [1, 2, 4, 8, 16, 25, 31, 32, 33] {
        // Repeating "ACGT" pattern cut to exactly k bases.
        let seq: String = (0..k).map(|i| ['A', 'C', 'G', 'T'][i % 4]).collect();
        let file = create_binary_jf(k, 4, &[(seq.as_str(), 1)]);

        let decoded: Vec<(String, u64)> = ReadMerFile::open(file.path())
            .unwrap()
            .map(|record| {
                let (mer, count) = record.unwrap();
                (mer.to_string(), count)
            })
            .collect();

        assert_eq!(decoded.len(), 1, "k={k}");
        let (text, count) = &decoded[0];
        assert_eq!(*text, seq, "k={k}");
        assert_eq!(*count, 1, "k={k}");
    }
}
287
/// A binary file holding only a header opens successfully and yields nothing.
#[test]
fn test_empty_binary_file() {
    let file = create_binary_jf(4, 4, &[]);
    let yielded = ReadMerFile::open(file.path()).unwrap().count();
    assert_eq!(yielded, 0);
}
295
/// A text file holding only a header opens successfully and yields nothing.
#[test]
fn test_empty_text_file() {
    let file = create_text_jf(4, &[]);
    let yielded = ReadMerFile::open(file.path()).unwrap().count();
    assert_eq!(yielded, 0);
}
303
/// With an 8-byte value field the largest `u64` count is read back unchanged.
#[test]
fn test_large_counts() {
    let file = create_binary_jf(4, 8, &[("ACGT", u64::MAX)]);
    let counts: Vec<u64> = ReadMerFile::open(file.path())
        .unwrap()
        .map(|record| record.unwrap().1)
        .collect();
    assert_eq!(counts[0], u64::MAX);
}
311
/// All 64 possible 3-mers, each with a distinct count, are readable both
/// sequentially and through keyed lookup.
#[test]
fn test_many_records() {
    let alphabet = ['A', 'C', 'G', 'T'];
    // Enumerate AAA, AAC, ..., TTT: the first base varies slowest.
    let kmers: Vec<String> = (0..64usize)
        .map(|i| {
            [alphabet[i / 16], alphabet[(i / 4) % 4], alphabet[i % 4]]
                .iter()
                .collect()
        })
        .collect();
    // Counts are 1-based positions in the enumeration above.
    let records_with_counts: Vec<(&str, u64)> =
        kmers.iter().map(String::as_str).zip(1u64..).collect();

    let file = create_binary_jf(3, 4, &records_with_counts);

    let yielded = ReadMerFile::open(file.path())
        .unwrap()
        .map(|record| record.unwrap())
        .count();
    assert_eq!(yielded, 64);

    let index = QueryMerFile::open(file.path()).unwrap();
    for (seq, expected_count) in &records_with_counts {
        let mer: MerDna = seq.parse().unwrap();
        assert_eq!(
            index.get(&mer),
            Some(*expected_count),
            "query mismatch for {seq}"
        );
    }
}
342}