// jam_rs/reader.rs

use crate::bias::HashBiasTable;
use crate::format::{
    BUCKET_COUNT, BUCKET_TABLE_SIZE, BucketMeta, ENTRY_SIZE, Entry, FLAG_HAS_BIAS_TABLE,
    FormatError, HEADER_SIZE, Header, PAGE_SIZE, bucket_id,
};
use crate::writer::FILTER_DESCRIPTOR_SIZE;
use memmap2::{Mmap, MmapOptions};
use std::fs::File;
use std::io;
use std::path::Path;
use std::sync::Arc;
use xorf::{BinaryFuse8Ref, Filter, FilterRef};

// madvise-style page hints are only available on Unix targets.
#[cfg(unix)]
use memmap2::{Advice, UncheckedAdvice};

/// Errors produced while opening or querying a `.jam` index file.
#[derive(Debug, thiserror::Error)]
pub enum ReaderError {
    /// Underlying file or memory-mapping operation failed.
    #[error("I/O error: {0}")]
    Io(#[from] io::Error),

    /// The file header failed validation.
    #[error("Format error: {0}")]
    Format(#[from] FormatError),

    /// A bucket's filter section is malformed or truncated.
    #[error("Invalid filter data for bucket {bucket}: {message}")]
    InvalidFilter { bucket: usize, message: String },

    /// The file is shorter than a section its header claims to contain.
    #[error("File too small: expected at least {expected} bytes, got {actual}")]
    FileTooSmall { expected: usize, actual: usize },

    /// The sample-name or sample-size section is inconsistent with the header.
    #[error("Invalid sample data: {message}")]
    InvalidSampleData { message: String },
}

/// Summary statistics for an open `.jam` index, derived from the header,
/// the bucket table, and the file size.
#[derive(Debug, Clone)]
pub struct ReaderStats {
    /// Total number of (hash, sample) entries across all buckets.
    pub entry_count: u64,
    /// Number of distinct hashes stored.
    pub unique_hash_count: u64,
    /// Number of samples indexed.
    pub sample_count: u32,
    /// Size of the index file in bytes.
    pub file_size: u64,
    /// K-mer length recorded in the header.
    pub kmer_size: u8,
    /// Hash cutoff recorded in the header; interpolation search assumes
    /// stored hashes lie below this value.
    pub hash_threshold: u64,
    /// Entry count per bucket, one slot per bucket.
    pub bucket_entry_counts: [u64; BUCKET_COUNT],
    /// Whether the file embeds a hash-bias table.
    pub has_bias_table: bool,
}

/// Byte offsets (into the whole-file mmap) of one bucket's filter,
/// parsed once at open time by `parse_filter_meta`.
struct FilterMeta {
    /// Offset of the fixed-size filter descriptor.
    descriptor_offset: usize,
    /// Offset of the fingerprint array that follows the descriptor.
    fingerprints_offset: usize,
    /// Length in bytes of the fingerprint array.
    fingerprints_size: usize,
}

/// Borrowed view of one bucket's filter inside the reader's mmap.
/// Constructed by [`JamReader::bucket_filter`].
pub struct BucketFilter<'a> {
    // Full-file mapping; offsets in `meta` index into this slice.
    mmap: &'a [u8],
    meta: &'a FilterMeta,
}

58impl BucketFilter<'_> {
59    #[inline]
60    pub fn contains(&self, hash: &u64) -> bool {
61        let descriptor = &self.mmap
62            [self.meta.descriptor_offset..self.meta.descriptor_offset + FILTER_DESCRIPTOR_SIZE];
63        let fingerprints = &self.mmap[self.meta.fingerprints_offset
64            ..self.meta.fingerprints_offset + self.meta.fingerprints_size];
65        BinaryFuse8Ref::from_dma(descriptor, fingerprints).contains(hash)
66    }
67}
68
/// Filter layout for a standalone bucket mapping ([`BucketRegion`]);
/// offsets are relative to that region's own mmap, not the whole file.
struct CachedFilterMeta {
    /// Offset of the filter descriptor inside the region mapping.
    descriptor_start: usize,
    /// Descriptor length in bytes (always `FILTER_DESCRIPTOR_SIZE` once
    /// validated).
    descriptor_size: usize,
    /// Offset of the fingerprint array inside the region mapping.
    fingerprints_start: usize,
    /// Fingerprint array length in bytes.
    fingerprints_size: usize,
}

/// A private memory mapping covering a single bucket's filter and entries,
/// created by [`JamReader::open_bucket_region`].
pub struct BucketRegion {
    /// Page-aligned mapping of just this bucket's byte range.
    mmap: Mmap,
    /// Offset of the bucket's first byte inside `mmap` (alignment slack
    /// between the page boundary and the bucket's real start).
    data_offset: usize,
    /// Size in bytes of the serialized filter section (0 when absent).
    filter_size: usize,
    /// Number of `Entry` records following the filter.
    entry_count: usize,
    /// Parsed filter layout, or `None` when the bucket has no filter.
    filter_meta: Option<CachedFilterMeta>,
}

84impl BucketRegion {
85    #[inline]
86    pub fn filter_contains(&self, hash: &u64) -> bool {
87        let meta = match &self.filter_meta {
88            Some(m) => m,
89            None => return false,
90        };
91
92        let descriptor =
93            &self.mmap[meta.descriptor_start..meta.descriptor_start + meta.descriptor_size];
94        let fingerprints =
95            &self.mmap[meta.fingerprints_start..meta.fingerprints_start + meta.fingerprints_size];
96        BinaryFuse8Ref::from_dma(descriptor, fingerprints).contains(hash)
97    }
98
99    #[inline]
100    pub fn entries(&self) -> &[Entry] {
101        if self.entry_count == 0 {
102            return &[];
103        }
104        let start = self.data_offset + self.filter_size;
105        let end = start + self.entry_count * ENTRY_SIZE;
106        bytemuck::cast_slice(&self.mmap[start..end])
107    }
108
109    #[inline]
110    pub fn entry_count(&self) -> usize {
111        self.entry_count
112    }
113
114    #[inline]
115    pub fn is_empty(&self) -> bool {
116        self.filter_size == 0 && self.entry_count == 0
117    }
118}
119
/// Memory-mapped, read-only view of a `.jam` index file.
///
/// All lookups operate directly on the mapping; only the bucket table and
/// the sample metadata sections are copied out at open time.
pub struct JamReader {
    /// Shared handle kept alive so `open_bucket_region` can create
    /// additional mappings of the same file.
    file: Arc<File>,
    /// Read-only mapping of the entire file.
    mmap: Mmap,
    /// Validated copy of the file header.
    header: Header,
    /// Per-bucket offsets/sizes, copied out of the mapping.
    bucket_table: Vec<BucketMeta>,
    /// Parsed filter layout per bucket (`None` = bucket has no filter).
    filters: Vec<Option<FilterMeta>>,
    /// Optional embedded hash-bias table.
    bias_table: Option<Arc<HashBiasTable>>,
    /// One name per sample (synthesized `sample_N` when the file has none).
    sample_names: Vec<String>,
    /// One size value per sample (zeros when the file has none).
    sample_sizes: Vec<u64>,
}

131impl JamReader {
132    pub fn open<P: AsRef<Path>>(path: P) -> Result<Self, ReaderError> {
133        let file = Arc::new(File::open(path.as_ref())?);
134        let mmap = unsafe { Mmap::map(file.as_ref())? };
135
136        if mmap.len() < HEADER_SIZE {
137            return Err(ReaderError::FileTooSmall {
138                expected: HEADER_SIZE,
139                actual: mmap.len(),
140            });
141        }
142
143        let header: Header = *bytemuck::from_bytes(&mmap[..HEADER_SIZE]);
144        header.validate()?;
145
146        let table_end = HEADER_SIZE + BUCKET_TABLE_SIZE;
147        if mmap.len() < table_end {
148            return Err(ReaderError::FileTooSmall {
149                expected: table_end,
150                actual: mmap.len(),
151            });
152        }
153
154        let bucket_table: Vec<BucketMeta> =
155            bytemuck::cast_slice(&mmap[HEADER_SIZE..table_end]).to_vec();
156
157        let mut filters = Vec::with_capacity(BUCKET_COUNT);
158        for (i, meta) in bucket_table.iter().enumerate() {
159            if meta.filter_size == 0 {
160                filters.push(None);
161                continue;
162            }
163
164            let filter_meta = parse_filter_meta(&mmap, meta, i)?;
165            filters.push(Some(filter_meta));
166        }
167
168        let bias_table = if header.flags & FLAG_HAS_BIAS_TABLE != 0
169            && header.bias_table_offset > 0
170            && header.bias_table_size > 0
171        {
172            let offset = header.bias_table_offset as usize;
173            let size = header.bias_table_size as usize;
174            if offset + size > mmap.len() {
175                return Err(ReaderError::FileTooSmall {
176                    expected: offset + size,
177                    actual: mmap.len(),
178                });
179            }
180            let bias_data = &mmap[offset..offset + size];
181            let table =
182                HashBiasTable::from_bytes(bias_data).map_err(|e| ReaderError::InvalidFilter {
183                    bucket: 0,
184                    message: format!("Failed to parse embedded bias table: {}", e),
185                })?;
186            Some(Arc::new(table))
187        } else {
188            None
189        };
190
191        let sample_names = if header.sample_names_offset > 0 && header.sample_names_size > 0 {
192            let offset = header.sample_names_offset as usize;
193            let size = header.sample_names_size as usize;
194            if offset + size > mmap.len() {
195                return Err(ReaderError::FileTooSmall {
196                    expected: offset + size,
197                    actual: mmap.len(),
198                });
199            }
200            let names = parse_sample_names(&mmap[offset..offset + size], header.sample_count)?;
201            if names.len() != header.sample_count as usize {
202                return Err(ReaderError::InvalidSampleData {
203                    message: format!(
204                        "sample names count mismatch: got {}, expected {}",
205                        names.len(),
206                        header.sample_count
207                    ),
208                });
209            }
210            names
211        } else {
212            (0..header.sample_count)
213                .map(|i| format!("sample_{}", i))
214                .collect()
215        };
216
217        let sample_sizes = if header.sample_sizes_offset > 0 && header.sample_sizes_size > 0 {
218            let offset = header.sample_sizes_offset as usize;
219            let size = header.sample_sizes_size as usize;
220            let expected_size = header.sample_count as usize * 8;
221            if size != expected_size {
222                return Err(ReaderError::InvalidSampleData {
223                    message: format!(
224                        "sample sizes section size mismatch: got {}, expected {}",
225                        size, expected_size
226                    ),
227                });
228            }
229            if offset + size > mmap.len() {
230                return Err(ReaderError::FileTooSmall {
231                    expected: offset + size,
232                    actual: mmap.len(),
233                });
234            }
235            parse_sample_sizes(&mmap[offset..offset + size])
236        } else {
237            vec![0u64; header.sample_count as usize]
238        };
239
240        Ok(Self {
241            file,
242            mmap,
243            header,
244            bucket_table,
245            filters,
246            bias_table,
247            sample_names,
248            sample_sizes,
249        })
250    }
251
252    pub fn open_bucket_region(&self, bucket_idx: usize) -> Result<BucketRegion, ReaderError> {
253        let meta = &self.bucket_table[bucket_idx];
254
255        if meta.filter_size == 0 && meta.entry_count == 0 {
256            let empty_mmap = MmapOptions::new().len(1).map_anon()?.make_read_only()?;
257            return Ok(BucketRegion {
258                mmap: empty_mmap,
259                data_offset: 0,
260                filter_size: 0,
261                entry_count: 0,
262                filter_meta: None,
263            });
264        }
265
266        let region_start = meta.filter_offset as usize;
267        let data_size = meta.filter_size as usize + (meta.entry_count as usize) * ENTRY_SIZE;
268
269        let page_start = region_start & !(PAGE_SIZE - 1);
270        let data_offset = region_start - page_start;
271        let mmap_len = data_offset + data_size;
272
273        let mmap = unsafe {
274            MmapOptions::new()
275                .offset(page_start as u64)
276                .len(mmap_len)
277                .map(self.file.as_ref())?
278        };
279
280        #[cfg(unix)]
281        {
282            let _ = mmap.advise(Advice::Sequential);
283        }
284
285        let filter_meta = if meta.filter_size > 0 {
286            let filter_data_start = data_offset;
287            let filter_data =
288                &mmap[filter_data_start..filter_data_start + meta.filter_size as usize];
289
290            if filter_data.len() >= 8 {
291                let descriptor_size =
292                    u32::from_le_bytes(filter_data[0..4].try_into().unwrap()) as usize;
293                let fingerprints_size =
294                    u32::from_le_bytes(filter_data[4..8].try_into().unwrap()) as usize;
295
296                if descriptor_size != FILTER_DESCRIPTOR_SIZE {
297                    return Err(ReaderError::InvalidFilter {
298                        bucket: bucket_idx,
299                        message: format!(
300                            "unexpected descriptor size in bucket region: {} (expected {})",
301                            descriptor_size, FILTER_DESCRIPTOR_SIZE
302                        ),
303                    });
304                }
305
306                if filter_data.len() >= 8 + descriptor_size + fingerprints_size {
307                    Some(CachedFilterMeta {
308                        descriptor_start: filter_data_start + 8,
309                        descriptor_size,
310                        fingerprints_start: filter_data_start + 8 + descriptor_size,
311                        fingerprints_size,
312                    })
313                } else {
314                    return Err(ReaderError::InvalidFilter {
315                        bucket: bucket_idx,
316                        message: format!(
317                            "filter data truncated: need {} bytes, have {}",
318                            8 + descriptor_size + fingerprints_size,
319                            filter_data.len()
320                        ),
321                    });
322                }
323            } else {
324                return Err(ReaderError::InvalidFilter {
325                    bucket: bucket_idx,
326                    message: format!(
327                        "filter header too small: need 8 bytes, have {}",
328                        filter_data.len()
329                    ),
330                });
331            }
332        } else {
333            None
334        };
335
336        Ok(BucketRegion {
337            mmap,
338            data_offset,
339            filter_size: meta.filter_size as usize,
340            entry_count: meta.entry_count as usize,
341            filter_meta,
342        })
343    }
344
345    #[inline]
346    pub fn bucket_meta(&self, bucket_idx: usize) -> &BucketMeta {
347        &self.bucket_table[bucket_idx]
348    }
349
350    #[inline]
351    pub fn threshold(&self) -> u64 {
352        self.header.hash_threshold
353    }
354
355    #[inline]
356    pub fn kmer_size(&self) -> u8 {
357        self.header.kmer_size
358    }
359
360    #[inline]
361    pub fn bias_table(&self) -> Option<Arc<HashBiasTable>> {
362        self.bias_table.clone()
363    }
364
365    #[inline]
366    pub fn has_bias_table(&self) -> bool {
367        self.bias_table.is_some()
368    }
369
370    pub fn sample_names(&self) -> &[String] {
371        &self.sample_names
372    }
373
374    pub fn sample_name(&self, id: u32) -> Option<&str> {
375        self.sample_names.get(id as usize).map(|s| s.as_str())
376    }
377
378    pub fn sample_sizes(&self) -> &[u64] {
379        &self.sample_sizes
380    }
381
382    pub fn sample_size(&self, id: u32) -> Option<u64> {
383        self.sample_sizes.get(id as usize).copied()
384    }
385
386    pub fn stats(&self) -> ReaderStats {
387        let mut bucket_entry_counts = [0u64; BUCKET_COUNT];
388        for (i, meta) in self.bucket_table.iter().enumerate() {
389            bucket_entry_counts[i] = meta.entry_count;
390        }
391
392        ReaderStats {
393            entry_count: self.header.entry_count,
394            unique_hash_count: self.header.unique_hash_count,
395            sample_count: self.header.sample_count,
396            file_size: self.mmap.len() as u64,
397            kmer_size: self.header.kmer_size,
398            hash_threshold: self.header.hash_threshold,
399            bucket_entry_counts,
400            has_bias_table: self.bias_table.is_some(),
401        }
402    }
403
404    #[inline]
405    pub fn bucket_entries(&self, bucket_idx: usize) -> &[Entry] {
406        let meta = &self.bucket_table[bucket_idx];
407        if meta.entry_count == 0 {
408            return &[];
409        }
410
411        let start = meta.entry_offset as usize;
412        let end = start + (meta.entry_count as usize) * ENTRY_SIZE;
413        bytemuck::cast_slice(&self.mmap[start..end])
414    }
415
416    #[inline]
417    pub fn bucket_entry_byte_range(&self, bucket_idx: usize) -> (usize, usize) {
418        let meta = &self.bucket_table[bucket_idx];
419        let start = meta.entry_offset as usize;
420        let end = start + (meta.entry_count as usize) * ENTRY_SIZE;
421        (start, end)
422    }
423
424    #[inline]
425    pub fn bucket_filter_byte_range(&self, bucket_idx: usize) -> (usize, usize) {
426        let meta = &self.bucket_table[bucket_idx];
427        let start = meta.filter_offset as usize;
428        let end = start + meta.filter_size as usize;
429        (start, end)
430    }
431
432    #[cfg(unix)]
433    pub fn release_pages(&self, start: usize, end: usize) {
434        if start >= end {
435            return;
436        }
437        let page_start = start & !(PAGE_SIZE - 1);
438        let page_end = (end + PAGE_SIZE - 1) & !(PAGE_SIZE - 1);
439        let len = page_end.saturating_sub(page_start);
440        if len > 0 && page_end <= self.mmap.len() {
441            let _ = unsafe {
442                self.mmap
443                    .unchecked_advise_range(UncheckedAdvice::DontNeed, page_start, len)
444            };
445        }
446    }
447
448    #[cfg(not(unix))]
449    pub fn release_pages(&self, _start: usize, _end: usize) {
450    }
451
452    pub fn release_bucket(&self, bucket_idx: usize) {
453        let (filter_start, filter_end) = self.bucket_filter_byte_range(bucket_idx);
454        let (entry_start, entry_end) = self.bucket_entry_byte_range(bucket_idx);
455        self.release_pages(filter_start, filter_end);
456        self.release_pages(entry_start, entry_end);
457    }
458
459    #[cfg(unix)]
460    pub fn advise_random(&self) {
461        let _ = self.mmap.advise(Advice::Random);
462    }
463
464    #[cfg(not(unix))]
465    pub fn advise_random(&self) {
466    }
467
468    #[inline]
469    pub fn bucket_filter(&self, bucket_idx: usize) -> Option<BucketFilter<'_>> {
470        self.filters[bucket_idx].as_ref().map(|meta| BucketFilter {
471            mmap: &self.mmap,
472            meta,
473        })
474    }
475
476    #[inline]
477    pub fn contains(&self, hash: u64) -> bool {
478        let bucket_idx = bucket_id(hash);
479
480        if let Some(filter) = self.bucket_filter(bucket_idx) {
481            if !filter.contains(&hash) {
482                return false;
483            }
484        } else {
485            return false;
486        }
487
488        let entries = self.bucket_entries(bucket_idx);
489        self.interpolation_search(entries, hash).is_some()
490    }
491
492    #[inline]
493    pub fn search(&self, hash: u64) -> impl Iterator<Item = u32> + '_ {
494        let bucket_idx = bucket_id(hash);
495
496        let dominated = self
497            .bucket_filter(bucket_idx)
498            .is_some_and(|f| f.contains(&hash));
499
500        let entries = if dominated {
501            self.bucket_entries(bucket_idx)
502        } else {
503            &[]
504        };
505
506        let start = if entries.is_empty() {
507            0
508        } else {
509            self.interpolation_find_start(entries, hash)
510        };
511
512        entries[start..]
513            .iter()
514            .skip_while(move |e| e.hash < hash)
515            .take_while(move |e| e.hash == hash)
516            .map(|e| e.sample_id)
517    }
518
519    fn interpolation_search(&self, entries: &[Entry], key: u64) -> Option<usize> {
520        if entries.is_empty() {
521            return None;
522        }
523
524        let start = self.interpolation_find_start(entries, key);
525
526        for (i, entry) in entries[start..].iter().enumerate() {
527            if entry.hash == key {
528                return Some(start + i);
529            }
530            if entry.hash > key {
531                break;
532            }
533        }
534
535        None
536    }
537
538    #[inline]
539    fn interpolation_find_start(&self, entries: &[Entry], key: u64) -> usize {
540        let count = entries.len();
541        let threshold = self.threshold();
542
543        let est = ((key as u128 * count as u128) / threshold as u128) as usize;
544
545        let est = est.saturating_sub(16).min(count - 1);
546
547        if entries[est].hash > key {
548            let mut i = est;
549            while i > 0 && entries[i - 1].hash >= key {
550                i -= 1;
551            }
552            i
553        } else {
554            est
555        }
556    }
557}
558
559fn parse_sample_names(data: &[u8], count: u32) -> Result<Vec<String>, ReaderError> {
560    let mut names = Vec::with_capacity(count as usize);
561    let mut offset = 0;
562
563    for i in 0..count {
564        if offset + 2 > data.len() {
565            return Err(ReaderError::InvalidSampleData {
566                message: format!(
567                    "truncated sample names section: cannot read length for sample {}",
568                    i
569                ),
570            });
571        }
572        let len = u16::from_le_bytes(data[offset..offset + 2].try_into().unwrap()) as usize;
573        offset += 2;
574        if offset + len > data.len() {
575            return Err(ReaderError::InvalidSampleData {
576                message: format!(
577                    "truncated sample names section: cannot read name for sample {} (need {} bytes, have {})",
578                    i,
579                    len,
580                    data.len() - offset
581                ),
582            });
583        }
584        names.push(String::from_utf8_lossy(&data[offset..offset + len]).to_string());
585        offset += len;
586    }
587
588    Ok(names)
589}
590
/// Decodes the sample-size section: a packed array of `u64` little-endian
/// values, one per sample. Any trailing bytes shorter than 8 are ignored
/// (the caller validates the section length beforehand).
fn parse_sample_sizes(data: &[u8]) -> Vec<u64> {
    let mut sizes = Vec::with_capacity(data.len() / 8);
    for chunk in data.chunks_exact(8) {
        let mut raw = [0u8; 8];
        raw.copy_from_slice(chunk);
        sizes.push(u64::from_le_bytes(raw));
    }
    sizes
}

597fn parse_filter_meta(
598    mmap: &Mmap,
599    meta: &BucketMeta,
600    bucket_idx: usize,
601) -> Result<FilterMeta, ReaderError> {
602    let start = meta.filter_offset as usize;
603    let end = start + meta.filter_size as usize;
604
605    if end > mmap.len() {
606        return Err(ReaderError::InvalidFilter {
607            bucket: bucket_idx,
608            message: format!(
609                "filter extends beyond file: {}..{} > {}",
610                start,
611                end,
612                mmap.len()
613            ),
614        });
615    }
616
617    let data = &mmap[start..end];
618
619    if data.len() < 8 {
620        return Err(ReaderError::InvalidFilter {
621            bucket: bucket_idx,
622            message: "filter data too small for header".to_string(),
623        });
624    }
625
626    let descriptor_size = u32::from_le_bytes(data[0..4].try_into().unwrap()) as usize;
627    let fingerprints_size = u32::from_le_bytes(data[4..8].try_into().unwrap()) as usize;
628
629    if descriptor_size != FILTER_DESCRIPTOR_SIZE {
630        return Err(ReaderError::InvalidFilter {
631            bucket: bucket_idx,
632            message: format!(
633                "unexpected descriptor size: {} (expected {})",
634                descriptor_size, FILTER_DESCRIPTOR_SIZE
635            ),
636        });
637    }
638
639    let expected_size = 8 + descriptor_size + fingerprints_size;
640    if data.len() < expected_size {
641        return Err(ReaderError::InvalidFilter {
642            bucket: bucket_idx,
643            message: format!("filter data too small: {} < {}", data.len(), expected_size),
644        });
645    }
646
647    Ok(FilterMeta {
648        descriptor_offset: start + 8,
649        fingerprints_offset: start + 8 + descriptor_size,
650        fingerprints_size,
651    })
652}
653
#[cfg(test)]
mod tests {
    use super::*;
    use crate::writer::{BuildConfig, build};
    use std::io::Write;
    use tempfile::NamedTempFile;

    /// Writes the given (name, sequence) pairs as a temporary FASTA file.
    fn make_fasta(seqs: &[(&str, &str)]) -> NamedTempFile {
        let mut f = NamedTempFile::with_suffix(".fa").unwrap();
        for (name, seq) in seqs {
            writeln!(f, ">{name}").unwrap();
            writeln!(f, "{seq}").unwrap();
        }
        f
    }

    #[test]
    fn test_reader_open() {
        let input = make_fasta(&[("seq1", "ATCGATCGATCGATCGATCGATCGATCGATCG")]);
        let output_dir = tempfile::tempdir().unwrap();
        let output_path = output_dir.path().join("test.jam");

        let config = BuildConfig {
            kmer_size: 11,
            fscale: 1,
            num_threads: 2,
            memory: 1,
            ..Default::default()
        };

        build(&[input.path().to_path_buf()], &output_path, &config).unwrap();

        let reader = JamReader::open(&output_path).unwrap();
        let stats = reader.stats();

        assert!(stats.entry_count > 0);
        assert_eq!(stats.sample_count, 1);
        assert_eq!(stats.kmer_size, 11);
    }

    #[test]
    fn test_reader_search() {
        let input = make_fasta(&[("seq1", "ATCGATCGATCGATCGATCGATCGATCGATCG")]);
        let output_dir = tempfile::tempdir().unwrap();
        let output_path = output_dir.path().join("test.jam");

        let config = BuildConfig {
            kmer_size: 11,
            fscale: 1, // Keep all hashes
            num_threads: 1,
            memory: 1,
            ..Default::default()
        };

        build(&[input.path().to_path_buf()], &output_path, &config).unwrap();

        let reader = JamReader::open(&output_path).unwrap();

        // A stored hash must be found by both `contains` and `search`.
        let entries = reader.bucket_entries(0);
        if !entries.is_empty() {
            let test_hash = entries[0].hash;
            assert!(reader.contains(test_hash));

            let samples: Vec<_> = reader.search(test_hash).collect();
            assert!(!samples.is_empty());
        }
    }

    #[test]
    fn test_reader_nonexistent_hash() {
        let input = make_fasta(&[("seq1", "ATCGATCGATCGATCGATCGATCGATCGATCG")]);
        let output_dir = tempfile::tempdir().unwrap();
        let output_path = output_dir.path().join("test.jam");

        let config = BuildConfig {
            kmer_size: 11,
            fscale: 1000, // Keep only ~0.1% of hashes
            num_threads: 1,
            memory: 1,
            ..Default::default()
        };

        build(&[input.path().to_path_buf()], &output_path, &config).unwrap();

        let reader = JamReader::open(&output_path).unwrap();

        // u64::MAX - 1 is far above any retained hash at fscale 1000.
        let fake_hash = u64::MAX - 1;
        assert!(!reader.contains(fake_hash));

        let samples: Vec<_> = reader.search(fake_hash).collect();
        assert!(samples.is_empty());
    }

    #[test]
    fn test_reader_multiple_samples() {
        let input = make_fasta(&[
            ("seq1", "ATCGATCGATCGATCGATCGATCGATCGATCG"),
            ("seq2", "ATCGATCGATCGATCGATCGATCGATCGATCG"),
        ]);
        let output_dir = tempfile::tempdir().unwrap();
        let output_path = output_dir.path().join("test.jam");

        let config = BuildConfig {
            kmer_size: 11,
            fscale: 1,
            singleton: true, // Each sequence is a separate sample
            num_threads: 1,
            memory: 1,
            ..Default::default()
        };

        build(&[input.path().to_path_buf()], &output_path, &config).unwrap();

        let reader = JamReader::open(&output_path).unwrap();
        assert_eq!(reader.stats().sample_count, 2);

        for bucket_idx in 0..BUCKET_COUNT {
            let entries = reader.bucket_entries(bucket_idx);
            if entries.len() >= 2 {
                let test_hash = entries[0].hash;
                let samples: Vec<_> = reader.search(test_hash).collect();
                if samples.len() == 2 {
                    // Both samples hold the identical sequence, so a shared
                    // hash must map to BOTH sample ids. The previous
                    // `contains(&0) || contains(&1)` was vacuously true for
                    // any pair of ids drawn from {0, 1}.
                    assert!(samples.contains(&0) && samples.contains(&1));
                    return;
                }
            }
        }
    }

    #[test]
    fn test_reader_bucket_entries() {
        let input = make_fasta(&[("seq1", "ATCGATCGATCGATCGATCGATCGATCGATCG")]);
        let output_dir = tempfile::tempdir().unwrap();
        let output_path = output_dir.path().join("test.jam");

        let config = BuildConfig {
            kmer_size: 11,
            fscale: 1,
            num_threads: 1,
            memory: 1,
            ..Default::default()
        };

        build(&[input.path().to_path_buf()], &output_path, &config).unwrap();

        let reader = JamReader::open(&output_path).unwrap();

        // Every bucket must be sorted, and every entry must hash into the
        // bucket that stores it.
        for bucket_idx in 0..BUCKET_COUNT {
            let entries = reader.bucket_entries(bucket_idx);
            for window in entries.windows(2) {
                assert!(
                    window[0] <= window[1],
                    "Entries not sorted in bucket {}",
                    bucket_idx
                );
            }

            for entry in entries {
                assert_eq!(bucket_id(entry.hash), bucket_idx);
            }
        }
    }
}