Skip to main content

fgumi_lib/sort/
mod.rs

1//! High-performance BAM sorting module.
2//!
3//! This module provides efficient BAM file sorting with support for multiple sort orders:
4//! - **Template-coordinate**: Groups paired-end reads by template position (for `fgumi group`)
5//! - **Queryname**: Groups reads by read name (for `fgumi zipper`)
6//! - **Coordinate**: Standard genomic coordinate order (for IGV, `fgumi review`)
7//!
8//! # Performance Features
9//!
10//! - **External merge-sort**: Handles BAM files larger than available RAM via spill-to-disk
11//! - **Lazy decoding**: Only parses fields needed for sort key extraction
12//! - **Parallel sorting**: Uses rayon for in-memory parallel sort
13//! - **Buffer recycling**: Reuses buffers via channel-based allocation patterns
14//! - **Fast compression**: Uses libdeflate for temporary file compression
15//!
16//! # Architecture
17//!
18//! The sorting process follows this pipeline:
19//!
20//! 1. **Read phase**: Stream BAM records, extract sort keys lazily
21//! 2. **Accumulate phase**: Buffer records until memory limit reached
22//! 3. **Sort phase**: Parallel sort in-memory records using rayon
23//! 4. **Spill phase**: Compress and write sorted chunk to temp file
24//! 5. **Merge phase**: K-way merge of sorted temp files using min-heap
25
26use std::path::Path;
27
28use anyhow::{Context, Result};
29use tempfile::TempDir;
30
31pub use fgumi_raw_bam as bam_fields;
32pub mod external;
33pub mod inline_buffer;
34pub mod keys;
35pub mod pipeline;
36pub mod radix;
37pub mod raw;
38pub mod raw_bam_reader;
39pub mod read_ahead;
40
/// Size handed to each `BufReader` used during the merge phase (64 KiB).
const MERGE_BUFFER_SIZE: usize = 1 << 16;
43
/// Statistics from a sort operation.
///
/// Every counter starts at zero via [`Default`]. The type is `Clone` and
/// comparable (`PartialEq`/`Eq`) so callers can snapshot the stats and assert on
/// them as a single value.
#[derive(Clone, Debug, Default, PartialEq, Eq)]
pub struct SortStats {
    /// Total records read from input.
    pub total_records: u64,
    /// Records written to output.
    pub output_records: u64,
    /// Number of temporary chunk files written.
    pub chunks_written: usize,
}
54
55/// Create a temporary directory for sort spill files.
56fn create_temp_dir(base: Option<&Path>) -> Result<TempDir> {
57    match base {
58        Some(base) => {
59            std::fs::create_dir_all(base)?;
60            TempDir::new_in(base).context("Failed to create temp directory")
61        }
62        None => TempDir::new().context("Failed to create temp directory"),
63    }
64}
65
66pub use external::ExternalSorter;
67pub use inline_buffer::{TemplateKey, extract_coordinate_key_inline};
68pub use keys::{
69    CoordinateKey, PA_TAG, PrimaryAlignmentInfo, QuerynameKey, RawCoordinateKey, RawQuerynameKey,
70    RawSortKey, SortContext, SortKey, SortOrder,
71};
72pub use pipeline::{ParallelMergeConfig, parallel_merge, parallel_merge_buffered};
73pub use raw::{LibraryLookup, RawExternalSorter, extract_template_key_inline};
74
#[cfg(test)]
mod tests {
    use super::*;

    /// With no base path given, the spill directory must still exist on disk.
    #[test]
    fn test_create_temp_dir_default() {
        let spill = create_temp_dir(None).unwrap();
        assert!(spill.path().exists());
    }

    /// With a base path that does not exist yet, the spill directory is created
    /// somewhere beneath that base.
    #[test]
    fn test_create_temp_dir_with_base() {
        let root = tempfile::tempdir().unwrap();
        let base = root.path().join("sort_spill");
        let spill = create_temp_dir(Some(base.as_path())).unwrap();
        let spill_path = spill.path();
        assert!(spill_path.exists());
        assert!(spill_path.starts_with(&base));
    }

    /// A default-constructed `SortStats` has every counter at zero.
    #[test]
    fn test_sort_stats_default() {
        let SortStats { total_records, output_records, chunks_written } = SortStats::default();
        assert_eq!(total_records, 0);
        assert_eq!(output_records, 0);
        assert_eq!(chunks_written, 0);
    }
}