// sshash_lib/builder/minimizer_tuples.rs
1//! Minimizer tuple computation for the build pipeline
2//!
3//! This module extracts (minimizer, position, count) tuples from the
4//! encoded strings, which are then used to build the sparse/skew index.
5//!
6//! **Important**: MinimizerTuple is used ONLY during index construction and is
7//! NOT stored in the final dictionary. It's an intermediate representation that
8//! gets converted into the sparse/skew index structure.
9//!
10//! ## Parallelism
11//!
12//! Minimizer tuple extraction is parallelized across SPSS strings using rayon.
13//! Each string is processed independently (super-k-mers never cross string
14//! boundaries), producing a local `Vec<MinimizerTuple>`. Results are collected
15//! and sorted in parallel via `par_sort_unstable()`.
16//!
17//! ## External Sorting
18//!
19//! For large datasets that exceed RAM limits, external sorting is available via
20//! [`compute_minimizer_tuples_external`]. This follows the C++ implementation:
21//! - Each thread has a RAM-bounded buffer
22//! - When buffer fills → parallel sort + flush to temp binary file
23//! - After all tuples → k-way merge of temp files using loser tree
24
25use std::cmp::Ordering;
26use std::sync::Arc;
27
28use rayon::prelude::*;
29use tracing::info;
30
31use crate::{
32 kmer::{Kmer, KmerBits},
33 minimizer::MinimizerIterator,
34 spectrum_preserving_string_set::SpectrumPreservingStringSet,
35};
36use super::config::BuildConfiguration;
37use super::external_sort::{ExternalSorter, MinimizerTupleExternal, GIB};
38
/// One (minimizer, position) occurrence produced during index construction.
///
/// This is the fundamental indexing unit. Tuples of this type exist only
/// while the dictionary is being built; they are converted into the
/// sparse/skew index and never stored in the final structure.
///
/// # Canonical Mode Orientation Handling
///
/// In canonical mode, when a k-mer's reverse-complement minimizer is smaller
/// than the forward one, the RC minimizer is kept BUT `pos_in_kmer` is
/// remapped into the **forward** k-mer frame:
///
/// ```text
/// If RC minimizer is chosen:
///   pos_in_kmer = (k - m) - original_rc_pos
/// ```
///
/// Orientation is thus encoded implicitly — no separate flag is needed, and
/// the minimizer position always refers to the forward k-mer representation.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct MinimizerTuple {
    /// The minimizer value (m-mer)
    pub minimizer: u64,

    /// Absolute position of the minimizer in the concatenated SPSS, in bases
    /// (i.e. string offset + k-mer position + offset within the k-mer)
    pub pos_in_seq: u64,

    /// Offset of the minimizer inside the k-mer (0 to k-m)
    ///
    /// In canonical mode this is ALWAYS relative to the forward k-mer,
    /// even when the RC minimizer was chosen (adjusted accordingly).
    pub pos_in_kmer: u8,

    /// Number of consecutive k-mers sharing this minimizer (super-k-mer size)
    pub num_kmers_in_super_kmer: u8,
}
74
75impl MinimizerTuple {
76 /// Create a new minimizer tuple
77 pub fn new(minimizer: u64, pos_in_seq: u64, pos_in_kmer: u8) -> Self {
78 Self {
79 minimizer,
80 pos_in_seq,
81 pos_in_kmer,
82 num_kmers_in_super_kmer: 1,
83 }
84 }
85}
86
87impl PartialOrd for MinimizerTuple {
88 fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
89 Some(self.cmp(other))
90 }
91}
92
93impl Ord for MinimizerTuple {
94 fn cmp(&self, other: &Self) -> Ordering {
95 // Primary sort by minimizer value
96 match self.minimizer.cmp(&other.minimizer) {
97 Ordering::Equal => {
98 // Secondary sort by position in sequence
99 self.pos_in_seq.cmp(&other.pos_in_seq)
100 }
101 other => other,
102 }
103 }
104}
105
106/// Compute minimizer tuples from an encoded SPSS (parallelized with rayon)
107///
108/// This function extracts all k-mers from the SPSS, computes their minimizers
109/// (with canonical mode handling), and produces sorted, coalesced MinimizerTuples.
110///
111/// # Parallelism
112///
113/// Strings are processed in parallel via rayon. Each string is independent:
114/// super-k-mers never cross string boundaries, so per-string coalescing is
115/// correct. The resulting tuples are merged and sorted in parallel using
116/// `par_sort_unstable()`.
117///
118/// The rayon thread pool size is controlled by the caller (typically
119/// `DictionaryBuilder` installs a pool sized to `config.num_threads`).
120///
121/// # Canonical Mode Orientation
122///
123/// When canonical mode is enabled, for each k-mer we compute both forward and
124/// reverse-complement minimizers. If the RC minimizer is smaller, we use it BUT
125/// adjust the position: `pos_in_kmer = (k - m) - rc_pos`. This ensures pos_in_kmer
126/// always represents the position relative to the forward k-mer.
127///
128/// # Returns
129///
130/// A sorted vector of MinimizerTuples, with consecutive k-mers sharing the same
131/// minimizer coalesced into super-k-mers (num_kmers_in_super_kmer > 1).
132///
133/// IMPORTANT: Coalescing happens DURING extraction (while iterating k-mers in sequence),
134/// not after sorting. This matches the C++ implementation exactly.
135pub fn compute_minimizer_tuples<const K: usize>(
136 spss: &SpectrumPreservingStringSet,
137 config: &BuildConfiguration,
138) -> Vec<MinimizerTuple>
139where
140 Kmer<K>: KmerBits,
141{
142 let k = K;
143 let m = config.m;
144
145 if k <= m {
146 panic!("k must be > m (k={}, m={})", k, m);
147 }
148
149 let num_strings = spss.num_strings() as usize;
150
151 // Process strings in parallel — each string produces locally coalesced tuples.
152 // rayon's flat_map_iter collects per-string Vec results efficiently.
153 let mut tuples: Vec<MinimizerTuple> = (0..num_strings)
154 .into_par_iter()
155 .flat_map_iter(|string_id| {
156 extract_tuples_for_string::<K>(spss, string_id as u64, k, m, config)
157 })
158 .collect();
159
160 // Parallel sort by (minimizer, pos_in_seq) — exactly like C++
161 // This is the only sorting step; coalescing has already happened during extraction
162 tuples.par_sort_unstable();
163
164 tuples
165}
166
167/// Extract minimizer tuples from a single string in the SPSS
168///
169/// Processes one string sequentially, maintaining MinimizerIterator state
170/// for efficient sliding-window minimizer computation. Coalesces consecutive
171/// k-mers sharing the same minimizer into super-k-mers inline.
172fn extract_tuples_for_string<const K: usize>(
173 spss: &SpectrumPreservingStringSet,
174 string_id: u64,
175 k: usize,
176 m: usize,
177 config: &BuildConfiguration,
178) -> Vec<MinimizerTuple>
179where
180 Kmer<K>: KmerBits,
181{
182 let string_len = spss.string_length(string_id);
183
184 if string_len < k {
185 return Vec::new();
186 }
187
188 let num_kmers = string_len - k + 1;
189 let mut tuples = Vec::new();
190
191 // CREATE MINIMIZER ITERATOR ONCE PER STRING (matching C++ behavior!)
192 // Maintains state across k-mers for efficient super-k-mer detection
193 let mut minimizer_iter = MinimizerIterator::with_seed(k, m, config.seed);
194
195 // Initialize iterator with absolute string start position (matching C++ behavior)
196 let string_begin = spss.string_offset(string_id);
197 minimizer_iter.set_position(string_begin);
198
199 // COALESCING DURING EXTRACTION - matching C++ behavior exactly
200 // Track the previous minimizer info to detect when it changes
201 let mut prev_mini_tuple: Option<MinimizerTuple> = None;
202 let mut num_kmers_in_super_kmer = 0u8;
203
204 for kmer_pos in 0..num_kmers {
205 // Decode k-mer from SPSS
206 let kmer = spss.decode_kmer::<K>(string_id, kmer_pos);
207
208 // Compute forward minimizer using PERSISTENT iterator state
209 let forward_minimizer = minimizer_iter.next(kmer);
210
211 let (final_minimizer, final_pos_in_kmer) = if config.canonical {
212 // Compute RC minimizer using a FRESH iterator (not sequential).
213 // RC k-mers slide in the opposite direction (new base at front,
214 // not at end), so the sequential sliding optimization gives wrong
215 // results. A fresh iterator always does a full rescan, matching
216 // the query-time behavior exactly.
217 let kmer_rc = kmer.reverse_complement();
218 let mut fresh_rc_iter = MinimizerIterator::with_seed(k, m, config.seed);
219 let rc_minimizer = fresh_rc_iter.next(kmer_rc);
220
221 // Choose smaller minimizer (compare by value, which is deterministic)
222 if rc_minimizer.value < forward_minimizer.value {
223 // RC minimizer wins - adjust position to forward k-mer frame
224 let adjusted_pos = (k - m) as u8 - rc_minimizer.pos_in_kmer as u8;
225 (rc_minimizer.value, adjusted_pos)
226 } else {
227 (forward_minimizer.value, forward_minimizer.pos_in_kmer as u8)
228 }
229 } else {
230 // Non-canonical mode: just use forward minimizer
231 (forward_minimizer.value, forward_minimizer.pos_in_kmer as u8)
232 };
233
234 // Store absolute position of minimizer in concatenated SPSS
235 // (matching C++ decoded_offsets approach)
236 // absolute_pos = string_begin + kmer_pos + pos_in_kmer
237 // During lookup, we recover: kmer_start = absolute_pos - pos_in_kmer
238 let absolute_pos_in_seq = string_begin + kmer_pos as u64 + final_pos_in_kmer as u64;
239
240 let current_mini = MinimizerTuple {
241 minimizer: final_minimizer,
242 pos_in_seq: absolute_pos_in_seq,
243 pos_in_kmer: final_pos_in_kmer,
244 num_kmers_in_super_kmer: 1, // Will be updated during coalescing
245 };
246
247 // Check if this minimizer matches the previous one
248 // Per C++ code: check only minimizer and pos_in_seq (NOT pos_in_kmer)
249 // See compute_minimizer_tuples.cpp line 99-107
250 if let Some(prev) = prev_mini_tuple.take() {
251 if current_mini.minimizer == prev.minimizer && current_mini.pos_in_seq == prev.pos_in_seq {
252 // Same minimizer occurrence - part of same super-k-mer
253 num_kmers_in_super_kmer += 1;
254 prev_mini_tuple = Some(prev);
255 } else {
256 // Different minimizer - save previous and start a new one
257 let mut saved = prev;
258 saved.num_kmers_in_super_kmer = num_kmers_in_super_kmer;
259 tuples.push(saved);
260
261 // Start tracking the new minimizer
262 prev_mini_tuple = Some(current_mini);
263 num_kmers_in_super_kmer = 1;
264 }
265 } else {
266 // First k-mer in this string
267 prev_mini_tuple = Some(current_mini);
268 num_kmers_in_super_kmer = 1;
269 }
270 }
271
272 // Don't forget to save the last accumulated tuple for this string
273 if let Some(mut last) = prev_mini_tuple.take() {
274 last.num_kmers_in_super_kmer = num_kmers_in_super_kmer;
275 tuples.push(last);
276 }
277
278 tuples
279}
280
/// Estimate memory needed for in-memory tuple processing.
///
/// Returns the estimated number of bytes needed to hold all tuples in RAM.
/// Heuristic: ~1 tuple per k-mer on average (conservative), 24 bytes per
/// tuple (the aligned size of `MinimizerTuple`: u64 + u64 + u8 + u8, padded),
/// doubled to account for temporary space used while sorting.
///
/// Uses saturating arithmetic so an absurdly large `total_kmers` yields
/// `u64::MAX` instead of overflowing (wrap in release, panic in debug).
pub fn estimate_memory_bytes(total_kmers: u64) -> u64 {
    // Aligned size of a MinimizerTuple in Rust.
    const BYTES_PER_TUPLE: u64 = 24;
    // Sorting needs roughly 2x temporary space.
    const SORT_SCRATCH_FACTOR: u64 = 2;
    total_kmers.saturating_mul(BYTES_PER_TUPLE * SORT_SCRATCH_FACTOR)
}
290
291/// Check if external sorting is needed based on estimated data size and RAM limit
292///
293/// Returns false if ram_limit_gib is 0 (meaning unlimited/in-memory)
294pub fn needs_external_sorting(total_kmers: u64, ram_limit_gib: usize) -> bool {
295 if ram_limit_gib == 0 {
296 return false; // 0 means unlimited, use in-memory sorting
297 }
298 let estimated_bytes = estimate_memory_bytes(total_kmers);
299 let ram_bytes = (ram_limit_gib as u64) * (GIB as u64);
300 estimated_bytes > ram_bytes
301}
302
303/// Compute minimizer tuples using external sorting (RAM-bounded)
304///
305/// This follows the C++ implementation precisely:
306/// - Each thread partition has its own buffer (sized by RAM limit)
307/// - When buffer fills → parallel sort + flush to temp binary file
308/// - After all strings processed → k-way merge of temp files
309///
310/// Use this for large datasets that exceed available RAM.
311///
312/// # Arguments
313/// * `spss` - The spectrum-preserving string set
314/// * `config` - Build configuration (includes RAM limit, threads, tmp dir)
315///
316/// # Returns
317/// Sorted vector of MinimizerTuples (loaded from merged temp file).
318/// For large datasets, prefer [`compute_minimizer_tuples_external_mmap`] which
319/// returns an mmap'd view instead of materializing all tuples.
320pub fn compute_minimizer_tuples_external<const K: usize>(
321 spss: &SpectrumPreservingStringSet,
322 config: &BuildConfiguration,
323) -> Result<Vec<MinimizerTuple>, std::io::Error>
324where
325 Kmer<K>: KmerBits,
326{
327 let sorter = run_external_sort::<K>(spss, config)?;
328 sorter.read_merged_tuples()
329}
330
331/// Compute minimizer tuples using external sorting, returning a file handle.
332///
333/// Like [`compute_minimizer_tuples_external`], but returns a [`FileTuples`]
334/// instead of materializing all tuples in memory. This is the production path
335/// for large datasets — tuples are accessed sequentially via buffered I/O.
336pub fn compute_minimizer_tuples_external_file<const K: usize>(
337 spss: &SpectrumPreservingStringSet,
338 config: &BuildConfiguration,
339) -> Result<super::external_sort::FileTuples, std::io::Error>
340where
341 Kmer<K>: KmerBits,
342{
343 let sorter = run_external_sort::<K>(spss, config)?;
344 sorter.open_merged_file()
345}
346
/// Run the external sort pipeline (shared by the materialized and file paths).
///
/// Partitions the SPSS strings across `num_threads` scoped OS threads, each
/// of which extracts/coalesces tuples into a RAM-bounded buffer and spills
/// sorted runs to temp files (see `process_string_range_external`), then
/// performs the k-way merge.
///
/// Returns the `ExternalSorter` after the merge is complete, so callers can
/// either materialize the merged tuples or open the merged file.
///
/// # Panics
///
/// Panics if `K <= config.m`, or if a worker thread panics.
///
/// # Errors
///
/// Propagates I/O errors from sorter construction, buffer flushes in worker
/// threads, or the final merge.
fn run_external_sort<const K: usize>(
    spss: &SpectrumPreservingStringSet,
    config: &BuildConfiguration,
) -> Result<ExternalSorter, std::io::Error>
where
    Kmer<K>: KmerBits,
{
    let k = K;
    let m = config.m;

    // A minimizer must be strictly shorter than a k-mer.
    if k <= m {
        panic!("k must be > m (k={}, m={})", k, m);
    }

    // num_threads == 0 means "use whatever rayon's global pool has".
    let num_threads = if config.num_threads == 0 {
        rayon::current_num_threads()
    } else {
        config.num_threads
    };

    // Arc so each scoped worker thread can hold a reference to the sorter.
    let sorter = Arc::new(ExternalSorter::new(
        &config.tmp_dirname,
        config.ram_limit_gib,
        num_threads,
        config.verbose,
    )?);

    // Per-thread buffer capacity derived from the RAM limit.
    let buffer_size = sorter.buffer_size_per_thread();
    info!(
        "External sorting: {} threads, {} GiB RAM limit, {} tuples/buffer",
        num_threads, config.ram_limit_gib, buffer_size
    );

    let num_strings = spss.num_strings() as usize;
    // Ceiling division: the last partition may be smaller.
    let num_strings_per_thread = num_strings.div_ceil(num_threads);

    // Process strings in thread partitions, each with its own buffer.
    // This matches C++ behavior exactly. std::thread::scope lets workers
    // borrow `spss`/`config` without 'static bounds.
    std::thread::scope(|scope| {
        let mut handles = Vec::with_capacity(num_threads);

        for t in 0..num_threads {
            let begin = t * num_strings_per_thread;
            // More threads than partitions: remaining threads get no work.
            if begin >= num_strings {
                break;
            }
            let end = (begin + num_strings_per_thread).min(num_strings);

            let sorter = Arc::clone(&sorter);
            let handle = scope.spawn(move || {
                process_string_range_external::<K>(
                    spss,
                    begin..end,
                    k,
                    m,
                    config,
                    &sorter,
                    buffer_size,
                )
            });
            handles.push(handle);
        }

        // Wait for all threads; the first worker I/O error aborts via `?`.
        for handle in handles {
            handle.join().expect("Thread panicked")?;
        }

        Ok::<(), std::io::Error>(())
    })?;

    // Merge all temp files (k-way merge with loser tree).
    let _merge_result = sorter.merge()?;

    // Unwrap the Arc — all worker clones were dropped when the scope ended,
    // so we should be the only holder at this point.
    Arc::try_unwrap(sorter).map_err(|_| {
        std::io::Error::other("ExternalSorter still has multiple owners")
    })
}
429
/// Process a range of strings, flushing to disk when the buffer fills.
///
/// Per-thread worker for the external-sort path. Extraction and coalescing
/// mirror `extract_tuples_for_string`, except completed tuples go into a
/// RAM-bounded `buffer` that is sorted and spilled to a temp file via
/// `sorter.sort_and_flush` whenever it reaches `buffer_size`.
///
/// # Errors
///
/// Propagates any I/O error raised while flushing the buffer to disk.
fn process_string_range_external<const K: usize>(
    spss: &SpectrumPreservingStringSet,
    string_range: std::ops::Range<usize>,
    k: usize,
    m: usize,
    config: &BuildConfiguration,
    sorter: &ExternalSorter,
    buffer_size: usize,
) -> Result<(), std::io::Error>
where
    Kmer<K>: KmerBits,
{
    // One buffer per worker thread; capacity fixed by the RAM budget.
    let mut buffer: Vec<MinimizerTupleExternal> = Vec::with_capacity(buffer_size);

    for string_id in string_range {
        let string_len = spss.string_length(string_id as u64);

        // Strings shorter than k contain no k-mers.
        if string_len < k {
            continue;
        }

        let num_kmers = string_len - k + 1;

        // Create minimizer iterator for this string; its state persists
        // across consecutive k-mers for incremental minimizer computation.
        let mut minimizer_iter = MinimizerIterator::with_seed(k, m, config.seed);
        let string_begin = spss.string_offset(string_id as u64);
        minimizer_iter.set_position(string_begin);

        // Coalescing state: current run's tuple and its k-mer count so far.
        let mut prev_mini: Option<MinimizerTupleExternal> = None;
        let mut num_kmers_in_super_kmer: u8 = 0;

        for kmer_pos in 0..num_kmers {
            let kmer = spss.decode_kmer::<K>(string_id as u64, kmer_pos);
            let forward_minimizer = minimizer_iter.next(kmer);

            let (final_minimizer, final_pos_in_kmer) = if config.canonical {
                // RC minimizer needs a fresh full rescan: RC k-mers slide in
                // the opposite direction, so the incremental iterator state
                // would give wrong results.
                let kmer_rc = kmer.reverse_complement();
                let mut fresh_rc_iter = MinimizerIterator::with_seed(k, m, config.seed);
                let rc_minimizer = fresh_rc_iter.next(kmer_rc);

                if rc_minimizer.value < forward_minimizer.value {
                    // RC wins — remap position into the forward k-mer frame.
                    let adjusted_pos = (k - m) as u8 - rc_minimizer.pos_in_kmer as u8;
                    (rc_minimizer.value, adjusted_pos)
                } else {
                    (forward_minimizer.value, forward_minimizer.pos_in_kmer as u8)
                }
            } else {
                (forward_minimizer.value, forward_minimizer.pos_in_kmer as u8)
            };

            // Absolute offset of the minimizer in the concatenated SPSS.
            let absolute_pos_in_seq = string_begin + kmer_pos as u64 + final_pos_in_kmer as u64;

            let current = MinimizerTupleExternal {
                minimizer: final_minimizer,
                pos_in_seq: absolute_pos_in_seq,
                pos_in_kmer: final_pos_in_kmer,
                num_kmers_in_super_kmer: 1,
            };

            // Coalescing logic (matching C++: check minimizer and pos_in_seq only)
            // pos_in_kmer changes with each consecutive k-mer in a super-k-mer,
            // so we must NOT compare it here — we keep the pos_in_kmer from the
            // first k-mer in the run (stored in `prev`).
            if let Some(mut prev) = prev_mini.take() {
                if current.minimizer == prev.minimizer
                    && current.pos_in_seq == prev.pos_in_seq
                {
                    // Same super-k-mer — extend the run (saturating: no
                    // overflow panic in debug builds on very long runs).
                    num_kmers_in_super_kmer = num_kmers_in_super_kmer.saturating_add(1);
                    prev_mini = Some(prev);
                } else {
                    // Different occurrence — flush the previous run's tuple.
                    prev.num_kmers_in_super_kmer = num_kmers_in_super_kmer;

                    // Spill a full buffer BEFORE pushing, so capacity never grows.
                    if buffer.len() >= buffer_size {
                        let mut buf = std::mem::take(&mut buffer);
                        sorter.sort_and_flush(&mut buf)?;
                        buffer = buf; // Reuse the allocation
                    }
                    buffer.push(prev);

                    prev_mini = Some(current);
                    num_kmers_in_super_kmer = 1;
                }
            } else {
                // First k-mer in this string starts the first run.
                prev_mini = Some(current);
                num_kmers_in_super_kmer = 1;
            }
        }

        // Flush the last accumulated tuple for this string.
        if let Some(mut last) = prev_mini.take() {
            last.num_kmers_in_super_kmer = num_kmers_in_super_kmer;

            if buffer.len() >= buffer_size {
                let mut buf = std::mem::take(&mut buffer);
                sorter.sort_and_flush(&mut buf)?;
                buffer = buf;
            }
            buffer.push(last);
        }
    }

    // Flush whatever remains in the buffer after the last string.
    if !buffer.is_empty() {
        let mut buf = buffer;
        sorter.sort_and_flush(&mut buf)?;
    }

    Ok(())
}
544
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_minimizer_tuple_creation() {
        // A fresh tuple carries its inputs and a super-k-mer count of 1.
        let t = MinimizerTuple::new(12345, 100, 5);
        assert_eq!(t.minimizer, 12345);
        assert_eq!(t.pos_in_seq, 100);
        assert_eq!(t.pos_in_kmer, 5);
        assert_eq!(t.num_kmers_in_super_kmer, 1);
    }

    #[test]
    fn test_minimizer_tuple_ordering() {
        let small = MinimizerTuple::new(100, 50, 0);
        let large = MinimizerTuple::new(200, 50, 0);
        let later = MinimizerTuple::new(100, 100, 0);

        // The minimizer value dominates the ordering...
        assert!(small < large);
        assert!(large > small);
        // ...and ties are broken by pos_in_seq.
        assert!(small < later);
    }

    #[test]
    fn test_compute_minimizer_tuples_basic() {
        use crate::builder::config::BuildConfiguration;
        use crate::builder::encode::Encoder;

        // One sequence of exactly 31 bases → exactly one 31-mer.
        let config = BuildConfiguration::new(31, 13).unwrap();
        let mut encoder = Encoder::<31>::new();
        let sequence = "ACGTACGTACGTACGTACGTACGTACGTACG";
        encoder.add_sequence(sequence.as_bytes()).unwrap();
        let spss = encoder.build(config.m);

        let tuples = compute_minimizer_tuples::<31>(&spss, &config);

        // A single k-mer yields a single tuple, never coalesced.
        assert_eq!(tuples.len(), 1);
        let only = &tuples[0];
        assert!(only.pos_in_seq < sequence.len() as u64);
        assert_eq!(only.num_kmers_in_super_kmer, 1);
    }

    #[test]
    fn test_compute_minimizer_tuples_canonical() {
        use crate::builder::config::BuildConfiguration;
        use crate::builder::encode::Encoder;

        // Same input as the basic test, but with canonical mode enabled.
        let mut config = BuildConfiguration::new(31, 13).unwrap();
        config.canonical = true;
        let mut encoder = Encoder::<31>::new();
        let sequence = "ACGTACGTACGTACGTACGTACGTACGTACG";
        encoder.add_sequence(sequence.as_bytes()).unwrap();
        let spss = encoder.build(config.m);

        let tuples = compute_minimizer_tuples::<31>(&spss, &config);

        assert!(!tuples.is_empty());

        // Structural check: even when the RC minimizer wins and the position
        // is remapped, pos_in_kmer must stay within the forward k-mer frame.
        // (Exact values depend on the hash seed.)
        for tuple in &tuples {
            assert!(tuple.pos_in_kmer <= (31 - 13) as u8);
        }
    }
}
621}