caps_sa/
sample_sort.rs

1//! In-memory CaPS-SA-style suffix array construction.
2//!
3//! Phase 1 of the port: a parallel merge-sort with LCP-enhanced two-way merge,
4//! exactly the inner sorting kernel of upstream CaPS-SA's `Suffix_Array::merge`
5//! and `Suffix_Array::merge_sort` (see `include/Suffix_Array.hpp` and
6//! `src/Suffix_Array.cpp`). The sample-sort partitioning around this kernel
7//! (`select_pivots` → `distribute_sub_subarrays` → `merge_sub_subarrays`) is
8//! Phase 2 / 3 work; the kernel here already produces a correct LCP-annotated
9//! suffix array, and `rayon::join` gives parallel divide for free.
10//!
11//! The LCP-enhanced merge maintains:
12//!
13//! * `m` — the LCP between the last-output element and the current top of the
14//!   *other* stream.
15//! * `l_a` = `lcp_a[i_a]` — the LCP between the current top of the
16//!   last-output stream and its immediate predecessor (which is the
17//!   last-output element).
18//!
19//! Three cases per step:
20//!
21//! * `l_a > m`: the next candidate from the last-output stream agrees with the
22//!   last-output element past where the other stream diverged — it lies on the
23//!   same side of the other stream's top as the last-output element did, so it
24//!   wins. No symbol comparison needed.
25//! * `l_a < m`: the next candidate diverges from the last-output element
26//!   *inside* the prefix shared with the other stream's top. Since the stream
27//!   is sorted, the new candidate is larger than its predecessor; at the
28//!   divergence offset it therefore exceeds the other stream's top — the
29//!   other stream wins. No symbol comparison needed.
30//! * `l_a == m`: undetermined; extend the LCP from offset `m` by an actual
31//!   symbol scan and compare.
32
33use crate::Index;
34use crate::lcp::{LcpDispatch, Symbol};
35use crate::limits::{LimitProvider, PlainText};
36use rayon::join;
37
38/// Tunable options for SA construction.
39#[derive(Clone, Debug)]
40pub struct Opts {
41    /// Bound on extension comparisons inside the merge. `usize::MAX` (default)
42    /// is unbounded — required for full lexicographic correctness when the
43    /// caller's text doesn't guarantee comparisons terminate via sentinels
44    /// within a known window.
45    pub max_context: usize,
46}
47
48impl Default for Opts {
49    fn default() -> Self {
50        Self {
51            max_context: usize::MAX,
52        }
53    }
54}
55
56/// Build the suffix array of `text` in memory and return it.
57///
58/// Generic over the symbol type `S` (`Ord + Copy`, e.g. `u8`, `u16`, `u32`)
59/// and the index type `I` (`u32`, `u64`, `usize`). Pick the narrowest `I`
60/// that can hold `text.len()`.
61///
62/// Produces a *standard lexicographic* suffix array. The "shorter suffix is
63/// smaller when one runs off the end of `text`" tie-break is applied — i.e.
64/// the algorithm behaves as if `text` is followed by an implicit symbol
65/// smaller than all of `S`.
66pub fn build_in_memory<S, I>(text: &[S]) -> Vec<I>
67where
68    S: Symbol,
69    I: Index,
70{
71    build_in_memory_with_opts(text, &Opts::default())
72}
73
74/// Variant of [`build_in_memory`] that accepts tuning options.
75pub fn build_in_memory_with_opts<S, I>(text: &[S], opts: &Opts) -> Vec<I>
76where
77    S: Symbol,
78    I: Index,
79{
80    build_in_memory_with(text, &PlainText::new(text.len()), opts)
81}
82
83/// Variant of [`build_in_memory`] that accepts a [`LimitProvider`].
84/// With [`PlainText`] this is identical to [`build_in_memory`]; with
85/// [`SegmentedText`][crate::limits::SegmentedText] the LCP scans stop
86/// at segment boundaries.
87pub fn build_in_memory_with<S, I, L>(text: &[S], lp: &L, opts: &Opts) -> Vec<I>
88where
89    S: Symbol,
90    I: Index,
91    L: LimitProvider,
92{
93    let n = text.len();
94    let positions: Vec<I> = (0..n).map(I::from_usize).collect();
95    build_in_memory_for_positions_with(text, positions, lp, opts)
96}
97
98/// Sort the caller-supplied `positions` by the lexicographic order of
99/// their suffixes in `text`. Returns the positions reordered so that
100/// `text[output[i]..]` is the i-th smallest suffix among the input set.
101///
102/// Equivalent to [`build_in_memory`] for the special case
103/// `positions = (0..text.len()).collect()`; the explicit-positions form
104/// lets callers skip suffixes they don't want included in the sort —
105/// e.g. STAR-style genome indexing where only ACGT-starting positions
106/// participate in the SA, avoiding the O(n) work of sorting and then
107/// discarding the spacer-starting positions inside bin-padding.
108///
109/// The suffix at each position is still the slice `text[position..]`;
110/// no positions are dropped from the input. To filter, the caller
111/// constructs `positions` with only the indices they want.
112pub fn build_in_memory_for_positions<S, I>(text: &[S], positions: Vec<I>) -> Vec<I>
113where
114    S: Symbol,
115    I: Index,
116{
117    build_in_memory_for_positions_with_opts(text, positions, &Opts::default())
118}
119
120/// Variant of [`build_in_memory_for_positions`] that accepts tuning options.
121pub fn build_in_memory_for_positions_with_opts<S, I>(
122    text: &[S],
123    positions: Vec<I>,
124    opts: &Opts,
125) -> Vec<I>
126where
127    S: Symbol,
128    I: Index,
129{
130    build_in_memory_for_positions_with(text, positions, &PlainText::new(text.len()), opts)
131}
132
133/// Variant of [`build_in_memory_for_positions`] that accepts both a
134/// [`LimitProvider`] (for segmented LCP truncation) and tuning options.
135/// With [`PlainText`] this is identical to
136/// [`build_in_memory_for_positions_with_opts`].
137pub fn build_in_memory_for_positions_with<S, I, L>(
138    text: &[S],
139    positions: Vec<I>,
140    lp: &L,
141    opts: &Opts,
142) -> Vec<I>
143where
144    S: Symbol,
145    I: Index,
146    L: LimitProvider,
147{
148    let n = positions.len();
149    if n == 0 {
150        return Vec::new();
151    }
152
153    let mut sa: Vec<I> = positions;
154    let mut sa_w: Vec<I> = vec![I::zero(); n];
155    let mut lcp_arr: Vec<I> = vec![I::zero(); n];
156    let mut lcp_w: Vec<I> = vec![I::zero(); n];
157
158    // Choose the LCP implementation once for the whole build; the captured
159    // function pointer travels through the recursion in a register, so the
160    // inner merge loop pays no atomic load or feature-detection branch.
161    let dispatch = LcpDispatch::detect();
162
163    merge_sort(
164        text,
165        lp,
166        &mut sa,
167        &mut sa_w,
168        &mut lcp_arr,
169        &mut lcp_w,
170        opts.max_context,
171        dispatch,
172    );
173
174    sa
175}
176
177/// Recursive merge-sort with LCP maintenance.
178///
179/// Pre: `sa.len() == sa_w.len() == lcp_arr.len() == lcp_w.len()`. The contents
180/// of `sa` are the suffix positions to sort (typically an identity
181/// permutation at the top level). All other buffers are scratch / output.
182///
183/// Post: `sa` is sorted in ascending lexicographic order on
184/// `text[sa[i]..]`; `lcp_arr[0] = 0` and `lcp_arr[i] = lcp(text[sa[i-1]..],
185/// text[sa[i]..])` for `i >= 1`.
186///
187/// Visible to the rest of the crate so the external-memory path can sort
188/// individual subarrays of positions using the same kernel.
189#[allow(clippy::too_many_arguments)] // 4 buffers + text + lp + ctx + dispatch
190pub(crate) fn merge_sort<S, I, L>(
191    text: &[S],
192    lp: &L,
193    sa: &mut [I],
194    sa_w: &mut [I],
195    lcp_arr: &mut [I],
196    lcp_w: &mut [I],
197    max_ctx: usize,
198    dispatch: LcpDispatch,
199) where
200    S: Symbol,
201    I: Index,
202    L: LimitProvider,
203{
204    let n = sa.len();
205    debug_assert_eq!(sa_w.len(), n);
206    debug_assert_eq!(lcp_arr.len(), n);
207    debug_assert_eq!(lcp_w.len(), n);
208
209    if n <= 1 {
210        if n == 1 {
211            lcp_arr[0] = I::zero();
212        }
213        return;
214    }
215
216    let mid = n / 2;
217    let (sa_l, sa_r) = sa.split_at_mut(mid);
218    let (sa_w_l, sa_w_r) = sa_w.split_at_mut(mid);
219    let (lcp_l, lcp_r) = lcp_arr.split_at_mut(mid);
220    let (lcp_w_l, lcp_w_r) = lcp_w.split_at_mut(mid);
221
222    join(
223        || merge_sort(text, lp, sa_l, sa_w_l, lcp_l, lcp_w_l, max_ctx, dispatch),
224        || merge_sort(text, lp, sa_r, sa_w_r, lcp_r, lcp_w_r, max_ctx, dispatch),
225    );
226
227    // Merge the two sorted halves (still living in `sa`) into the workspace,
228    // then copy the workspace back into the destination so the caller's
229    // postcondition holds on `sa` / `lcp_arr`.
230    merge(
231        text, lp, sa_l, sa_r, lcp_l, lcp_r, sa_w, lcp_w, max_ctx, dispatch,
232    );
233    sa.copy_from_slice(sa_w);
234    lcp_arr.copy_from_slice(lcp_w);
235}
236
237/// LCP-enhanced two-way merge of two sorted suffix arrays.
238///
239/// `x` / `lcp_x` and `y` / `lcp_y` must each be sorted with `lcp_*[0] == 0`
240/// and `lcp_*[i] = lcp(arr[i-1], arr[i])` for `i >= 1`. The result is written
241/// into `z` / `lcp_z` (length `x.len() + y.len()`).
242///
243/// Visible to the rest of the crate so the external-memory path can cascade
244/// 2-way merges across each partition's sub-subarrays during Phase 4.
245#[allow(clippy::too_many_arguments)] // CaPS-SA's merge takes 5 buffers + text + lp + ctx + dispatch
246pub(crate) fn merge<S, I, L>(
247    text: &[S],
248    lp: &L,
249    x: &[I],
250    y: &[I],
251    lcp_x: &[I],
252    lcp_y: &[I],
253    z: &mut [I],
254    lcp_z: &mut [I],
255    max_ctx: usize,
256    dispatch: LcpDispatch,
257) where
258    S: Symbol,
259    I: Index,
260    L: LimitProvider,
261{
262    let len_x = x.len();
263    let len_y = y.len();
264    debug_assert_eq!(z.len(), len_x + len_y);
265    debug_assert_eq!(lcp_z.len(), len_x + len_y);
266
267    if len_x == 0 {
268        z.copy_from_slice(y);
269        lcp_z.copy_from_slice(lcp_y);
270        return;
271    }
272    if len_y == 0 {
273        z.copy_from_slice(x);
274        lcp_z.copy_from_slice(lcp_x);
275        return;
276    }
277
278    // The "swap-on-output-from-B" trick from upstream CaPS-SA: we always
279    // label the stream we last output from as `A`, and the other as `B`. On
280    // entry no output has been produced yet, but the convention is consistent
281    // because both LCP arrays satisfy `lcp_*[0] == 0` and `m` starts at 0 —
282    // the first iteration falls into the `l_a == m` branch and computes the
283    // first comparison from scratch.
284    let mut arr_a: &[I] = x;
285    let mut arr_b: &[I] = y;
286    let mut lcp_a: &[I] = lcp_x;
287    let mut lcp_b: &[I] = lcp_y;
288    let mut len_a = len_x;
289    let mut len_b = len_y;
290    let mut i_a: usize = 0;
291    let mut i_b: usize = 0;
292    let mut m: usize = 0;
293    let mut k: usize = 0;
294    let mut lim_a_cache: Option<(usize, usize)> = None;
295    let mut lim_b_cache: Option<(usize, usize)> = None;
296
297    while i_a < len_a && i_b < len_b {
298        let l_a = lcp_a[i_a].to_usize();
299
300        // (output_a, lcp_for_output, new_m)
301        let (output_a, lcp_for_output, new_m) = if l_a > m {
302            (true, l_a, m)
303        } else if l_a < m {
304            (false, m, l_a)
305        } else {
306            // Tied — extend by an actual symbol scan from offset m.
307            let p_a = arr_a[i_a].to_usize();
308            let p_b = arr_b[i_b].to_usize();
309            // `lim_a` / `lim_b` are the per-suffix logical lengths from
310            // the `LimitProvider`. With `PlainText` these fold to
311            // `n_text - p` (the same expression the pre-LimitProvider
312            // code computed inline); with `SegmentedText` they cap at
313            // the next segment boundary so the LCP scan stops there.
314            let lim_a = match lim_a_cache {
315                Some((idx, lim)) if idx == i_a => lim,
316                _ => {
317                    let lim = lp.lim_at(p_a);
318                    lim_a_cache = Some((i_a, lim));
319                    lim
320                }
321            };
322            let lim_b = match lim_b_cache {
323                Some((idx, lim)) if idx == i_b => lim,
324                _ => {
325                    let lim = lp.lim_at(p_b);
326                    lim_b_cache = Some((i_b, lim));
327                    lim
328                }
329            };
330            // Pass an already-segmentation-aware cap to the SIMD LCP so
331            // it doesn't have to scan past the boundary. For PlainText
332            // this is equivalent to the LCP function's own `n - p`
333            // intersection — no extra work.
334            let cap = lim_a.min(lim_b).min(max_ctx);
335            let remaining_ctx = cap.saturating_sub(m);
336            let ext = dispatch.lcp(text, p_a + m, p_b + m, remaining_ctx);
337            let total = m + ext;
338            let a_smaller = if total < lim_a && total < lim_b {
339                text[p_a + total] < text[p_b + total]
340            } else {
341                // One or both suffixes ran off the end of their
342                // segment (or hit `max_ctx`). Defer to the
343                // [`LimitProvider`]'s boundary-ordering convention —
344                // for [`PlainText`] / standard `SegmentedText` this
345                // is `lim_a.cmp(&lim_b)` (shorter-is-smaller, the
346                // generalised-SA convention); custom impls can flip
347                // it (e.g. STAR's `spacer-as-largest`).
348                lp.boundary_order(p_a, lim_a, p_b, lim_b).is_lt()
349            };
350            (a_smaller, m, total)
351        };
352
353        if output_a {
354            z[k] = arr_a[i_a];
355            lcp_z[k] = I::from_usize(lcp_for_output);
356            i_a += 1;
357            lim_a_cache = None;
358        } else {
359            z[k] = arr_b[i_b];
360            lcp_z[k] = I::from_usize(lcp_for_output);
361            i_b += 1;
362            lim_b_cache = None;
363            // Outputting from B: swap labels so the next iteration's
364            // "last-output stream" is what was B.
365            std::mem::swap(&mut arr_a, &mut arr_b);
366            std::mem::swap(&mut lcp_a, &mut lcp_b);
367            std::mem::swap(&mut len_a, &mut len_b);
368            std::mem::swap(&mut i_a, &mut i_b);
369            std::mem::swap(&mut lim_a_cache, &mut lim_b_cache);
370        }
371        m = new_m;
372        k += 1;
373    }
374
375    // Drain the surviving stream. The first drained element carries the
376    // boundary LCP (`m`) connecting it to the last cross-stream output;
377    // subsequent drained elements use the source LCP array unchanged.
378    drain(arr_a, lcp_a, i_a, len_a, z, lcp_z, &mut k, m);
379    drain(arr_b, lcp_b, i_b, len_b, z, lcp_z, &mut k, m);
380}
381
382#[inline]
383#[allow(clippy::too_many_arguments)] // drain handles both source streams via labelled args
384fn drain<I: Index>(
385    arr: &[I],
386    lcp_src: &[I],
387    mut i: usize,
388    len: usize,
389    z: &mut [I],
390    lcp_z: &mut [I],
391    k: &mut usize,
392    boundary_m: usize,
393) {
394    let mut first = true;
395    while i < len {
396        z[*k] = arr[i];
397        lcp_z[*k] = if first {
398            I::from_usize(boundary_m)
399        } else {
400            lcp_src[i]
401        };
402        first = false;
403        i += 1;
404        *k += 1;
405    }
406}
407
408#[cfg(test)]
409mod tests {
410    use super::*;
411
412    /// Brute-force reference suffix array via `sort_by` over byte slices.
413    fn brute_force_sa(text: &[u8]) -> Vec<u32> {
414        let mut sa: Vec<u32> = (0..text.len() as u32).collect();
415        sa.sort_by(|&a, &b| text[a as usize..].cmp(&text[b as usize..]));
416        sa
417    }
418
419    fn assert_matches_brute(text: &[u8]) {
420        let got: Vec<u32> = build_in_memory(text);
421        let want = brute_force_sa(text);
422        assert_eq!(got, want, "mismatch on text {text:?}");
423    }
424
425    #[test]
426    fn empty_text() {
427        let sa: Vec<u32> = build_in_memory::<u8, u32>(&[]);
428        assert!(sa.is_empty());
429    }
430
431    #[test]
432    fn single_symbol() {
433        let sa: Vec<u32> = build_in_memory(&[7u8]);
434        assert_eq!(sa, vec![0]);
435    }
436
437    #[test]
438    fn banana() {
439        assert_matches_brute(b"banana");
440    }
441
442    #[test]
443    fn mississippi() {
444        assert_matches_brute(b"mississippi");
445    }
446
447    #[test]
448    fn small_distinct_sentinel() {
449        // Alphabet 0..=5 with a unique terminator. Models the
450        // sentinel-transformed STAR text on a tiny example.
451        let text: Vec<u8> = vec![0, 1, 2, 0, 1, 5, 0, 2, 1, 6];
452        let got: Vec<u32> = build_in_memory(&text);
453        let want = brute_force_sa(&text);
454        assert_eq!(got, want);
455    }
456
457    #[test]
458    fn random_byte_texts() {
459        use rand::{RngExt, SeedableRng};
460        let mut rng = rand::rngs::StdRng::seed_from_u64(0xC0FFEE);
461        for &n in &[1usize, 2, 3, 7, 33, 200, 1000] {
462            let text: Vec<u8> = (0..n).map(|_| rng.random_range(0..6u8)).collect();
463            let got: Vec<u32> = build_in_memory(&text);
464            let want = brute_force_sa(&text);
465            assert_eq!(got, want, "mismatch on random text len={n}");
466        }
467    }
468
469    #[test]
470    fn for_positions_full_set_matches_build_in_memory() {
471        // Same output as build_in_memory when positions is the identity.
472        let text = b"banana";
473        let want: Vec<u32> = build_in_memory(text);
474        let positions: Vec<u32> = (0..text.len() as u32).collect();
475        let got = build_in_memory_for_positions(text, positions);
476        assert_eq!(got, want);
477    }
478
479    #[test]
480    fn for_positions_subset_matches_brute_force() {
481        // Sort only the even positions of "mississippi" by their
482        // suffixes; verify against brute force.
483        let text = b"mississippi";
484        let positions: Vec<u32> = (0..text.len() as u32).step_by(2).collect();
485        let mut want = positions.clone();
486        want.sort_by(|&a, &b| text[a as usize..].cmp(&text[b as usize..]));
487        let got = build_in_memory_for_positions(text, positions);
488        assert_eq!(got, want);
489    }
490
491    #[test]
492    fn for_positions_random_subsets() {
493        use rand::{RngExt, SeedableRng};
494        let mut rng = rand::rngs::StdRng::seed_from_u64(0xFEED);
495        for &n in &[33usize, 200, 1000] {
496            let text: Vec<u8> = (0..n).map(|_| rng.random_range(0..6u8)).collect();
497            // Random subset of positions.
498            let mut positions: Vec<u32> = (0..n as u32).collect();
499            // Drop a random ~30%.
500            positions.retain(|_| rng.random_range(0..10) < 7);
501            let mut want = positions.clone();
502            want.sort_by(|&a, &b| text[a as usize..].cmp(&text[b as usize..]));
503            let got = build_in_memory_for_positions(&text, positions);
504            assert_eq!(got, want, "subset sort mismatch n={n}");
505        }
506    }
507
508    #[test]
509    fn random_with_unique_terminator() {
510        // Distinct large terminator at the end — mimics the transform we'll
511        // apply for STAR.
512        use rand::{RngExt, SeedableRng};
513        let mut rng = rand::rngs::StdRng::seed_from_u64(0xBEEF);
514        for &n in &[1usize, 50, 500] {
515            let mut text: Vec<u8> = (0..n).map(|_| rng.random_range(0..5u8)).collect();
516            text.push(250); // unique max
517            let got: Vec<u32> = build_in_memory(&text);
518            let want = brute_force_sa(&text);
519            assert_eq!(got, want);
520        }
521    }
522
523    // ---- segmented SA tests ----
524
525    use crate::limits::SegmentedText;
526
527    /// Compare two suffixes under the segmented comparator (LCP
528    /// truncated at the boundary, "shorter-is-smaller" tie-break).
529    fn segmented_cmp(text: &[u8], lp: &SegmentedText, a: usize, b: usize) -> std::cmp::Ordering {
530        use crate::limits::LimitProvider;
531        let lim_a = lp.lim_at(a);
532        let lim_b = lp.lim_at(b);
533        let lim = lim_a.min(lim_b);
534        for i in 0..lim {
535            if text[a + i] != text[b + i] {
536                return text[a + i].cmp(&text[b + i]);
537            }
538        }
539        lim_a.cmp(&lim_b)
540    }
541
542    /// Assert that `sa` is a valid segmented SA over `text` partitioned
543    /// by `lengths`:
544    ///
545    /// 1. it's a permutation of the positions in `positions`, and
546    /// 2. every adjacent pair is in non-decreasing comparator order.
547    ///
548    /// Comparator-equivalent suffixes can appear in any relative order —
549    /// caps-sa's merge isn't a stable sort, so we don't pin a canonical
550    /// permutation.
551    fn assert_segmented_sa_valid(text: &[u8], lengths: &[usize], positions: &[u32], sa: &[u32]) {
552        let lp = SegmentedText::from_lengths(text.len(), lengths);
553        let mut expected = positions.to_vec();
554        expected.sort();
555        let mut got_sorted = sa.to_vec();
556        got_sorted.sort();
557        assert_eq!(got_sorted, expected, "sa is not a permutation of positions");
558        for w in sa.windows(2) {
559            let a = w[0] as usize;
560            let b = w[1] as usize;
561            let ord = segmented_cmp(text, &lp, a, b);
562            assert_ne!(
563                ord,
564                std::cmp::Ordering::Greater,
565                "out of order: pos {a} > pos {b} under segmented comparator",
566            );
567        }
568    }
569
570    #[test]
571    fn segmented_in_memory_matches_brute_force_small() {
572        // 4 segments: "hello" | "world" | "banana" | "mississippi"
573        let text: Vec<u8> = b"helloworldbananamississippi".to_vec();
574        let lengths = &[5usize, 5, 6, 11];
575        let lp = SegmentedText::from_lengths(text.len(), lengths);
576        let sa: Vec<u32> = build_in_memory_with(&text, &lp, &Opts::default());
577        let all_positions: Vec<u32> = (0..text.len() as u32).collect();
578        assert_segmented_sa_valid(&text, lengths, &all_positions, &sa);
579    }
580
581    #[test]
582    fn segmented_single_segment_equals_unsegmented() {
583        // A single segment covering the whole text is the same as the
584        // non-segmented SA — confirms the LimitProvider path doesn't
585        // perturb the standard order when there's nothing to truncate.
586        let text = b"mississippi";
587        let lp = SegmentedText::from_lengths(text.len(), &[text.len()]);
588        let got_segmented: Vec<u32> = build_in_memory_with(text, &lp, &Opts::default());
589        let got_plain: Vec<u32> = build_in_memory(text);
590        assert_eq!(got_segmented, got_plain);
591    }
592
593    #[test]
594    fn segmented_random_validity() {
595        use rand::{RngExt, SeedableRng};
596        let mut rng = rand::rngs::StdRng::seed_from_u64(0x5E6);
597        for _ in 0..20 {
598            let n_segments = rng.random_range(1..10usize);
599            let lengths: Vec<usize> = (0..n_segments)
600                .map(|_| rng.random_range(5..50usize))
601                .collect();
602            let n: usize = lengths.iter().sum();
603            // Small alphabet so the LCP-truncation case actually fires.
604            let text: Vec<u8> = (0..n).map(|_| rng.random_range(0..3u8)).collect();
605            let lp = SegmentedText::from_lengths(n, &lengths);
606            let sa: Vec<u32> = build_in_memory_with(&text, &lp, &Opts::default());
607            let all_positions: Vec<u32> = (0..n as u32).collect();
608            assert_segmented_sa_valid(&text, &lengths, &all_positions, &sa);
609        }
610    }
611
612    #[test]
613    fn segmented_for_positions_subset_validity() {
614        // Filter to even positions only, sort with segmentation.
615        let text: Vec<u8> = b"helloworldbananamississippi".to_vec();
616        let lengths = &[5usize, 5, 6, 11];
617        let positions: Vec<u32> = (0..text.len() as u32).step_by(2).collect();
618        let lp = SegmentedText::from_lengths(text.len(), lengths);
619        let sa =
620            build_in_memory_for_positions_with(&text, positions.clone(), &lp, &Opts::default());
621        assert_segmented_sa_valid(&text, lengths, &positions, &sa);
622    }
623
624    // ---- STAR-convention boundary_order tests ----
625
626    /// A `LimitProvider` wrapping [`SegmentedText`] with STAR's
627    /// `spacer-as-largest` boundary semantics: the suffix that hits
628    /// its limit first is *larger*, equivalently the longer-`lim`
629    /// suffix is smaller, with an ascending-position tie-break when
630    /// `lim_a == lim_b`. Used by rustar-aligner's `sa_build` to keep
631    /// byte-for-byte STAR compatibility on the segmented arm.
632    struct StarConvention {
633        inner: SegmentedText,
634    }
635
636    impl crate::limits::LimitProvider for StarConvention {
637        fn lim_at(&self, p: usize) -> usize {
638            self.inner.lim_at(p)
639        }
640        fn boundary_order(
641            &self,
642            p_a: usize,
643            lim_a: usize,
644            p_b: usize,
645            lim_b: usize,
646        ) -> std::cmp::Ordering {
647            lim_b.cmp(&lim_a).then(p_a.cmp(&p_b))
648        }
649    }
650
651    /// Brute-force SA under STAR's convention (longer-lim is smaller,
652    /// position tie-break). Used as the oracle for the differential
653    /// test. With a position tie-break the SA is uniquely determined,
654    /// so this can be compared with `assert_eq!`.
655    fn star_brute_force_sa(text: &[u8], lengths: &[usize]) -> Vec<u32> {
656        use crate::limits::LimitProvider;
657        let lp = SegmentedText::from_lengths(text.len(), lengths);
658        let mut sa: Vec<u32> = (0..text.len() as u32).collect();
659        sa.sort_by(|&a, &b| {
660            let pa = a as usize;
661            let pb = b as usize;
662            let lim_a = lp.lim_at(pa);
663            let lim_b = lp.lim_at(pb);
664            let lim = lim_a.min(lim_b);
665            for i in 0..lim {
666                if text[pa + i] != text[pb + i] {
667                    return text[pa + i].cmp(&text[pb + i]);
668                }
669            }
670            // STAR convention: longer-lim is smaller, then position.
671            lim_b.cmp(&lim_a).then(pa.cmp(&pb))
672        });
673        sa
674    }
675
676    #[test]
677    fn star_convention_matches_brute_force_small() {
678        // 4 segments: "hello" | "world" | "banana" | "mississippi"
679        let text: Vec<u8> = b"helloworldbananamississippi".to_vec();
680        let lengths = &[5usize, 5, 6, 11];
681        let lp = StarConvention {
682            inner: SegmentedText::from_lengths(text.len(), lengths),
683        };
684        let got: Vec<u32> = build_in_memory_with(&text, &lp, &Opts::default());
685        let want = star_brute_force_sa(&text, lengths);
686        assert_eq!(got, want, "STAR-convention SA mismatch");
687    }
688
689    /// Exercises the STAR-specific within-segment longer-is-smaller
690    /// case: in "AAAA" with one segment, STAR orders the longest
691    /// suffix first (`AAAA < AAA < AA < A`) — opposite of the
692    /// standard SA's `A < AA < AAA < AAAA`.
693    #[test]
694    fn star_convention_within_segment_longer_first() {
695        let text = b"AAAA";
696        let lp = StarConvention {
697            inner: SegmentedText::from_lengths(text.len(), &[text.len()]),
698        };
699        let got: Vec<u32> = build_in_memory_with(text, &lp, &Opts::default());
700        // Position 0 = "AAAA" (lim 4), 1 = "AAA" (lim 3), 2 = "AA"
701        // (lim 2), 3 = "A" (lim 1). Longer-lim is smaller, so 0 < 1
702        // < 2 < 3.
703        assert_eq!(got, vec![0u32, 1, 2, 3]);
704    }
705
706    #[test]
707    fn star_convention_random_matches_brute_force() {
708        use rand::{RngExt, SeedableRng};
709        let mut rng = rand::rngs::StdRng::seed_from_u64(0xCAFE);
710        for _ in 0..20 {
711            let n_segments = rng.random_range(1..10usize);
712            let lengths: Vec<usize> = (0..n_segments)
713                .map(|_| rng.random_range(5..50usize))
714                .collect();
715            let n: usize = lengths.iter().sum();
716            // Small alphabet so the boundary-tie-break case fires.
717            let text: Vec<u8> = (0..n).map(|_| rng.random_range(0..3u8)).collect();
718            let lp = StarConvention {
719                inner: SegmentedText::from_lengths(n, &lengths),
720            };
721            let got: Vec<u32> = build_in_memory_with(&text, &lp, &Opts::default());
722            let want = star_brute_force_sa(&text, &lengths);
723            assert_eq!(
724                got, want,
725                "STAR-convention SA mismatch (lengths={lengths:?}, text={text:?})",
726            );
727        }
728    }
729}
caps_sa/sample_sort.rs

caps_sa/
sample_sort.rs