caps_sa/
sample_sort.rs

1//! In-memory CaPS-SA-style suffix array construction.
2//!
3//! Phase 1 of the port: a parallel merge-sort with LCP-enhanced two-way merge,
4//! exactly the inner sorting kernel of upstream CaPS-SA's `Suffix_Array::merge`
5//! and `Suffix_Array::merge_sort` (see `include/Suffix_Array.hpp` and
6//! `src/Suffix_Array.cpp`). The sample-sort partitioning around this kernel
7//! (`select_pivots` → `distribute_sub_subarrays` → `merge_sub_subarrays`) is
8//! Phase 2 / 3 work; the kernel here already produces a correct LCP-annotated
9//! suffix array, and `rayon::join` gives parallel divide for free.
10//!
11//! The LCP-enhanced merge maintains:
12//!
13//! * `m` — the LCP between the last-output element and the current top of the
14//!   *other* stream.
15//! * `l_a` = `lcp_a[i_a]` — the LCP between the current top of the
16//!   last-output stream and its immediate predecessor (which is the
17//!   last-output element).
18//!
19//! Three cases per step:
20//!
21//! * `l_a > m`: the next candidate from the last-output stream agrees with the
22//!   last-output element past where the other stream diverged — it lies on the
23//!   same side of the other stream's top as the last-output element did, so it
24//!   wins. No symbol comparison needed.
25//! * `l_a < m`: the next candidate diverges from the last-output element
26//!   *inside* the prefix shared with the other stream's top. Since the stream
27//!   is sorted, the new candidate is larger than its predecessor; at the
28//!   divergence offset it therefore exceeds the other stream's top — the
29//!   other stream wins. No symbol comparison needed.
30//! * `l_a == m`: undetermined; extend the LCP from offset `m` by an actual
31//!   symbol scan and compare.
32
33use crate::Index;
34use crate::lcp::{LcpDispatch, Symbol};
35use crate::limits::{LimitProvider, PlainText};
36use rayon::join;
37
38/// Tunable options for SA construction.
39#[derive(Clone, Debug)]
40pub struct Opts {
41    /// Bound on extension comparisons inside the merge. `usize::MAX` (default)
42    /// is unbounded — required for full lexicographic correctness when the
43    /// caller's text doesn't guarantee comparisons terminate via sentinels
44    /// within a known window.
45    pub max_context: usize,
46}
47
48impl Default for Opts {
49    fn default() -> Self {
50        Self {
51            max_context: usize::MAX,
52        }
53    }
54}
55
56/// Build the suffix array of `text` in memory and return it.
57///
58/// Generic over the symbol type `S` (`Ord + Copy`, e.g. `u8`, `u16`, `u32`)
59/// and the index type `I` (`u32`, `u64`, `usize`). Pick the narrowest `I`
60/// that can hold `text.len()`.
61///
62/// Produces a *standard lexicographic* suffix array. The "shorter suffix is
63/// smaller when one runs off the end of `text`" tie-break is applied — i.e.
64/// the algorithm behaves as if `text` is followed by an implicit symbol
65/// smaller than all of `S`.
66pub fn build_in_memory<S, I>(text: &[S]) -> Vec<I>
67where
68    S: Symbol,
69    I: Index,
70{
71    build_in_memory_with_opts(text, &Opts::default())
72}
73
74/// Variant of [`build_in_memory`] that accepts tuning options.
75pub fn build_in_memory_with_opts<S, I>(text: &[S], opts: &Opts) -> Vec<I>
76where
77    S: Symbol,
78    I: Index,
79{
80    build_in_memory_with(text, &PlainText::new(text.len()), opts)
81}
82
83/// Variant of [`build_in_memory`] that accepts a [`LimitProvider`].
84/// With [`PlainText`] this is identical to [`build_in_memory`]; with
85/// [`SegmentedText`][crate::limits::SegmentedText] the LCP scans stop
86/// at segment boundaries.
87pub fn build_in_memory_with<S, I, L>(text: &[S], lp: &L, opts: &Opts) -> Vec<I>
88where
89    S: Symbol,
90    I: Index,
91    L: LimitProvider,
92{
93    let n = text.len();
94    let positions: Vec<I> = (0..n).map(I::from_usize).collect();
95    build_in_memory_for_positions_with(text, positions, lp, opts)
96}
97
98/// Sort the caller-supplied `positions` by the lexicographic order of
99/// their suffixes in `text`. Returns the positions reordered so that
100/// `text[output[i]..]` is the i-th smallest suffix among the input set.
101///
102/// Equivalent to [`build_in_memory`] for the special case
103/// `positions = (0..text.len()).collect()`; the explicit-positions form
104/// lets callers skip suffixes they don't want included in the sort —
105/// e.g. STAR-style genome indexing where only ACGT-starting positions
106/// participate in the SA, avoiding the O(n) work of sorting and then
107/// discarding the spacer-starting positions inside bin-padding.
108///
109/// The suffix at each position is still the slice `text[position..]`;
110/// no positions are dropped from the input. To filter, the caller
111/// constructs `positions` with only the indices they want.
112pub fn build_in_memory_for_positions<S, I>(text: &[S], positions: Vec<I>) -> Vec<I>
113where
114    S: Symbol,
115    I: Index,
116{
117    build_in_memory_for_positions_with_opts(text, positions, &Opts::default())
118}
119
120/// Variant of [`build_in_memory_for_positions`] that accepts tuning options.
121pub fn build_in_memory_for_positions_with_opts<S, I>(
122    text: &[S],
123    positions: Vec<I>,
124    opts: &Opts,
125) -> Vec<I>
126where
127    S: Symbol,
128    I: Index,
129{
130    build_in_memory_for_positions_with(text, positions, &PlainText::new(text.len()), opts)
131}
132
133/// Variant of [`build_in_memory_for_positions`] that accepts both a
134/// [`LimitProvider`] (for segmented LCP truncation) and tuning options.
135/// With [`PlainText`] this is identical to
136/// [`build_in_memory_for_positions_with_opts`].
137pub fn build_in_memory_for_positions_with<S, I, L>(
138    text: &[S],
139    positions: Vec<I>,
140    lp: &L,
141    opts: &Opts,
142) -> Vec<I>
143where
144    S: Symbol,
145    I: Index,
146    L: LimitProvider,
147{
148    let n = positions.len();
149    if n == 0 {
150        return Vec::new();
151    }
152
153    let mut sa: Vec<I> = positions;
154    let mut sa_w: Vec<I> = vec![I::zero(); n];
155    let mut lcp_arr: Vec<I> = vec![I::zero(); n];
156    let mut lcp_w: Vec<I> = vec![I::zero(); n];
157
158    // Choose the LCP implementation once for the whole build; the captured
159    // function pointer travels through the recursion in a register, so the
160    // inner merge loop pays no atomic load or feature-detection branch.
161    let dispatch = LcpDispatch::detect();
162
163    merge_sort(
164        text,
165        lp,
166        &mut sa,
167        &mut sa_w,
168        &mut lcp_arr,
169        &mut lcp_w,
170        opts.max_context,
171        dispatch,
172    );
173
174    sa
175}
176
177/// Recursive merge-sort with LCP maintenance.
178///
179/// Pre: `sa.len() == sa_w.len() == lcp_arr.len() == lcp_w.len()`. The contents
180/// of `sa` are the suffix positions to sort (typically an identity
181/// permutation at the top level). All other buffers are scratch / output.
182///
183/// Post: `sa` is sorted in ascending lexicographic order on
184/// `text[sa[i]..]`; `lcp_arr[0] = 0` and `lcp_arr[i] = lcp(text[sa[i-1]..],
185/// text[sa[i]..])` for `i >= 1`.
186///
187/// Visible to the rest of the crate so the external-memory path can sort
188/// individual subarrays of positions using the same kernel.
189#[allow(clippy::too_many_arguments)] // 4 buffers + text + lp + ctx + dispatch
190pub(crate) fn merge_sort<S, I, L>(
191    text: &[S],
192    lp: &L,
193    sa: &mut [I],
194    sa_w: &mut [I],
195    lcp_arr: &mut [I],
196    lcp_w: &mut [I],
197    max_ctx: usize,
198    dispatch: LcpDispatch,
199) where
200    S: Symbol,
201    I: Index,
202    L: LimitProvider,
203{
204    let n = sa.len();
205    debug_assert_eq!(sa_w.len(), n);
206    debug_assert_eq!(lcp_arr.len(), n);
207    debug_assert_eq!(lcp_w.len(), n);
208
209    if n <= 1 {
210        if n == 1 {
211            lcp_arr[0] = I::zero();
212        }
213        return;
214    }
215
216    let mid = n / 2;
217    let (sa_l, sa_r) = sa.split_at_mut(mid);
218    let (sa_w_l, sa_w_r) = sa_w.split_at_mut(mid);
219    let (lcp_l, lcp_r) = lcp_arr.split_at_mut(mid);
220    let (lcp_w_l, lcp_w_r) = lcp_w.split_at_mut(mid);
221
222    join(
223        || merge_sort(text, lp, sa_l, sa_w_l, lcp_l, lcp_w_l, max_ctx, dispatch),
224        || merge_sort(text, lp, sa_r, sa_w_r, lcp_r, lcp_w_r, max_ctx, dispatch),
225    );
226
227    // Merge the two sorted halves (still living in `sa`) into the workspace,
228    // then copy the workspace back into the destination so the caller's
229    // postcondition holds on `sa` / `lcp_arr`.
230    merge(
231        text, lp, sa_l, sa_r, lcp_l, lcp_r, sa_w, lcp_w, max_ctx, dispatch,
232    );
233    sa.copy_from_slice(sa_w);
234    lcp_arr.copy_from_slice(lcp_w);
235}
236
237/// LCP-enhanced two-way merge of two sorted suffix arrays.
238///
239/// `x` / `lcp_x` and `y` / `lcp_y` must each be sorted with `lcp_*[0] == 0`
240/// and `lcp_*[i] = lcp(arr[i-1], arr[i])` for `i >= 1`. The result is written
241/// into `z` / `lcp_z` (length `x.len() + y.len()`).
242///
243/// Visible to the rest of the crate so the external-memory path can cascade
244/// 2-way merges across each partition's sub-subarrays during Phase 4.
245#[allow(clippy::too_many_arguments)] // CaPS-SA's merge takes 5 buffers + text + lp + ctx + dispatch
246pub(crate) fn merge<S, I, L>(
247    text: &[S],
248    lp: &L,
249    x: &[I],
250    y: &[I],
251    lcp_x: &[I],
252    lcp_y: &[I],
253    z: &mut [I],
254    lcp_z: &mut [I],
255    max_ctx: usize,
256    dispatch: LcpDispatch,
257) where
258    S: Symbol,
259    I: Index,
260    L: LimitProvider,
261{
262    let len_x = x.len();
263    let len_y = y.len();
264    debug_assert_eq!(z.len(), len_x + len_y);
265    debug_assert_eq!(lcp_z.len(), len_x + len_y);
266
267    if len_x == 0 {
268        z.copy_from_slice(y);
269        lcp_z.copy_from_slice(lcp_y);
270        return;
271    }
272    if len_y == 0 {
273        z.copy_from_slice(x);
274        lcp_z.copy_from_slice(lcp_x);
275        return;
276    }
277
278    // The "swap-on-output-from-B" trick from upstream CaPS-SA: we always
279    // label the stream we last output from as `A`, and the other as `B`. On
280    // entry no output has been produced yet, but the convention is consistent
281    // because both LCP arrays satisfy `lcp_*[0] == 0` and `m` starts at 0 —
282    // the first iteration falls into the `l_a == m` branch and computes the
283    // first comparison from scratch.
284    let mut arr_a: &[I] = x;
285    let mut arr_b: &[I] = y;
286    let mut lcp_a: &[I] = lcp_x;
287    let mut lcp_b: &[I] = lcp_y;
288    let mut len_a = len_x;
289    let mut len_b = len_y;
290    let mut i_a: usize = 0;
291    let mut i_b: usize = 0;
292    let mut m: usize = 0;
293    let mut k: usize = 0;
294
295    while i_a < len_a && i_b < len_b {
296        let l_a = lcp_a[i_a].to_usize();
297
298        // (output_a, lcp_for_output, new_m)
299        let (output_a, lcp_for_output, new_m) = if l_a > m {
300            (true, l_a, m)
301        } else if l_a < m {
302            (false, m, l_a)
303        } else {
304            // Tied — extend by an actual symbol scan from offset m.
305            let p_a = arr_a[i_a].to_usize();
306            let p_b = arr_b[i_b].to_usize();
307            // `lim_a` / `lim_b` are the per-suffix logical lengths from
308            // the `LimitProvider`. With `PlainText` these fold to
309            // `n_text - p` (the same expression the pre-LimitProvider
310            // code computed inline); with `SegmentedText` they cap at
311            // the next segment boundary so the LCP scan stops there.
312            let lim_a = lp.lim_at(p_a);
313            let lim_b = lp.lim_at(p_b);
314            // Pass an already-segmentation-aware cap to the SIMD LCP so
315            // it doesn't have to scan past the boundary. For PlainText
316            // this is equivalent to the LCP function's own `n - p`
317            // intersection — no extra work.
318            let cap = lim_a.min(lim_b).min(max_ctx);
319            let remaining_ctx = cap.saturating_sub(m);
320            let ext = dispatch.lcp(text, p_a + m, p_b + m, remaining_ctx);
321            let total = m + ext;
322            let a_smaller = if total < lim_a && total < lim_b {
323                text[p_a + total] < text[p_b + total]
324            } else {
325                // One or both suffixes ran off the end of their
326                // segment (or hit `max_ctx`). Defer to the
327                // [`LimitProvider`]'s boundary-ordering convention —
328                // for [`PlainText`] / standard `SegmentedText` this
329                // is `lim_a.cmp(&lim_b)` (shorter-is-smaller, the
330                // generalised-SA convention); custom impls can flip
331                // it (e.g. STAR's `spacer-as-largest`).
332                lp.boundary_order(p_a, lim_a, p_b, lim_b).is_lt()
333            };
334            (a_smaller, m, total)
335        };
336
337        if output_a {
338            z[k] = arr_a[i_a];
339            lcp_z[k] = I::from_usize(lcp_for_output);
340            i_a += 1;
341        } else {
342            z[k] = arr_b[i_b];
343            lcp_z[k] = I::from_usize(lcp_for_output);
344            i_b += 1;
345            // Outputting from B: swap labels so the next iteration's
346            // "last-output stream" is what was B.
347            std::mem::swap(&mut arr_a, &mut arr_b);
348            std::mem::swap(&mut lcp_a, &mut lcp_b);
349            std::mem::swap(&mut len_a, &mut len_b);
350            std::mem::swap(&mut i_a, &mut i_b);
351        }
352        m = new_m;
353        k += 1;
354    }
355
356    // Drain the surviving stream. The first drained element carries the
357    // boundary LCP (`m`) connecting it to the last cross-stream output;
358    // subsequent drained elements use the source LCP array unchanged.
359    drain(arr_a, lcp_a, i_a, len_a, z, lcp_z, &mut k, m);
360    drain(arr_b, lcp_b, i_b, len_b, z, lcp_z, &mut k, m);
361}
362
363#[inline]
364#[allow(clippy::too_many_arguments)] // drain handles both source streams via labelled args
365fn drain<I: Index>(
366    arr: &[I],
367    lcp_src: &[I],
368    mut i: usize,
369    len: usize,
370    z: &mut [I],
371    lcp_z: &mut [I],
372    k: &mut usize,
373    boundary_m: usize,
374) {
375    let mut first = true;
376    while i < len {
377        z[*k] = arr[i];
378        lcp_z[*k] = if first {
379            I::from_usize(boundary_m)
380        } else {
381            lcp_src[i]
382        };
383        first = false;
384        i += 1;
385        *k += 1;
386    }
387}
388
389#[cfg(test)]
390mod tests {
391    use super::*;
392
393    /// Brute-force reference suffix array via `sort_by` over byte slices.
394    fn brute_force_sa(text: &[u8]) -> Vec<u32> {
395        let mut sa: Vec<u32> = (0..text.len() as u32).collect();
396        sa.sort_by(|&a, &b| text[a as usize..].cmp(&text[b as usize..]));
397        sa
398    }
399
400    fn assert_matches_brute(text: &[u8]) {
401        let got: Vec<u32> = build_in_memory(text);
402        let want = brute_force_sa(text);
403        assert_eq!(got, want, "mismatch on text {text:?}");
404    }
405
406    #[test]
407    fn empty_text() {
408        let sa: Vec<u32> = build_in_memory::<u8, u32>(&[]);
409        assert!(sa.is_empty());
410    }
411
412    #[test]
413    fn single_symbol() {
414        let sa: Vec<u32> = build_in_memory(&[7u8]);
415        assert_eq!(sa, vec![0]);
416    }
417
418    #[test]
419    fn banana() {
420        assert_matches_brute(b"banana");
421    }
422
423    #[test]
424    fn mississippi() {
425        assert_matches_brute(b"mississippi");
426    }
427
428    #[test]
429    fn small_distinct_sentinel() {
430        // Alphabet 0..=5 with a unique terminator. Models the
431        // sentinel-transformed STAR text on a tiny example.
432        let text: Vec<u8> = vec![0, 1, 2, 0, 1, 5, 0, 2, 1, 6];
433        let got: Vec<u32> = build_in_memory(&text);
434        let want = brute_force_sa(&text);
435        assert_eq!(got, want);
436    }
437
438    #[test]
439    fn random_byte_texts() {
440        use rand::{RngExt, SeedableRng};
441        let mut rng = rand::rngs::StdRng::seed_from_u64(0xC0FFEE);
442        for &n in &[1usize, 2, 3, 7, 33, 200, 1000] {
443            let text: Vec<u8> = (0..n).map(|_| rng.random_range(0..6u8)).collect();
444            let got: Vec<u32> = build_in_memory(&text);
445            let want = brute_force_sa(&text);
446            assert_eq!(got, want, "mismatch on random text len={n}");
447        }
448    }
449
450    #[test]
451    fn for_positions_full_set_matches_build_in_memory() {
452        // Same output as build_in_memory when positions is the identity.
453        let text = b"banana";
454        let want: Vec<u32> = build_in_memory(text);
455        let positions: Vec<u32> = (0..text.len() as u32).collect();
456        let got = build_in_memory_for_positions(text, positions);
457        assert_eq!(got, want);
458    }
459
460    #[test]
461    fn for_positions_subset_matches_brute_force() {
462        // Sort only the even positions of "mississippi" by their
463        // suffixes; verify against brute force.
464        let text = b"mississippi";
465        let positions: Vec<u32> = (0..text.len() as u32).step_by(2).collect();
466        let mut want = positions.clone();
467        want.sort_by(|&a, &b| text[a as usize..].cmp(&text[b as usize..]));
468        let got = build_in_memory_for_positions(text, positions);
469        assert_eq!(got, want);
470    }
471
472    #[test]
473    fn for_positions_random_subsets() {
474        use rand::{RngExt, SeedableRng};
475        let mut rng = rand::rngs::StdRng::seed_from_u64(0xFEED);
476        for &n in &[33usize, 200, 1000] {
477            let text: Vec<u8> = (0..n).map(|_| rng.random_range(0..6u8)).collect();
478            // Random subset of positions.
479            let mut positions: Vec<u32> = (0..n as u32).collect();
480            // Drop a random ~30%.
481            positions.retain(|_| rng.random_range(0..10) < 7);
482            let mut want = positions.clone();
483            want.sort_by(|&a, &b| text[a as usize..].cmp(&text[b as usize..]));
484            let got = build_in_memory_for_positions(&text, positions);
485            assert_eq!(got, want, "subset sort mismatch n={n}");
486        }
487    }
488
489    #[test]
490    fn random_with_unique_terminator() {
491        // Distinct large terminator at the end — mimics the transform we'll
492        // apply for STAR.
493        use rand::{RngExt, SeedableRng};
494        let mut rng = rand::rngs::StdRng::seed_from_u64(0xBEEF);
495        for &n in &[1usize, 50, 500] {
496            let mut text: Vec<u8> = (0..n).map(|_| rng.random_range(0..5u8)).collect();
497            text.push(250); // unique max
498            let got: Vec<u32> = build_in_memory(&text);
499            let want = brute_force_sa(&text);
500            assert_eq!(got, want);
501        }
502    }
503
504    // ---- segmented SA tests ----
505
506    use crate::limits::SegmentedText;
507
508    /// Compare two suffixes under the segmented comparator (LCP
509    /// truncated at the boundary, "shorter-is-smaller" tie-break).
510    fn segmented_cmp(text: &[u8], lp: &SegmentedText, a: usize, b: usize) -> std::cmp::Ordering {
511        use crate::limits::LimitProvider;
512        let lim_a = lp.lim_at(a);
513        let lim_b = lp.lim_at(b);
514        let lim = lim_a.min(lim_b);
515        for i in 0..lim {
516            if text[a + i] != text[b + i] {
517                return text[a + i].cmp(&text[b + i]);
518            }
519        }
520        lim_a.cmp(&lim_b)
521    }
522
523    /// Assert that `sa` is a valid segmented SA over `text` partitioned
524    /// by `lengths`:
525    ///
526    /// 1. it's a permutation of the positions in `positions`, and
527    /// 2. every adjacent pair is in non-decreasing comparator order.
528    ///
529    /// Comparator-equivalent suffixes can appear in any relative order —
530    /// caps-sa's merge isn't a stable sort, so we don't pin a canonical
531    /// permutation.
532    fn assert_segmented_sa_valid(text: &[u8], lengths: &[usize], positions: &[u32], sa: &[u32]) {
533        let lp = SegmentedText::from_lengths(text.len(), lengths);
534        let mut expected = positions.to_vec();
535        expected.sort();
536        let mut got_sorted = sa.to_vec();
537        got_sorted.sort();
538        assert_eq!(got_sorted, expected, "sa is not a permutation of positions");
539        for w in sa.windows(2) {
540            let a = w[0] as usize;
541            let b = w[1] as usize;
542            let ord = segmented_cmp(text, &lp, a, b);
543            assert_ne!(
544                ord,
545                std::cmp::Ordering::Greater,
546                "out of order: pos {a} > pos {b} under segmented comparator",
547            );
548        }
549    }
550
551    #[test]
552    fn segmented_in_memory_matches_brute_force_small() {
553        // 4 segments: "hello" | "world" | "banana" | "mississippi"
554        let text: Vec<u8> = b"helloworldbananamississippi".to_vec();
555        let lengths = &[5usize, 5, 6, 11];
556        let lp = SegmentedText::from_lengths(text.len(), lengths);
557        let sa: Vec<u32> = build_in_memory_with(&text, &lp, &Opts::default());
558        let all_positions: Vec<u32> = (0..text.len() as u32).collect();
559        assert_segmented_sa_valid(&text, lengths, &all_positions, &sa);
560    }
561
562    #[test]
563    fn segmented_single_segment_equals_unsegmented() {
564        // A single segment covering the whole text is the same as the
565        // non-segmented SA — confirms the LimitProvider path doesn't
566        // perturb the standard order when there's nothing to truncate.
567        let text = b"mississippi";
568        let lp = SegmentedText::from_lengths(text.len(), &[text.len()]);
569        let got_segmented: Vec<u32> = build_in_memory_with(text, &lp, &Opts::default());
570        let got_plain: Vec<u32> = build_in_memory(text);
571        assert_eq!(got_segmented, got_plain);
572    }
573
574    #[test]
575    fn segmented_random_validity() {
576        use rand::{RngExt, SeedableRng};
577        let mut rng = rand::rngs::StdRng::seed_from_u64(0x5E6);
578        for _ in 0..20 {
579            let n_segments = rng.random_range(1..10usize);
580            let lengths: Vec<usize> = (0..n_segments)
581                .map(|_| rng.random_range(5..50usize))
582                .collect();
583            let n: usize = lengths.iter().sum();
584            // Small alphabet so the LCP-truncation case actually fires.
585            let text: Vec<u8> = (0..n).map(|_| rng.random_range(0..3u8)).collect();
586            let lp = SegmentedText::from_lengths(n, &lengths);
587            let sa: Vec<u32> = build_in_memory_with(&text, &lp, &Opts::default());
588            let all_positions: Vec<u32> = (0..n as u32).collect();
589            assert_segmented_sa_valid(&text, &lengths, &all_positions, &sa);
590        }
591    }
592
593    #[test]
594    fn segmented_for_positions_subset_validity() {
595        // Filter to even positions only, sort with segmentation.
596        let text: Vec<u8> = b"helloworldbananamississippi".to_vec();
597        let lengths = &[5usize, 5, 6, 11];
598        let positions: Vec<u32> = (0..text.len() as u32).step_by(2).collect();
599        let lp = SegmentedText::from_lengths(text.len(), lengths);
600        let sa =
601            build_in_memory_for_positions_with(&text, positions.clone(), &lp, &Opts::default());
602        assert_segmented_sa_valid(&text, lengths, &positions, &sa);
603    }
604
605    // ---- STAR-convention boundary_order tests ----
606
607    /// A `LimitProvider` wrapping [`SegmentedText`] with STAR's
608    /// `spacer-as-largest` boundary semantics: the suffix that hits
609    /// its limit first is *larger*, equivalently the longer-`lim`
610    /// suffix is smaller, with an ascending-position tie-break when
611    /// `lim_a == lim_b`. Used by rustar-aligner's `sa_build` to keep
612    /// byte-for-byte STAR compatibility on the segmented arm.
613    struct StarConvention {
614        inner: SegmentedText,
615    }
616
617    impl crate::limits::LimitProvider for StarConvention {
618        fn lim_at(&self, p: usize) -> usize {
619            self.inner.lim_at(p)
620        }
621        fn boundary_order(
622            &self,
623            p_a: usize,
624            lim_a: usize,
625            p_b: usize,
626            lim_b: usize,
627        ) -> std::cmp::Ordering {
628            lim_b.cmp(&lim_a).then(p_a.cmp(&p_b))
629        }
630    }
631
632    /// Brute-force SA under STAR's convention (longer-lim is smaller,
633    /// position tie-break). Used as the oracle for the differential
634    /// test. With a position tie-break the SA is uniquely determined,
635    /// so this can be compared with `assert_eq!`.
636    fn star_brute_force_sa(text: &[u8], lengths: &[usize]) -> Vec<u32> {
637        use crate::limits::LimitProvider;
638        let lp = SegmentedText::from_lengths(text.len(), lengths);
639        let mut sa: Vec<u32> = (0..text.len() as u32).collect();
640        sa.sort_by(|&a, &b| {
641            let pa = a as usize;
642            let pb = b as usize;
643            let lim_a = lp.lim_at(pa);
644            let lim_b = lp.lim_at(pb);
645            let lim = lim_a.min(lim_b);
646            for i in 0..lim {
647                if text[pa + i] != text[pb + i] {
648                    return text[pa + i].cmp(&text[pb + i]);
649                }
650            }
651            // STAR convention: longer-lim is smaller, then position.
652            lim_b.cmp(&lim_a).then(pa.cmp(&pb))
653        });
654        sa
655    }
656
657    #[test]
658    fn star_convention_matches_brute_force_small() {
659        // 4 segments: "hello" | "world" | "banana" | "mississippi"
660        let text: Vec<u8> = b"helloworldbananamississippi".to_vec();
661        let lengths = &[5usize, 5, 6, 11];
662        let lp = StarConvention {
663            inner: SegmentedText::from_lengths(text.len(), lengths),
664        };
665        let got: Vec<u32> = build_in_memory_with(&text, &lp, &Opts::default());
666        let want = star_brute_force_sa(&text, lengths);
667        assert_eq!(got, want, "STAR-convention SA mismatch");
668    }
669
670    /// Exercises the STAR-specific within-segment longer-is-smaller
671    /// case: in "AAAA" with one segment, STAR orders the longest
672    /// suffix first (`AAAA < AAA < AA < A`) — opposite of the
673    /// standard SA's `A < AA < AAA < AAAA`.
674    #[test]
675    fn star_convention_within_segment_longer_first() {
676        let text = b"AAAA";
677        let lp = StarConvention {
678            inner: SegmentedText::from_lengths(text.len(), &[text.len()]),
679        };
680        let got: Vec<u32> = build_in_memory_with(text, &lp, &Opts::default());
681        // Position 0 = "AAAA" (lim 4), 1 = "AAA" (lim 3), 2 = "AA"
682        // (lim 2), 3 = "A" (lim 1). Longer-lim is smaller, so 0 < 1
683        // < 2 < 3.
684        assert_eq!(got, vec![0u32, 1, 2, 3]);
685    }
686
687    #[test]
688    fn star_convention_random_matches_brute_force() {
689        use rand::{RngExt, SeedableRng};
690        let mut rng = rand::rngs::StdRng::seed_from_u64(0xCAFE);
691        for _ in 0..20 {
692            let n_segments = rng.random_range(1..10usize);
693            let lengths: Vec<usize> = (0..n_segments)
694                .map(|_| rng.random_range(5..50usize))
695                .collect();
696            let n: usize = lengths.iter().sum();
697            // Small alphabet so the boundary-tie-break case fires.
698            let text: Vec<u8> = (0..n).map(|_| rng.random_range(0..3u8)).collect();
699            let lp = StarConvention {
700                inner: SegmentedText::from_lengths(n, &lengths),
701            };
702            let got: Vec<u32> = build_in_memory_with(&text, &lp, &Opts::default());
703            let want = star_brute_force_sa(&text, &lengths);
704            assert_eq!(
705                got, want,
706                "STAR-convention SA mismatch (lengths={lengths:?}, text={text:?})",
707            );
708        }
709    }
710}
caps_sa/sample_sort.rs

caps_sa/
sample_sort.rs