Skip to main content

caps_sa/
limits.rs

1//! Per-suffix length providers for segmented suffix-array construction.
2//!
3//! In the standard SA construction the "natural length" of the suffix
4//! starting at position `p` is `text.len() - p`. For *segmented* texts
5//! (multi-string SAs, splice-junction indexes, etc.) we want LCP
6//! comparisons to stop at the next segment boundary instead — the
7//! suffix logically ends there, and the merge resolves cross-segment
8//! ordering by "shorter-suffix-is-smaller" (the standard generalised-SA
9//! convention).
10//!
11//! The [`LimitProvider`] trait abstracts the per-suffix length lookup
12//! and is plumbed through every site in `merge` / `cascade_merge` /
13//! `suffix_cmp` that previously computed `n - p` inline.
14//! [`PlainText`] is the zero-cost default — its `lim_at` is
15//! `#[inline(always)]` and folds to the same `n - p` expression the
16//! current code emits, so the non-segmented path generates **bit-
17//! identical assembly** to today's after monomorphization.
18//! [`SegmentedText`] holds a sorted cumulative-ends `Vec<u64>` and
19//! does a `partition_point` per lookup; the merge can cache the
20//! result across LCP calls so the cost amortises to ~one binary
21//! search per output record.
22//!
23//! See `bench/README.md` "Approach 3 — segmented LCP" for the design
24//! rationale and the comparison against the `[u8; 3]` (24-bit-text)
25//! alternative.
26
27/// Per-suffix length provider. The merge and cascade-merge code use
28/// `lp.lim_at(p)` instead of `text.len() - p`; the LCP function itself
29/// is unchanged (the merge passes the appropriately-capped
30/// `max_ctx` to the existing SIMD path).
31///
32/// Implementations must be `Sync` so the rayon-parallel sort can
33/// share one provider across worker threads.
34pub trait LimitProvider: Sync {
35    /// Logical length of the suffix starting at position `p` in
36    /// symbols — i.e. the number of comparable symbols before the
37    /// next segment boundary or end-of-text. Must be at most
38    /// `text.len() - p`.
39    fn lim_at(&self, p: usize) -> usize;
40
41    /// Order to resolve when one or both suffixes hit their boundary
42    /// before any byte of their shared prefix differs. The default
43    /// is `lim_a.cmp(&lim_b)` — "shorter-suffix-is-smaller", the
44    /// standard generalised-SA / multi-string-SA convention, what a
45    /// `Vec<&str>` sort with `&str` ordering produces.
46    ///
47    /// Custom impls can override for different boundary conventions.
48    /// The motivating example is STAR's `spacer-as-largest` ordering:
49    /// the suffix that hits a spacer first is *larger*, equivalently
50    /// the longer-`lim` one is smaller, with an ascending-position
51    /// tie-break when both `lim`s coincide:
52    ///
53    /// ```ignore
54    /// fn boundary_order(&self, p_a: usize, lim_a: usize,
55    ///                   p_b: usize, lim_b: usize) -> Ordering {
56    ///     lim_b.cmp(&lim_a).then(p_a.cmp(&p_b))
57    /// }
58    /// ```
59    ///
60    /// `p_a` / `p_b` are the suffix start positions in the same
61    /// coordinate space the merge sees (the spacer-free text's
62    /// coordinates when invoked through `*_with` entries on a
63    /// rustar-aligner-style spacer-free text). The default impl
64    /// ignores them; impls that want a position tie-break use them.
65    #[inline]
66    fn boundary_order(
67        &self,
68        p_a: usize,
69        lim_a: usize,
70        p_b: usize,
71        lim_b: usize,
72    ) -> std::cmp::Ordering {
73        let _ = (p_a, p_b);
74        lim_a.cmp(&lim_b)
75    }
76}
77
78/// Default provider for non-segmented texts: `lim_at(p) = n - p`.
79/// Stored as a single `usize`; the `#[inline(always)]` `lim_at`
80/// folds at monomorphization time into the same `n - p` the merge
81/// used before this abstraction existed, so non-segmented callers
82/// pay zero overhead.
83#[derive(Copy, Clone, Debug)]
84pub struct PlainText {
85    /// Total text length in symbols.
86    pub n: usize,
87}
88
89impl PlainText {
90    /// New `PlainText` for a text of `n` symbols.
91    #[inline]
92    pub fn new(n: usize) -> Self {
93        Self { n }
94    }
95}
96
97impl LimitProvider for PlainText {
98    #[inline(always)]
99    fn lim_at(&self, p: usize) -> usize {
100        self.n - p
101    }
102}
103
104/// Provider for texts partitioned into segments at known cumulative
105/// end positions. `lim_at(p)` binary-searches the sorted ends list
106/// and returns the distance from `p` to the next boundary.
107///
108/// Storage cost is `8 × n_segments` bytes (the cumulative-ends
109/// `Vec<u64>`). For a 50 K-junction SA index on a 6 GB genome that
110/// is **400 KB total** — vs the 750 MB a packed bitmap would need,
111/// and the 6 GB an extra-byte-per-symbol u16 text would need.
112///
113/// Lookup is `O(log n_segments)` — a few cycles for typical
114/// segment counts. The merge can cache `lim_p`/`lim_q` across LCP
115/// calls so the cost amortises to ~one binary search per output
116/// record.
117///
118/// Two constructors:
119/// - [`from_lengths`][Self::from_lengths] takes per-segment lengths
120///   and builds the cumulative-ends list internally. Most ergonomic
121///   when the caller has `[chr_len_0, chr_len_1, …]` already.
122/// - [`from_ends`][Self::from_ends] takes the sorted cumulative
123///   ends directly. Useful when the caller already has them — e.g.
124///   STAR's `chr_start[]` table.
125///
126/// Both constructors require the segments to cover the whole text
127/// (`sum(lengths) == text_len`, or `ends.last() == Some(text_len)`).
128#[derive(Clone, Debug)]
129pub struct SegmentedText {
130    n: usize,
131    /// Sorted, strictly-increasing cumulative end positions. After
132    /// segment 0 of length 100 ends at index 100, `ends[0] = 100`.
133    /// After segment 1 of length 50 (positions 100..150),
134    /// `ends[1] = 150`. The last entry equals the total text length.
135    ends: Vec<u64>,
136}
137
138impl SegmentedText {
139    /// Build from per-segment lengths. The sum must equal `text_len`.
140    pub fn from_lengths(text_len: usize, lengths: &[usize]) -> Self {
141        let mut ends = Vec::with_capacity(lengths.len());
142        let mut cum: u64 = 0;
143        for &len in lengths {
144            cum += len as u64;
145            ends.push(cum);
146        }
147        assert_eq!(
148            cum as usize, text_len,
149            "SegmentedText::from_lengths: per-segment lengths sum to {cum} but text_len is {text_len}",
150        );
151        Self { n: text_len, ends }
152    }
153
154    /// Build from sorted, strictly-increasing cumulative end positions.
155    /// `ends.last()` must equal `text_len`.
156    pub fn from_ends(text_len: usize, ends: Vec<u64>) -> Self {
157        assert!(
158            ends.windows(2).all(|w| w[0] < w[1]),
159            "SegmentedText::from_ends: ends must be strictly increasing",
160        );
161        match ends.last() {
162            Some(&last) => assert_eq!(
163                last as usize, text_len,
164                "SegmentedText::from_ends: last end ({last}) != text_len ({text_len})",
165            ),
166            None => assert_eq!(
167                text_len, 0,
168                "SegmentedText::from_ends: empty ends but text_len ({text_len}) != 0",
169            ),
170        }
171        Self { n: text_len, ends }
172    }
173
174    /// Total text length in symbols.
175    #[inline]
176    pub fn text_len(&self) -> usize {
177        self.n
178    }
179
180    /// Number of segments.
181    #[inline]
182    pub fn n_segments(&self) -> usize {
183        self.ends.len()
184    }
185
186    /// Cumulative end positions, sorted, strictly increasing.
187    /// `ends()[i]` is the position one past the last symbol of
188    /// segment `i`.
189    #[inline]
190    pub fn ends(&self) -> &[u64] {
191        &self.ends
192    }
193}
194
195impl LimitProvider for SegmentedText {
196    #[inline]
197    fn lim_at(&self, p: usize) -> usize {
198        // First boundary strictly greater than p.
199        let i = self.ends.partition_point(|&b| b <= p as u64);
200        if i < self.ends.len() {
201            self.ends[i] as usize - p
202        } else {
203            // p past the last boundary: just text-end.
204            self.n - p
205        }
206    }
207}
208
209#[cfg(test)]
210mod tests {
211    use super::*;
212
213    #[test]
214    fn plain_text_lim_at_matches_n_minus_p() {
215        let lp = PlainText::new(100);
216        assert_eq!(lp.lim_at(0), 100);
217        assert_eq!(lp.lim_at(50), 50);
218        assert_eq!(lp.lim_at(99), 1);
219        assert_eq!(lp.lim_at(100), 0);
220    }
221
222    #[test]
223    fn segmented_from_lengths_cumulates_ends() {
224        let lp = SegmentedText::from_lengths(15, &[3, 5, 7]);
225        assert_eq!(lp.n_segments(), 3);
226        assert_eq!(lp.ends(), &[3, 8, 15]);
227    }
228
229    #[test]
230    #[should_panic(expected = "sum to")]
231    fn segmented_from_lengths_rejects_undercoverage() {
232        let _ = SegmentedText::from_lengths(20, &[3, 5, 7]);
233    }
234
235    #[test]
236    fn segmented_lim_at_caps_at_next_boundary() {
237        let lp = SegmentedText::from_lengths(15, &[3, 5, 7]);
238        // Segment 0 = [0, 3): boundary at 3.
239        assert_eq!(lp.lim_at(0), 3);
240        assert_eq!(lp.lim_at(1), 2);
241        assert_eq!(lp.lim_at(2), 1);
242        // Segment 1 = [3, 8): boundary at 8.
243        assert_eq!(lp.lim_at(3), 5);
244        assert_eq!(lp.lim_at(5), 3);
245        assert_eq!(lp.lim_at(7), 1);
246        // Segment 2 = [8, 15): boundary at 15.
247        assert_eq!(lp.lim_at(8), 7);
248        assert_eq!(lp.lim_at(14), 1);
249        assert_eq!(lp.lim_at(15), 0);
250    }
251
252    #[test]
253    fn segmented_handles_single_segment_text() {
254        let lp = SegmentedText::from_lengths(10, &[10]);
255        assert_eq!(lp.lim_at(0), 10);
256        assert_eq!(lp.lim_at(5), 5);
257        assert_eq!(lp.lim_at(10), 0);
258    }
259
260    #[test]
261    fn segmented_handles_empty_text() {
262        let lp = SegmentedText::from_lengths(0, &[]);
263        assert_eq!(lp.n_segments(), 0);
264        // No suffixes to query, but the constructor accepts it.
265    }
266
267    #[test]
268    fn segmented_from_ends_matches_from_lengths() {
269        let a = SegmentedText::from_lengths(15, &[3, 5, 7]);
270        let b = SegmentedText::from_ends(15, vec![3, 8, 15]);
271        assert_eq!(a.ends(), b.ends());
272        for p in 0..=15 {
273            assert_eq!(a.lim_at(p), b.lim_at(p), "p={p}");
274        }
275    }
276}