caps_sa/limits.rs
1//! Per-suffix length providers for segmented suffix-array construction.
2//!
3//! In the standard SA construction the "natural length" of the suffix
4//! starting at position `p` is `text.len() - p`. For *segmented* texts
5//! (multi-string SAs, splice-junction indexes, etc.) we want LCP
6//! comparisons to stop at the next segment boundary instead — the
7//! suffix logically ends there, and the merge resolves cross-segment
8//! ordering by "shorter-suffix-is-smaller" (the standard generalised-SA
9//! convention).
10//!
11//! The [`LimitProvider`] trait abstracts the per-suffix length lookup
12//! and is plumbed through every site in `merge` / `cascade_merge` /
13//! `suffix_cmp` that previously computed `n - p` inline.
14//! [`PlainText`] is the zero-cost default — its `lim_at` is
15//! `#[inline(always)]` and folds to the same `n - p` expression the
16//! current code emits, so the non-segmented path generates **bit-
17//! identical assembly** to today's after monomorphization.
18//! [`SegmentedText`] holds a sorted cumulative-ends `Vec<u64>` and
19//! does a `partition_point` per lookup; the merge can cache the
20//! result across LCP calls so the cost amortises to ~one binary
21//! search per output record.
22//!
23//! See `bench/README.md` "Approach 3 — segmented LCP" for the design
24//! rationale and the comparison against the `[u8; 3]` (24-bit-text)
25//! alternative.
26
27/// Per-suffix length provider. The merge and cascade-merge code use
28/// `lp.lim_at(p)` instead of `text.len() - p`; the LCP function itself
29/// is unchanged (the merge passes the appropriately-capped
30/// `max_ctx` to the existing SIMD path).
31///
32/// Implementations must be `Sync` so the rayon-parallel sort can
33/// share one provider across worker threads.
34pub trait LimitProvider: Sync {
35 /// Logical length of the suffix starting at position `p` in
36 /// symbols — i.e. the number of comparable symbols before the
37 /// next segment boundary or end-of-text. Must be at most
38 /// `text.len() - p`.
39 fn lim_at(&self, p: usize) -> usize;
40
41 /// Order to resolve when one or both suffixes hit their boundary
42 /// before any byte of their shared prefix differs. The default
43 /// is `lim_a.cmp(&lim_b)` — "shorter-suffix-is-smaller", the
44 /// standard generalised-SA / multi-string-SA convention, what a
45 /// `Vec<&str>` sort with `&str` ordering produces.
46 ///
47 /// Custom impls can override for different boundary conventions.
48 /// The motivating example is STAR's `spacer-as-largest` ordering:
49 /// the suffix that hits a spacer first is *larger*, equivalently
50 /// the longer-`lim` one is smaller, with an ascending-position
51 /// tie-break when both `lim`s coincide:
52 ///
53 /// ```ignore
54 /// fn boundary_order(&self, p_a: usize, lim_a: usize,
55 /// p_b: usize, lim_b: usize) -> Ordering {
56 /// lim_b.cmp(&lim_a).then(p_a.cmp(&p_b))
57 /// }
58 /// ```
59 ///
60 /// `p_a` / `p_b` are the suffix start positions in the same
61 /// coordinate space the merge sees (the spacer-free text's
62 /// coordinates when invoked through `*_with` entries on a
63 /// rustar-aligner-style spacer-free text). The default impl
64 /// ignores them; impls that want a position tie-break use them.
65 #[inline]
66 fn boundary_order(
67 &self,
68 p_a: usize,
69 lim_a: usize,
70 p_b: usize,
71 lim_b: usize,
72 ) -> std::cmp::Ordering {
73 let _ = (p_a, p_b);
74 lim_a.cmp(&lim_b)
75 }
76}
77
78/// Default provider for non-segmented texts: `lim_at(p) = n - p`.
79/// Stored as a single `usize`; the `#[inline(always)]` `lim_at`
80/// folds at monomorphization time into the same `n - p` the merge
81/// used before this abstraction existed, so non-segmented callers
82/// pay zero overhead.
83#[derive(Copy, Clone, Debug)]
84pub struct PlainText {
85 /// Total text length in symbols.
86 pub n: usize,
87}
88
89impl PlainText {
90 /// New `PlainText` for a text of `n` symbols.
91 #[inline]
92 pub fn new(n: usize) -> Self {
93 Self { n }
94 }
95}
96
97impl LimitProvider for PlainText {
98 #[inline(always)]
99 fn lim_at(&self, p: usize) -> usize {
100 self.n - p
101 }
102}
103
104/// Provider for texts partitioned into segments at known cumulative
105/// end positions. `lim_at(p)` binary-searches the sorted ends list
106/// and returns the distance from `p` to the next boundary.
107///
108/// Storage cost is `8 × n_segments` bytes (the cumulative-ends
109/// `Vec<u64>`). For a 50 K-junction SA index on a 6 GB genome that
110/// is **400 KB total** — vs the 750 MB a packed bitmap would need,
111/// and the 6 GB an extra-byte-per-symbol u16 text would need.
112///
113/// Lookup is `O(log n_segments)` — a few cycles for typical
114/// segment counts. The merge can cache `lim_p`/`lim_q` across LCP
115/// calls so the cost amortises to ~one binary search per output
116/// record.
117///
118/// Two constructors:
119/// - [`from_lengths`][Self::from_lengths] takes per-segment lengths
120/// and builds the cumulative-ends list internally. Most ergonomic
121/// when the caller has `[chr_len_0, chr_len_1, …]` already.
122/// - [`from_ends`][Self::from_ends] takes the sorted cumulative
123/// ends directly. Useful when the caller already has them — e.g.
124/// STAR's `chr_start[]` table.
125///
126/// Both constructors require the segments to cover the whole text
127/// (`sum(lengths) == text_len`, or `ends.last() == Some(text_len)`).
128#[derive(Clone, Debug)]
129pub struct SegmentedText {
130 n: usize,
131 /// Sorted, strictly-increasing cumulative end positions. After
132 /// segment 0 of length 100 ends at index 100, `ends[0] = 100`.
133 /// After segment 1 of length 50 (positions 100..150),
134 /// `ends[1] = 150`. The last entry equals the total text length.
135 ends: Vec<u64>,
136}
137
138impl SegmentedText {
139 /// Build from per-segment lengths. The sum must equal `text_len`.
140 pub fn from_lengths(text_len: usize, lengths: &[usize]) -> Self {
141 let mut ends = Vec::with_capacity(lengths.len());
142 let mut cum: u64 = 0;
143 for &len in lengths {
144 cum += len as u64;
145 ends.push(cum);
146 }
147 assert_eq!(
148 cum as usize, text_len,
149 "SegmentedText::from_lengths: per-segment lengths sum to {cum} but text_len is {text_len}",
150 );
151 Self { n: text_len, ends }
152 }
153
154 /// Build from sorted, strictly-increasing cumulative end positions.
155 /// `ends.last()` must equal `text_len`.
156 pub fn from_ends(text_len: usize, ends: Vec<u64>) -> Self {
157 assert!(
158 ends.windows(2).all(|w| w[0] < w[1]),
159 "SegmentedText::from_ends: ends must be strictly increasing",
160 );
161 match ends.last() {
162 Some(&last) => assert_eq!(
163 last as usize, text_len,
164 "SegmentedText::from_ends: last end ({last}) != text_len ({text_len})",
165 ),
166 None => assert_eq!(
167 text_len, 0,
168 "SegmentedText::from_ends: empty ends but text_len ({text_len}) != 0",
169 ),
170 }
171 Self { n: text_len, ends }
172 }
173
174 /// Total text length in symbols.
175 #[inline]
176 pub fn text_len(&self) -> usize {
177 self.n
178 }
179
180 /// Number of segments.
181 #[inline]
182 pub fn n_segments(&self) -> usize {
183 self.ends.len()
184 }
185
186 /// Cumulative end positions, sorted, strictly increasing.
187 /// `ends()[i]` is the position one past the last symbol of
188 /// segment `i`.
189 #[inline]
190 pub fn ends(&self) -> &[u64] {
191 &self.ends
192 }
193}
194
195impl LimitProvider for SegmentedText {
196 #[inline]
197 fn lim_at(&self, p: usize) -> usize {
198 // First boundary strictly greater than p.
199 let i = self.ends.partition_point(|&b| b <= p as u64);
200 if i < self.ends.len() {
201 self.ends[i] as usize - p
202 } else {
203 // p past the last boundary: just text-end.
204 self.n - p
205 }
206 }
207}
208
209#[cfg(test)]
210mod tests {
211 use super::*;
212
213 #[test]
214 fn plain_text_lim_at_matches_n_minus_p() {
215 let lp = PlainText::new(100);
216 assert_eq!(lp.lim_at(0), 100);
217 assert_eq!(lp.lim_at(50), 50);
218 assert_eq!(lp.lim_at(99), 1);
219 assert_eq!(lp.lim_at(100), 0);
220 }
221
222 #[test]
223 fn segmented_from_lengths_cumulates_ends() {
224 let lp = SegmentedText::from_lengths(15, &[3, 5, 7]);
225 assert_eq!(lp.n_segments(), 3);
226 assert_eq!(lp.ends(), &[3, 8, 15]);
227 }
228
229 #[test]
230 #[should_panic(expected = "sum to")]
231 fn segmented_from_lengths_rejects_undercoverage() {
232 let _ = SegmentedText::from_lengths(20, &[3, 5, 7]);
233 }
234
235 #[test]
236 fn segmented_lim_at_caps_at_next_boundary() {
237 let lp = SegmentedText::from_lengths(15, &[3, 5, 7]);
238 // Segment 0 = [0, 3): boundary at 3.
239 assert_eq!(lp.lim_at(0), 3);
240 assert_eq!(lp.lim_at(1), 2);
241 assert_eq!(lp.lim_at(2), 1);
242 // Segment 1 = [3, 8): boundary at 8.
243 assert_eq!(lp.lim_at(3), 5);
244 assert_eq!(lp.lim_at(5), 3);
245 assert_eq!(lp.lim_at(7), 1);
246 // Segment 2 = [8, 15): boundary at 15.
247 assert_eq!(lp.lim_at(8), 7);
248 assert_eq!(lp.lim_at(14), 1);
249 assert_eq!(lp.lim_at(15), 0);
250 }
251
252 #[test]
253 fn segmented_handles_single_segment_text() {
254 let lp = SegmentedText::from_lengths(10, &[10]);
255 assert_eq!(lp.lim_at(0), 10);
256 assert_eq!(lp.lim_at(5), 5);
257 assert_eq!(lp.lim_at(10), 0);
258 }
259
260 #[test]
261 fn segmented_handles_empty_text() {
262 let lp = SegmentedText::from_lengths(0, &[]);
263 assert_eq!(lp.n_segments(), 0);
264 // No suffixes to query, but the constructor accepts it.
265 }
266
267 #[test]
268 fn segmented_from_ends_matches_from_lengths() {
269 let a = SegmentedText::from_lengths(15, &[3, 5, 7]);
270 let b = SegmentedText::from_ends(15, vec![3, 8, 15]);
271 assert_eq!(a.ends(), b.ends());
272 for p in 0..=15 {
273 assert_eq!(a.lim_at(p), b.lim_at(p), "p={p}");
274 }
275 }
276}