1use crate::Index;
34use crate::lcp::{LcpDispatch, Symbol};
35use crate::limits::{LimitProvider, PlainText};
36use rayon::join;
37
38#[derive(Clone, Debug)]
40pub struct Opts {
41 pub max_context: usize,
46}
47
48impl Default for Opts {
49 fn default() -> Self {
50 Self {
51 max_context: usize::MAX,
52 }
53 }
54}
55
56pub fn build_in_memory<S, I>(text: &[S]) -> Vec<I>
67where
68 S: Symbol,
69 I: Index,
70{
71 build_in_memory_with_opts(text, &Opts::default())
72}
73
74pub fn build_in_memory_with_opts<S, I>(text: &[S], opts: &Opts) -> Vec<I>
76where
77 S: Symbol,
78 I: Index,
79{
80 build_in_memory_with(text, &PlainText::new(text.len()), opts)
81}
82
83pub fn build_in_memory_with<S, I, L>(text: &[S], lp: &L, opts: &Opts) -> Vec<I>
88where
89 S: Symbol,
90 I: Index,
91 L: LimitProvider,
92{
93 let n = text.len();
94 let positions: Vec<I> = (0..n).map(I::from_usize).collect();
95 build_in_memory_for_positions_with(text, positions, lp, opts)
96}
97
98pub fn build_in_memory_for_positions<S, I>(text: &[S], positions: Vec<I>) -> Vec<I>
113where
114 S: Symbol,
115 I: Index,
116{
117 build_in_memory_for_positions_with_opts(text, positions, &Opts::default())
118}
119
120pub fn build_in_memory_for_positions_with_opts<S, I>(
122 text: &[S],
123 positions: Vec<I>,
124 opts: &Opts,
125) -> Vec<I>
126where
127 S: Symbol,
128 I: Index,
129{
130 build_in_memory_for_positions_with(text, positions, &PlainText::new(text.len()), opts)
131}
132
133pub fn build_in_memory_for_positions_with<S, I, L>(
138 text: &[S],
139 positions: Vec<I>,
140 lp: &L,
141 opts: &Opts,
142) -> Vec<I>
143where
144 S: Symbol,
145 I: Index,
146 L: LimitProvider,
147{
148 let n = positions.len();
149 if n == 0 {
150 return Vec::new();
151 }
152
153 let mut sa: Vec<I> = positions;
154 let mut sa_w: Vec<I> = vec![I::zero(); n];
155 let mut lcp_arr: Vec<I> = vec![I::zero(); n];
156 let mut lcp_w: Vec<I> = vec![I::zero(); n];
157
158 let dispatch = LcpDispatch::detect();
162
163 merge_sort(
164 text,
165 lp,
166 &mut sa,
167 &mut sa_w,
168 &mut lcp_arr,
169 &mut lcp_w,
170 opts.max_context,
171 dispatch,
172 );
173
174 sa
175}
176
177#[allow(clippy::too_many_arguments)] pub(crate) fn merge_sort<S, I, L>(
191 text: &[S],
192 lp: &L,
193 sa: &mut [I],
194 sa_w: &mut [I],
195 lcp_arr: &mut [I],
196 lcp_w: &mut [I],
197 max_ctx: usize,
198 dispatch: LcpDispatch,
199) where
200 S: Symbol,
201 I: Index,
202 L: LimitProvider,
203{
204 let n = sa.len();
205 debug_assert_eq!(sa_w.len(), n);
206 debug_assert_eq!(lcp_arr.len(), n);
207 debug_assert_eq!(lcp_w.len(), n);
208
209 if n <= 1 {
210 if n == 1 {
211 lcp_arr[0] = I::zero();
212 }
213 return;
214 }
215
216 let mid = n / 2;
217 let (sa_l, sa_r) = sa.split_at_mut(mid);
218 let (sa_w_l, sa_w_r) = sa_w.split_at_mut(mid);
219 let (lcp_l, lcp_r) = lcp_arr.split_at_mut(mid);
220 let (lcp_w_l, lcp_w_r) = lcp_w.split_at_mut(mid);
221
222 join(
223 || merge_sort(text, lp, sa_l, sa_w_l, lcp_l, lcp_w_l, max_ctx, dispatch),
224 || merge_sort(text, lp, sa_r, sa_w_r, lcp_r, lcp_w_r, max_ctx, dispatch),
225 );
226
227 merge(
231 text, lp, sa_l, sa_r, lcp_l, lcp_r, sa_w, lcp_w, max_ctx, dispatch,
232 );
233 sa.copy_from_slice(sa_w);
234 lcp_arr.copy_from_slice(lcp_w);
235}
236
237#[allow(clippy::too_many_arguments)] pub(crate) fn merge<S, I, L>(
247 text: &[S],
248 lp: &L,
249 x: &[I],
250 y: &[I],
251 lcp_x: &[I],
252 lcp_y: &[I],
253 z: &mut [I],
254 lcp_z: &mut [I],
255 max_ctx: usize,
256 dispatch: LcpDispatch,
257) where
258 S: Symbol,
259 I: Index,
260 L: LimitProvider,
261{
262 let len_x = x.len();
263 let len_y = y.len();
264 debug_assert_eq!(z.len(), len_x + len_y);
265 debug_assert_eq!(lcp_z.len(), len_x + len_y);
266
267 if len_x == 0 {
268 z.copy_from_slice(y);
269 lcp_z.copy_from_slice(lcp_y);
270 return;
271 }
272 if len_y == 0 {
273 z.copy_from_slice(x);
274 lcp_z.copy_from_slice(lcp_x);
275 return;
276 }
277
278 let mut arr_a: &[I] = x;
285 let mut arr_b: &[I] = y;
286 let mut lcp_a: &[I] = lcp_x;
287 let mut lcp_b: &[I] = lcp_y;
288 let mut len_a = len_x;
289 let mut len_b = len_y;
290 let mut i_a: usize = 0;
291 let mut i_b: usize = 0;
292 let mut m: usize = 0;
293 let mut k: usize = 0;
294
295 while i_a < len_a && i_b < len_b {
296 let l_a = lcp_a[i_a].to_usize();
297
298 let (output_a, lcp_for_output, new_m) = if l_a > m {
300 (true, l_a, m)
301 } else if l_a < m {
302 (false, m, l_a)
303 } else {
304 let p_a = arr_a[i_a].to_usize();
306 let p_b = arr_b[i_b].to_usize();
307 let lim_a = lp.lim_at(p_a);
313 let lim_b = lp.lim_at(p_b);
314 let cap = lim_a.min(lim_b).min(max_ctx);
319 let remaining_ctx = cap.saturating_sub(m);
320 let ext = dispatch.lcp(text, p_a + m, p_b + m, remaining_ctx);
321 let total = m + ext;
322 let a_smaller = if total < lim_a && total < lim_b {
323 text[p_a + total] < text[p_b + total]
324 } else {
325 lp.boundary_order(p_a, lim_a, p_b, lim_b).is_lt()
333 };
334 (a_smaller, m, total)
335 };
336
337 if output_a {
338 z[k] = arr_a[i_a];
339 lcp_z[k] = I::from_usize(lcp_for_output);
340 i_a += 1;
341 } else {
342 z[k] = arr_b[i_b];
343 lcp_z[k] = I::from_usize(lcp_for_output);
344 i_b += 1;
345 std::mem::swap(&mut arr_a, &mut arr_b);
348 std::mem::swap(&mut lcp_a, &mut lcp_b);
349 std::mem::swap(&mut len_a, &mut len_b);
350 std::mem::swap(&mut i_a, &mut i_b);
351 }
352 m = new_m;
353 k += 1;
354 }
355
356 drain(arr_a, lcp_a, i_a, len_a, z, lcp_z, &mut k, m);
360 drain(arr_b, lcp_b, i_b, len_b, z, lcp_z, &mut k, m);
361}
362
363#[inline]
364#[allow(clippy::too_many_arguments)] fn drain<I: Index>(
366 arr: &[I],
367 lcp_src: &[I],
368 mut i: usize,
369 len: usize,
370 z: &mut [I],
371 lcp_z: &mut [I],
372 k: &mut usize,
373 boundary_m: usize,
374) {
375 let mut first = true;
376 while i < len {
377 z[*k] = arr[i];
378 lcp_z[*k] = if first {
379 I::from_usize(boundary_m)
380 } else {
381 lcp_src[i]
382 };
383 first = false;
384 i += 1;
385 *k += 1;
386 }
387}
388
389#[cfg(test)]
390mod tests {
391 use super::*;
392
393 fn brute_force_sa(text: &[u8]) -> Vec<u32> {
395 let mut sa: Vec<u32> = (0..text.len() as u32).collect();
396 sa.sort_by(|&a, &b| text[a as usize..].cmp(&text[b as usize..]));
397 sa
398 }
399
400 fn assert_matches_brute(text: &[u8]) {
401 let got: Vec<u32> = build_in_memory(text);
402 let want = brute_force_sa(text);
403 assert_eq!(got, want, "mismatch on text {text:?}");
404 }
405
406 #[test]
407 fn empty_text() {
408 let sa: Vec<u32> = build_in_memory::<u8, u32>(&[]);
409 assert!(sa.is_empty());
410 }
411
412 #[test]
413 fn single_symbol() {
414 let sa: Vec<u32> = build_in_memory(&[7u8]);
415 assert_eq!(sa, vec![0]);
416 }
417
418 #[test]
419 fn banana() {
420 assert_matches_brute(b"banana");
421 }
422
423 #[test]
424 fn mississippi() {
425 assert_matches_brute(b"mississippi");
426 }
427
428 #[test]
429 fn small_distinct_sentinel() {
430 let text: Vec<u8> = vec![0, 1, 2, 0, 1, 5, 0, 2, 1, 6];
433 let got: Vec<u32> = build_in_memory(&text);
434 let want = brute_force_sa(&text);
435 assert_eq!(got, want);
436 }
437
438 #[test]
439 fn random_byte_texts() {
440 use rand::{RngExt, SeedableRng};
441 let mut rng = rand::rngs::StdRng::seed_from_u64(0xC0FFEE);
442 for &n in &[1usize, 2, 3, 7, 33, 200, 1000] {
443 let text: Vec<u8> = (0..n).map(|_| rng.random_range(0..6u8)).collect();
444 let got: Vec<u32> = build_in_memory(&text);
445 let want = brute_force_sa(&text);
446 assert_eq!(got, want, "mismatch on random text len={n}");
447 }
448 }
449
450 #[test]
451 fn for_positions_full_set_matches_build_in_memory() {
452 let text = b"banana";
454 let want: Vec<u32> = build_in_memory(text);
455 let positions: Vec<u32> = (0..text.len() as u32).collect();
456 let got = build_in_memory_for_positions(text, positions);
457 assert_eq!(got, want);
458 }
459
460 #[test]
461 fn for_positions_subset_matches_brute_force() {
462 let text = b"mississippi";
465 let positions: Vec<u32> = (0..text.len() as u32).step_by(2).collect();
466 let mut want = positions.clone();
467 want.sort_by(|&a, &b| text[a as usize..].cmp(&text[b as usize..]));
468 let got = build_in_memory_for_positions(text, positions);
469 assert_eq!(got, want);
470 }
471
472 #[test]
473 fn for_positions_random_subsets() {
474 use rand::{RngExt, SeedableRng};
475 let mut rng = rand::rngs::StdRng::seed_from_u64(0xFEED);
476 for &n in &[33usize, 200, 1000] {
477 let text: Vec<u8> = (0..n).map(|_| rng.random_range(0..6u8)).collect();
478 let mut positions: Vec<u32> = (0..n as u32).collect();
480 positions.retain(|_| rng.random_range(0..10) < 7);
482 let mut want = positions.clone();
483 want.sort_by(|&a, &b| text[a as usize..].cmp(&text[b as usize..]));
484 let got = build_in_memory_for_positions(&text, positions);
485 assert_eq!(got, want, "subset sort mismatch n={n}");
486 }
487 }
488
489 #[test]
490 fn random_with_unique_terminator() {
491 use rand::{RngExt, SeedableRng};
494 let mut rng = rand::rngs::StdRng::seed_from_u64(0xBEEF);
495 for &n in &[1usize, 50, 500] {
496 let mut text: Vec<u8> = (0..n).map(|_| rng.random_range(0..5u8)).collect();
497 text.push(250); let got: Vec<u32> = build_in_memory(&text);
499 let want = brute_force_sa(&text);
500 assert_eq!(got, want);
501 }
502 }
503
504 use crate::limits::SegmentedText;
507
508 fn segmented_cmp(text: &[u8], lp: &SegmentedText, a: usize, b: usize) -> std::cmp::Ordering {
511 use crate::limits::LimitProvider;
512 let lim_a = lp.lim_at(a);
513 let lim_b = lp.lim_at(b);
514 let lim = lim_a.min(lim_b);
515 for i in 0..lim {
516 if text[a + i] != text[b + i] {
517 return text[a + i].cmp(&text[b + i]);
518 }
519 }
520 lim_a.cmp(&lim_b)
521 }
522
523 fn assert_segmented_sa_valid(text: &[u8], lengths: &[usize], positions: &[u32], sa: &[u32]) {
533 let lp = SegmentedText::from_lengths(text.len(), lengths);
534 let mut expected = positions.to_vec();
535 expected.sort();
536 let mut got_sorted = sa.to_vec();
537 got_sorted.sort();
538 assert_eq!(got_sorted, expected, "sa is not a permutation of positions");
539 for w in sa.windows(2) {
540 let a = w[0] as usize;
541 let b = w[1] as usize;
542 let ord = segmented_cmp(text, &lp, a, b);
543 assert_ne!(
544 ord,
545 std::cmp::Ordering::Greater,
546 "out of order: pos {a} > pos {b} under segmented comparator",
547 );
548 }
549 }
550
551 #[test]
552 fn segmented_in_memory_matches_brute_force_small() {
553 let text: Vec<u8> = b"helloworldbananamississippi".to_vec();
555 let lengths = &[5usize, 5, 6, 11];
556 let lp = SegmentedText::from_lengths(text.len(), lengths);
557 let sa: Vec<u32> = build_in_memory_with(&text, &lp, &Opts::default());
558 let all_positions: Vec<u32> = (0..text.len() as u32).collect();
559 assert_segmented_sa_valid(&text, lengths, &all_positions, &sa);
560 }
561
562 #[test]
563 fn segmented_single_segment_equals_unsegmented() {
564 let text = b"mississippi";
568 let lp = SegmentedText::from_lengths(text.len(), &[text.len()]);
569 let got_segmented: Vec<u32> = build_in_memory_with(text, &lp, &Opts::default());
570 let got_plain: Vec<u32> = build_in_memory(text);
571 assert_eq!(got_segmented, got_plain);
572 }
573
574 #[test]
575 fn segmented_random_validity() {
576 use rand::{RngExt, SeedableRng};
577 let mut rng = rand::rngs::StdRng::seed_from_u64(0x5E6);
578 for _ in 0..20 {
579 let n_segments = rng.random_range(1..10usize);
580 let lengths: Vec<usize> = (0..n_segments)
581 .map(|_| rng.random_range(5..50usize))
582 .collect();
583 let n: usize = lengths.iter().sum();
584 let text: Vec<u8> = (0..n).map(|_| rng.random_range(0..3u8)).collect();
586 let lp = SegmentedText::from_lengths(n, &lengths);
587 let sa: Vec<u32> = build_in_memory_with(&text, &lp, &Opts::default());
588 let all_positions: Vec<u32> = (0..n as u32).collect();
589 assert_segmented_sa_valid(&text, &lengths, &all_positions, &sa);
590 }
591 }
592
593 #[test]
594 fn segmented_for_positions_subset_validity() {
595 let text: Vec<u8> = b"helloworldbananamississippi".to_vec();
597 let lengths = &[5usize, 5, 6, 11];
598 let positions: Vec<u32> = (0..text.len() as u32).step_by(2).collect();
599 let lp = SegmentedText::from_lengths(text.len(), lengths);
600 let sa =
601 build_in_memory_for_positions_with(&text, positions.clone(), &lp, &Opts::default());
602 assert_segmented_sa_valid(&text, lengths, &positions, &sa);
603 }
604
605 struct StarConvention {
614 inner: SegmentedText,
615 }
616
617 impl crate::limits::LimitProvider for StarConvention {
618 fn lim_at(&self, p: usize) -> usize {
619 self.inner.lim_at(p)
620 }
621 fn boundary_order(
622 &self,
623 p_a: usize,
624 lim_a: usize,
625 p_b: usize,
626 lim_b: usize,
627 ) -> std::cmp::Ordering {
628 lim_b.cmp(&lim_a).then(p_a.cmp(&p_b))
629 }
630 }
631
632 fn star_brute_force_sa(text: &[u8], lengths: &[usize]) -> Vec<u32> {
637 use crate::limits::LimitProvider;
638 let lp = SegmentedText::from_lengths(text.len(), lengths);
639 let mut sa: Vec<u32> = (0..text.len() as u32).collect();
640 sa.sort_by(|&a, &b| {
641 let pa = a as usize;
642 let pb = b as usize;
643 let lim_a = lp.lim_at(pa);
644 let lim_b = lp.lim_at(pb);
645 let lim = lim_a.min(lim_b);
646 for i in 0..lim {
647 if text[pa + i] != text[pb + i] {
648 return text[pa + i].cmp(&text[pb + i]);
649 }
650 }
651 lim_b.cmp(&lim_a).then(pa.cmp(&pb))
653 });
654 sa
655 }
656
657 #[test]
658 fn star_convention_matches_brute_force_small() {
659 let text: Vec<u8> = b"helloworldbananamississippi".to_vec();
661 let lengths = &[5usize, 5, 6, 11];
662 let lp = StarConvention {
663 inner: SegmentedText::from_lengths(text.len(), lengths),
664 };
665 let got: Vec<u32> = build_in_memory_with(&text, &lp, &Opts::default());
666 let want = star_brute_force_sa(&text, lengths);
667 assert_eq!(got, want, "STAR-convention SA mismatch");
668 }
669
670 #[test]
675 fn star_convention_within_segment_longer_first() {
676 let text = b"AAAA";
677 let lp = StarConvention {
678 inner: SegmentedText::from_lengths(text.len(), &[text.len()]),
679 };
680 let got: Vec<u32> = build_in_memory_with(text, &lp, &Opts::default());
681 assert_eq!(got, vec![0u32, 1, 2, 3]);
685 }
686
687 #[test]
688 fn star_convention_random_matches_brute_force() {
689 use rand::{RngExt, SeedableRng};
690 let mut rng = rand::rngs::StdRng::seed_from_u64(0xCAFE);
691 for _ in 0..20 {
692 let n_segments = rng.random_range(1..10usize);
693 let lengths: Vec<usize> = (0..n_segments)
694 .map(|_| rng.random_range(5..50usize))
695 .collect();
696 let n: usize = lengths.iter().sum();
697 let text: Vec<u8> = (0..n).map(|_| rng.random_range(0..3u8)).collect();
699 let lp = StarConvention {
700 inner: SegmentedText::from_lengths(n, &lengths),
701 };
702 let got: Vec<u32> = build_in_memory_with(&text, &lp, &Opts::default());
703 let want = star_brute_force_sa(&text, &lengths);
704 assert_eq!(
705 got, want,
706 "STAR-convention SA mismatch (lengths={lengths:?}, text={text:?})",
707 );
708 }
709 }
710}