1use crate::Index;
34use crate::lcp::{LcpDispatch, Symbol};
35use crate::limits::{LimitProvider, PlainText};
36use rayon::join;
37
38#[derive(Clone, Debug)]
40pub struct Opts {
41 pub max_context: usize,
46}
47
48impl Default for Opts {
49 fn default() -> Self {
50 Self {
51 max_context: usize::MAX,
52 }
53 }
54}
55
56pub fn build_in_memory<S, I>(text: &[S]) -> Vec<I>
67where
68 S: Symbol,
69 I: Index,
70{
71 build_in_memory_with_opts(text, &Opts::default())
72}
73
74pub fn build_in_memory_with_opts<S, I>(text: &[S], opts: &Opts) -> Vec<I>
76where
77 S: Symbol,
78 I: Index,
79{
80 build_in_memory_with(text, &PlainText::new(text.len()), opts)
81}
82
83pub fn build_in_memory_with<S, I, L>(text: &[S], lp: &L, opts: &Opts) -> Vec<I>
88where
89 S: Symbol,
90 I: Index,
91 L: LimitProvider,
92{
93 let n = text.len();
94 let positions: Vec<I> = (0..n).map(I::from_usize).collect();
95 build_in_memory_for_positions_with(text, positions, lp, opts)
96}
97
98pub fn build_in_memory_for_positions<S, I>(text: &[S], positions: Vec<I>) -> Vec<I>
113where
114 S: Symbol,
115 I: Index,
116{
117 build_in_memory_for_positions_with_opts(text, positions, &Opts::default())
118}
119
120pub fn build_in_memory_for_positions_with_opts<S, I>(
122 text: &[S],
123 positions: Vec<I>,
124 opts: &Opts,
125) -> Vec<I>
126where
127 S: Symbol,
128 I: Index,
129{
130 build_in_memory_for_positions_with(text, positions, &PlainText::new(text.len()), opts)
131}
132
133pub fn build_in_memory_for_positions_with<S, I, L>(
138 text: &[S],
139 positions: Vec<I>,
140 lp: &L,
141 opts: &Opts,
142) -> Vec<I>
143where
144 S: Symbol,
145 I: Index,
146 L: LimitProvider,
147{
148 let n = positions.len();
149 if n == 0 {
150 return Vec::new();
151 }
152
153 let mut sa: Vec<I> = positions;
154 let mut sa_w: Vec<I> = vec![I::zero(); n];
155 let mut lcp_arr: Vec<I> = vec![I::zero(); n];
156 let mut lcp_w: Vec<I> = vec![I::zero(); n];
157
158 let dispatch = LcpDispatch::detect();
162
163 merge_sort(
164 text,
165 lp,
166 &mut sa,
167 &mut sa_w,
168 &mut lcp_arr,
169 &mut lcp_w,
170 opts.max_context,
171 dispatch,
172 );
173
174 sa
175}
176
177#[allow(clippy::too_many_arguments)] pub(crate) fn merge_sort<S, I, L>(
191 text: &[S],
192 lp: &L,
193 sa: &mut [I],
194 sa_w: &mut [I],
195 lcp_arr: &mut [I],
196 lcp_w: &mut [I],
197 max_ctx: usize,
198 dispatch: LcpDispatch,
199) where
200 S: Symbol,
201 I: Index,
202 L: LimitProvider,
203{
204 let n = sa.len();
205 debug_assert_eq!(sa_w.len(), n);
206 debug_assert_eq!(lcp_arr.len(), n);
207 debug_assert_eq!(lcp_w.len(), n);
208
209 if n <= 1 {
210 if n == 1 {
211 lcp_arr[0] = I::zero();
212 }
213 return;
214 }
215
216 let mid = n / 2;
217 let (sa_l, sa_r) = sa.split_at_mut(mid);
218 let (sa_w_l, sa_w_r) = sa_w.split_at_mut(mid);
219 let (lcp_l, lcp_r) = lcp_arr.split_at_mut(mid);
220 let (lcp_w_l, lcp_w_r) = lcp_w.split_at_mut(mid);
221
222 join(
223 || merge_sort(text, lp, sa_l, sa_w_l, lcp_l, lcp_w_l, max_ctx, dispatch),
224 || merge_sort(text, lp, sa_r, sa_w_r, lcp_r, lcp_w_r, max_ctx, dispatch),
225 );
226
227 merge(
231 text, lp, sa_l, sa_r, lcp_l, lcp_r, sa_w, lcp_w, max_ctx, dispatch,
232 );
233 sa.copy_from_slice(sa_w);
234 lcp_arr.copy_from_slice(lcp_w);
235}
236
237#[allow(clippy::too_many_arguments)] pub(crate) fn merge<S, I, L>(
247 text: &[S],
248 lp: &L,
249 x: &[I],
250 y: &[I],
251 lcp_x: &[I],
252 lcp_y: &[I],
253 z: &mut [I],
254 lcp_z: &mut [I],
255 max_ctx: usize,
256 dispatch: LcpDispatch,
257) where
258 S: Symbol,
259 I: Index,
260 L: LimitProvider,
261{
262 let len_x = x.len();
263 let len_y = y.len();
264 debug_assert_eq!(z.len(), len_x + len_y);
265 debug_assert_eq!(lcp_z.len(), len_x + len_y);
266
267 if len_x == 0 {
268 z.copy_from_slice(y);
269 lcp_z.copy_from_slice(lcp_y);
270 return;
271 }
272 if len_y == 0 {
273 z.copy_from_slice(x);
274 lcp_z.copy_from_slice(lcp_x);
275 return;
276 }
277
278 let mut arr_a: &[I] = x;
285 let mut arr_b: &[I] = y;
286 let mut lcp_a: &[I] = lcp_x;
287 let mut lcp_b: &[I] = lcp_y;
288 let mut len_a = len_x;
289 let mut len_b = len_y;
290 let mut i_a: usize = 0;
291 let mut i_b: usize = 0;
292 let mut m: usize = 0;
293 let mut k: usize = 0;
294 let mut lim_a_cache: Option<(usize, usize)> = None;
295 let mut lim_b_cache: Option<(usize, usize)> = None;
296
297 while i_a < len_a && i_b < len_b {
298 let l_a = lcp_a[i_a].to_usize();
299
300 let (output_a, lcp_for_output, new_m) = if l_a > m {
302 (true, l_a, m)
303 } else if l_a < m {
304 (false, m, l_a)
305 } else {
306 let p_a = arr_a[i_a].to_usize();
308 let p_b = arr_b[i_b].to_usize();
309 let lim_a = match lim_a_cache {
315 Some((idx, lim)) if idx == i_a => lim,
316 _ => {
317 let lim = lp.lim_at(p_a);
318 lim_a_cache = Some((i_a, lim));
319 lim
320 }
321 };
322 let lim_b = match lim_b_cache {
323 Some((idx, lim)) if idx == i_b => lim,
324 _ => {
325 let lim = lp.lim_at(p_b);
326 lim_b_cache = Some((i_b, lim));
327 lim
328 }
329 };
330 let cap = lim_a.min(lim_b).min(max_ctx);
335 let remaining_ctx = cap.saturating_sub(m);
336 let ext = dispatch.lcp(text, p_a + m, p_b + m, remaining_ctx);
337 let total = m + ext;
338 let a_smaller = if total < lim_a && total < lim_b {
339 text[p_a + total] < text[p_b + total]
340 } else {
341 lp.boundary_order(p_a, lim_a, p_b, lim_b).is_lt()
349 };
350 (a_smaller, m, total)
351 };
352
353 if output_a {
354 z[k] = arr_a[i_a];
355 lcp_z[k] = I::from_usize(lcp_for_output);
356 i_a += 1;
357 lim_a_cache = None;
358 } else {
359 z[k] = arr_b[i_b];
360 lcp_z[k] = I::from_usize(lcp_for_output);
361 i_b += 1;
362 lim_b_cache = None;
363 std::mem::swap(&mut arr_a, &mut arr_b);
366 std::mem::swap(&mut lcp_a, &mut lcp_b);
367 std::mem::swap(&mut len_a, &mut len_b);
368 std::mem::swap(&mut i_a, &mut i_b);
369 std::mem::swap(&mut lim_a_cache, &mut lim_b_cache);
370 }
371 m = new_m;
372 k += 1;
373 }
374
375 drain(arr_a, lcp_a, i_a, len_a, z, lcp_z, &mut k, m);
379 drain(arr_b, lcp_b, i_b, len_b, z, lcp_z, &mut k, m);
380}
381
382#[inline]
383#[allow(clippy::too_many_arguments)] fn drain<I: Index>(
385 arr: &[I],
386 lcp_src: &[I],
387 mut i: usize,
388 len: usize,
389 z: &mut [I],
390 lcp_z: &mut [I],
391 k: &mut usize,
392 boundary_m: usize,
393) {
394 let mut first = true;
395 while i < len {
396 z[*k] = arr[i];
397 lcp_z[*k] = if first {
398 I::from_usize(boundary_m)
399 } else {
400 lcp_src[i]
401 };
402 first = false;
403 i += 1;
404 *k += 1;
405 }
406}
407
408#[cfg(test)]
409mod tests {
410 use super::*;
411
412 fn brute_force_sa(text: &[u8]) -> Vec<u32> {
414 let mut sa: Vec<u32> = (0..text.len() as u32).collect();
415 sa.sort_by(|&a, &b| text[a as usize..].cmp(&text[b as usize..]));
416 sa
417 }
418
419 fn assert_matches_brute(text: &[u8]) {
420 let got: Vec<u32> = build_in_memory(text);
421 let want = brute_force_sa(text);
422 assert_eq!(got, want, "mismatch on text {text:?}");
423 }
424
425 #[test]
426 fn empty_text() {
427 let sa: Vec<u32> = build_in_memory::<u8, u32>(&[]);
428 assert!(sa.is_empty());
429 }
430
431 #[test]
432 fn single_symbol() {
433 let sa: Vec<u32> = build_in_memory(&[7u8]);
434 assert_eq!(sa, vec![0]);
435 }
436
437 #[test]
438 fn banana() {
439 assert_matches_brute(b"banana");
440 }
441
442 #[test]
443 fn mississippi() {
444 assert_matches_brute(b"mississippi");
445 }
446
447 #[test]
448 fn small_distinct_sentinel() {
449 let text: Vec<u8> = vec![0, 1, 2, 0, 1, 5, 0, 2, 1, 6];
452 let got: Vec<u32> = build_in_memory(&text);
453 let want = brute_force_sa(&text);
454 assert_eq!(got, want);
455 }
456
457 #[test]
458 fn random_byte_texts() {
459 use rand::{RngExt, SeedableRng};
460 let mut rng = rand::rngs::StdRng::seed_from_u64(0xC0FFEE);
461 for &n in &[1usize, 2, 3, 7, 33, 200, 1000] {
462 let text: Vec<u8> = (0..n).map(|_| rng.random_range(0..6u8)).collect();
463 let got: Vec<u32> = build_in_memory(&text);
464 let want = brute_force_sa(&text);
465 assert_eq!(got, want, "mismatch on random text len={n}");
466 }
467 }
468
469 #[test]
470 fn for_positions_full_set_matches_build_in_memory() {
471 let text = b"banana";
473 let want: Vec<u32> = build_in_memory(text);
474 let positions: Vec<u32> = (0..text.len() as u32).collect();
475 let got = build_in_memory_for_positions(text, positions);
476 assert_eq!(got, want);
477 }
478
479 #[test]
480 fn for_positions_subset_matches_brute_force() {
481 let text = b"mississippi";
484 let positions: Vec<u32> = (0..text.len() as u32).step_by(2).collect();
485 let mut want = positions.clone();
486 want.sort_by(|&a, &b| text[a as usize..].cmp(&text[b as usize..]));
487 let got = build_in_memory_for_positions(text, positions);
488 assert_eq!(got, want);
489 }
490
491 #[test]
492 fn for_positions_random_subsets() {
493 use rand::{RngExt, SeedableRng};
494 let mut rng = rand::rngs::StdRng::seed_from_u64(0xFEED);
495 for &n in &[33usize, 200, 1000] {
496 let text: Vec<u8> = (0..n).map(|_| rng.random_range(0..6u8)).collect();
497 let mut positions: Vec<u32> = (0..n as u32).collect();
499 positions.retain(|_| rng.random_range(0..10) < 7);
501 let mut want = positions.clone();
502 want.sort_by(|&a, &b| text[a as usize..].cmp(&text[b as usize..]));
503 let got = build_in_memory_for_positions(&text, positions);
504 assert_eq!(got, want, "subset sort mismatch n={n}");
505 }
506 }
507
508 #[test]
509 fn random_with_unique_terminator() {
510 use rand::{RngExt, SeedableRng};
513 let mut rng = rand::rngs::StdRng::seed_from_u64(0xBEEF);
514 for &n in &[1usize, 50, 500] {
515 let mut text: Vec<u8> = (0..n).map(|_| rng.random_range(0..5u8)).collect();
516 text.push(250); let got: Vec<u32> = build_in_memory(&text);
518 let want = brute_force_sa(&text);
519 assert_eq!(got, want);
520 }
521 }
522
523 use crate::limits::SegmentedText;
526
527 fn segmented_cmp(text: &[u8], lp: &SegmentedText, a: usize, b: usize) -> std::cmp::Ordering {
530 use crate::limits::LimitProvider;
531 let lim_a = lp.lim_at(a);
532 let lim_b = lp.lim_at(b);
533 let lim = lim_a.min(lim_b);
534 for i in 0..lim {
535 if text[a + i] != text[b + i] {
536 return text[a + i].cmp(&text[b + i]);
537 }
538 }
539 lim_a.cmp(&lim_b)
540 }
541
542 fn assert_segmented_sa_valid(text: &[u8], lengths: &[usize], positions: &[u32], sa: &[u32]) {
552 let lp = SegmentedText::from_lengths(text.len(), lengths);
553 let mut expected = positions.to_vec();
554 expected.sort();
555 let mut got_sorted = sa.to_vec();
556 got_sorted.sort();
557 assert_eq!(got_sorted, expected, "sa is not a permutation of positions");
558 for w in sa.windows(2) {
559 let a = w[0] as usize;
560 let b = w[1] as usize;
561 let ord = segmented_cmp(text, &lp, a, b);
562 assert_ne!(
563 ord,
564 std::cmp::Ordering::Greater,
565 "out of order: pos {a} > pos {b} under segmented comparator",
566 );
567 }
568 }
569
570 #[test]
571 fn segmented_in_memory_matches_brute_force_small() {
572 let text: Vec<u8> = b"helloworldbananamississippi".to_vec();
574 let lengths = &[5usize, 5, 6, 11];
575 let lp = SegmentedText::from_lengths(text.len(), lengths);
576 let sa: Vec<u32> = build_in_memory_with(&text, &lp, &Opts::default());
577 let all_positions: Vec<u32> = (0..text.len() as u32).collect();
578 assert_segmented_sa_valid(&text, lengths, &all_positions, &sa);
579 }
580
581 #[test]
582 fn segmented_single_segment_equals_unsegmented() {
583 let text = b"mississippi";
587 let lp = SegmentedText::from_lengths(text.len(), &[text.len()]);
588 let got_segmented: Vec<u32> = build_in_memory_with(text, &lp, &Opts::default());
589 let got_plain: Vec<u32> = build_in_memory(text);
590 assert_eq!(got_segmented, got_plain);
591 }
592
593 #[test]
594 fn segmented_random_validity() {
595 use rand::{RngExt, SeedableRng};
596 let mut rng = rand::rngs::StdRng::seed_from_u64(0x5E6);
597 for _ in 0..20 {
598 let n_segments = rng.random_range(1..10usize);
599 let lengths: Vec<usize> = (0..n_segments)
600 .map(|_| rng.random_range(5..50usize))
601 .collect();
602 let n: usize = lengths.iter().sum();
603 let text: Vec<u8> = (0..n).map(|_| rng.random_range(0..3u8)).collect();
605 let lp = SegmentedText::from_lengths(n, &lengths);
606 let sa: Vec<u32> = build_in_memory_with(&text, &lp, &Opts::default());
607 let all_positions: Vec<u32> = (0..n as u32).collect();
608 assert_segmented_sa_valid(&text, &lengths, &all_positions, &sa);
609 }
610 }
611
612 #[test]
613 fn segmented_for_positions_subset_validity() {
614 let text: Vec<u8> = b"helloworldbananamississippi".to_vec();
616 let lengths = &[5usize, 5, 6, 11];
617 let positions: Vec<u32> = (0..text.len() as u32).step_by(2).collect();
618 let lp = SegmentedText::from_lengths(text.len(), lengths);
619 let sa =
620 build_in_memory_for_positions_with(&text, positions.clone(), &lp, &Opts::default());
621 assert_segmented_sa_valid(&text, lengths, &positions, &sa);
622 }
623
624 struct StarConvention {
633 inner: SegmentedText,
634 }
635
636 impl crate::limits::LimitProvider for StarConvention {
637 fn lim_at(&self, p: usize) -> usize {
638 self.inner.lim_at(p)
639 }
640 fn boundary_order(
641 &self,
642 p_a: usize,
643 lim_a: usize,
644 p_b: usize,
645 lim_b: usize,
646 ) -> std::cmp::Ordering {
647 lim_b.cmp(&lim_a).then(p_a.cmp(&p_b))
648 }
649 }
650
651 fn star_brute_force_sa(text: &[u8], lengths: &[usize]) -> Vec<u32> {
656 use crate::limits::LimitProvider;
657 let lp = SegmentedText::from_lengths(text.len(), lengths);
658 let mut sa: Vec<u32> = (0..text.len() as u32).collect();
659 sa.sort_by(|&a, &b| {
660 let pa = a as usize;
661 let pb = b as usize;
662 let lim_a = lp.lim_at(pa);
663 let lim_b = lp.lim_at(pb);
664 let lim = lim_a.min(lim_b);
665 for i in 0..lim {
666 if text[pa + i] != text[pb + i] {
667 return text[pa + i].cmp(&text[pb + i]);
668 }
669 }
670 lim_b.cmp(&lim_a).then(pa.cmp(&pb))
672 });
673 sa
674 }
675
676 #[test]
677 fn star_convention_matches_brute_force_small() {
678 let text: Vec<u8> = b"helloworldbananamississippi".to_vec();
680 let lengths = &[5usize, 5, 6, 11];
681 let lp = StarConvention {
682 inner: SegmentedText::from_lengths(text.len(), lengths),
683 };
684 let got: Vec<u32> = build_in_memory_with(&text, &lp, &Opts::default());
685 let want = star_brute_force_sa(&text, lengths);
686 assert_eq!(got, want, "STAR-convention SA mismatch");
687 }
688
689 #[test]
694 fn star_convention_within_segment_longer_first() {
695 let text = b"AAAA";
696 let lp = StarConvention {
697 inner: SegmentedText::from_lengths(text.len(), &[text.len()]),
698 };
699 let got: Vec<u32> = build_in_memory_with(text, &lp, &Opts::default());
700 assert_eq!(got, vec![0u32, 1, 2, 3]);
704 }
705
706 #[test]
707 fn star_convention_random_matches_brute_force() {
708 use rand::{RngExt, SeedableRng};
709 let mut rng = rand::rngs::StdRng::seed_from_u64(0xCAFE);
710 for _ in 0..20 {
711 let n_segments = rng.random_range(1..10usize);
712 let lengths: Vec<usize> = (0..n_segments)
713 .map(|_| rng.random_range(5..50usize))
714 .collect();
715 let n: usize = lengths.iter().sum();
716 let text: Vec<u8> = (0..n).map(|_| rng.random_range(0..3u8)).collect();
718 let lp = StarConvention {
719 inner: SegmentedText::from_lengths(n, &lengths),
720 };
721 let got: Vec<u32> = build_in_memory_with(&text, &lp, &Opts::default());
722 let want = star_brute_force_sa(&text, &lengths);
723 assert_eq!(
724 got, want,
725 "STAR-convention SA mismatch (lengths={lengths:?}, text={text:?})",
726 );
727 }
728 }
729}