1#[derive(Debug, Clone)]
13pub struct SuffixArray {
14 sa: Vec<usize>,
15}
16
17#[derive(Debug, Clone, Copy, PartialEq, Eq)]
23enum SuffixType {
24 S,
25 L,
26}
27
28fn classify_suffixes(text: &[usize]) -> Vec<SuffixType> {
34 let n = text.len();
35 if n == 0 {
36 return vec![];
37 }
38 let mut types = vec![SuffixType::S; n];
39 types[n - 1] = SuffixType::S;
41
42 if n > 1 {
43 for i in (0..n - 1).rev() {
45 if text[i] > text[i + 1] {
46 types[i] = SuffixType::L;
47 } else if text[i] < text[i + 1] {
48 types[i] = SuffixType::S;
49 } else {
50 types[i] = types[i + 1];
51 }
52 }
53 }
54
55 types
56}
57
58#[inline]
60fn is_lms(types: &[SuffixType], i: usize) -> bool {
61 i > 0 && types[i] == SuffixType::S && types[i - 1] == SuffixType::L
62}
63
64fn get_bucket_sizes(text: &[usize], alphabet_size: usize) -> Vec<usize> {
66 let mut buckets = vec![0usize; alphabet_size];
67 for &c in text {
68 buckets[c] += 1;
69 }
70 buckets
71}
72
73fn get_bucket_starts(bucket_sizes: &[usize]) -> Vec<usize> {
75 let mut starts = vec![0usize; bucket_sizes.len()];
76 let mut sum = 0;
77 for (i, &size) in bucket_sizes.iter().enumerate() {
78 starts[i] = sum;
79 sum += size;
80 }
81 starts
82}
83
84fn get_bucket_ends(bucket_sizes: &[usize]) -> Vec<usize> {
86 let mut ends = vec![0usize; bucket_sizes.len()];
87 let mut sum = 0;
88 for (i, &size) in bucket_sizes.iter().enumerate() {
89 sum += size;
90 ends[i] = sum;
91 }
92 ends
93}
94
95fn place_lms(
97 sa: &mut [usize],
98 text: &[usize],
99 _types: &[SuffixType],
100 bucket_sizes: &[usize],
101 lms_positions: &[usize],
102) {
103 let n = sa.len();
104 let sentinel = n; for slot in sa.iter_mut() {
106 *slot = sentinel;
107 }
108
109 let mut tails = get_bucket_ends(bucket_sizes);
110
111 for &pos in lms_positions.iter().rev() {
113 let c = text[pos];
114 tails[c] -= 1;
115 sa[tails[c]] = pos;
116 }
117}
118
119fn induce_l(
121 sa: &mut [usize],
122 text: &[usize],
123 types: &[SuffixType],
124 bucket_sizes: &[usize],
125) {
126 let n = sa.len();
127 let sentinel = n;
128 let mut heads = get_bucket_starts(bucket_sizes);
129
130 for i in 0..n {
131 if sa[i] == sentinel || sa[i] == 0 {
132 continue;
133 }
134 let j = sa[i] - 1;
135 if types[j] == SuffixType::L {
136 let c = text[j];
137 sa[heads[c]] = j;
138 heads[c] += 1;
139 }
140 }
141}
142
143fn induce_s(
145 sa: &mut [usize],
146 text: &[usize],
147 types: &[SuffixType],
148 bucket_sizes: &[usize],
149) {
150 let n = sa.len();
151 let sentinel = n;
152 let mut tails = get_bucket_ends(bucket_sizes);
153
154 for i in (0..n).rev() {
155 if sa[i] == sentinel || sa[i] == 0 {
156 continue;
157 }
158 let j = sa[i] - 1;
159 if types[j] == SuffixType::S {
160 let c = text[j];
161 tails[c] -= 1;
162 sa[tails[c]] = j;
163 }
164 }
165}
166
167fn lms_substrings_equal(
169 text: &[usize],
170 types: &[SuffixType],
171 pos1: usize,
172 pos2: usize,
173) -> bool {
174 let n = text.len();
175
176 if pos1 >= n || pos2 >= n {
178 return false;
179 }
180
181 let mut i = 0;
182 loop {
183 let end1 = i > 0 && is_lms(types, pos1 + i);
184 let end2 = i > 0 && is_lms(types, pos2 + i);
185
186 if end1 && end2 {
187 return true; }
189 if end1 != end2 {
190 return false; }
192 if text[pos1 + i] != text[pos2 + i] {
193 return false;
194 }
195 if types[pos1 + i] != types[pos2 + i] {
196 return false;
197 }
198
199 i += 1;
200
201 if pos1 + i >= n || pos2 + i >= n {
202 return false;
203 }
204 }
205}
206
207fn sais(text: &[usize], alphabet_size: usize) -> Vec<usize> {
209 let n = text.len();
210
211 if n == 0 {
212 return vec![];
213 }
214 if n == 1 {
215 return vec![0];
216 }
217 if n == 2 {
218 if text[0] <= text[1] {
219 return vec![0, 1];
220 } else {
221 return vec![1, 0];
222 }
223 }
224
225 let types = classify_suffixes(text);
227
228 let lms_positions: Vec<usize> = (0..n).filter(|&i| is_lms(&types, i)).collect();
230
231 let bucket_sizes = get_bucket_sizes(text, alphabet_size);
233
234 let mut sa = vec![0usize; n];
236 place_lms(&mut sa, text, &types, &bucket_sizes, &lms_positions);
237 induce_l(&mut sa, text, &types, &bucket_sizes);
238 induce_s(&mut sa, text, &types, &bucket_sizes);
239
240 let sentinel = n;
242 let sorted_lms: Vec<usize> = sa.iter().copied().filter(|&x| x != sentinel && is_lms(&types, x)).collect();
244
245 let mut names = vec![sentinel; n];
247 let mut current_name = 0;
248
249 if !sorted_lms.is_empty() {
250 names[sorted_lms[0]] = current_name;
251 for i in 1..sorted_lms.len() {
252 if !lms_substrings_equal(text, &types, sorted_lms[i - 1], sorted_lms[i]) {
253 current_name += 1;
254 }
255 names[sorted_lms[i]] = current_name;
256 }
257 }
258
259 let num_names = current_name + 1;
260
261 let lms_count = lms_positions.len();
263 let reduced: Vec<usize> = lms_positions.iter().map(|&pos| names[pos]).collect();
264
265 let sorted_lms_indices = if num_names < lms_count {
266 let sub_sa = sais(&reduced, num_names);
268 sub_sa
269 } else {
270 let mut order = vec![0usize; lms_count];
272 for (i, &name) in reduced.iter().enumerate() {
273 order[name] = i;
274 }
275 order
276 };
277
278 let sorted_lms_positions: Vec<usize> = sorted_lms_indices
280 .iter()
281 .map(|&i| lms_positions[i])
282 .collect();
283
284 place_lms(&mut sa, text, &types, &bucket_sizes, &sorted_lms_positions);
285 induce_l(&mut sa, text, &types, &bucket_sizes);
286 induce_s(&mut sa, text, &types, &bucket_sizes);
287
288 sa
289}
290
291impl SuffixArray {
292 pub fn build(text: &[u8]) -> Self {
309 if text.is_empty() {
310 return Self { sa: vec![0] }; }
312
313 let mut chars_present = [false; 256];
317 for &b in text {
318 chars_present[b as usize] = true;
319 }
320
321 let mut mapping = [0usize; 256];
323 let mut next_id = 1usize;
324 for i in 0..256 {
325 if chars_present[i] {
326 mapping[i] = next_id;
327 next_id += 1;
328 }
329 }
330 let alphabet_size = next_id; let mut int_text: Vec<usize> = Vec::with_capacity(text.len() + 1);
334 for &b in text {
335 int_text.push(mapping[b as usize]);
336 }
337 int_text.push(0); let sa = sais(&int_text, alphabet_size);
340
341 Self { sa }
342 }
343
344 pub fn search(&self, text: &[u8], pattern: &[u8]) -> Vec<usize> {
351 if pattern.is_empty() || text.is_empty() {
352 return vec![];
353 }
354
355 let n = text.len();
356
357 let left = self.lower_bound(text, pattern);
362 let right = self.upper_bound(text, pattern);
363
364 if left >= right {
365 return vec![];
366 }
367
368 let mut positions: Vec<usize> = (left..right)
369 .map(|i| self.sa[i])
370 .filter(|&pos| pos < n) .collect();
372 positions.sort_unstable();
373 positions
374 }
375
376 fn lower_bound(&self, text: &[u8], pattern: &[u8]) -> usize {
378 let mut lo = 0;
379 let mut hi = self.sa.len();
380
381 while lo < hi {
382 let mid = lo + (hi - lo) / 2;
383 let pos = self.sa[mid];
384 let suffix = if pos < text.len() {
385 &text[pos..]
386 } else {
387 &[] };
389
390 if compare_prefix(suffix, pattern) == std::cmp::Ordering::Less {
391 lo = mid + 1;
392 } else {
393 hi = mid;
394 }
395 }
396
397 lo
398 }
399
400 fn upper_bound(&self, text: &[u8], pattern: &[u8]) -> usize {
402 let mut lo = 0;
403 let mut hi = self.sa.len();
404
405 while lo < hi {
406 let mid = lo + (hi - lo) / 2;
407 let pos = self.sa[mid];
408 let suffix = if pos < text.len() {
409 &text[pos..]
410 } else {
411 &[] };
413
414 if compare_prefix(suffix, pattern) == std::cmp::Ordering::Greater {
415 hi = mid;
416 } else {
417 lo = mid + 1;
418 }
419 }
420
421 lo
422 }
423
424 pub fn len(&self) -> usize {
426 self.sa.len()
427 }
428}
429
430fn compare_prefix(suffix: &[u8], pattern: &[u8]) -> std::cmp::Ordering {
434 let len = suffix.len().min(pattern.len());
435 match suffix[..len].cmp(&pattern[..len]) {
436 std::cmp::Ordering::Equal => {
437 if suffix.len() >= pattern.len() {
438 std::cmp::Ordering::Equal
439 } else {
440 std::cmp::Ordering::Less
441 }
442 }
443 other => other,
444 }
445}
446
447#[cfg(test)]
448mod tests {
449 use super::*;
450
451 #[test]
452 fn build_banana() {
453 let text = b"banana";
454 let sa = SuffixArray::build(text);
455 assert_eq!(sa.len(), 7);
457
458 let suffixes: Vec<&[u8]> = sa
460 .sa
461 .iter()
462 .map(|&pos| {
463 if pos < text.len() {
464 &text[pos..]
465 } else {
466 b"" as &[u8] }
468 })
469 .collect();
470
471 for i in 1..suffixes.len() {
472 assert!(
473 suffixes[i - 1] <= suffixes[i],
474 "suffixes not sorted at {}: {:?} > {:?}",
475 i,
476 std::str::from_utf8(suffixes[i - 1]),
477 std::str::from_utf8(suffixes[i])
478 );
479 }
480 }
481
482 #[test]
483 fn search_banana_ana() {
484 let text = b"banana";
485 let sa = SuffixArray::build(text);
486 let mut positions = sa.search(text, b"ana");
487 positions.sort();
488 assert_eq!(positions, vec![1, 3]);
489 }
490
491 #[test]
492 fn search_banana_ban() {
493 let text = b"banana";
494 let sa = SuffixArray::build(text);
495 let positions = sa.search(text, b"ban");
496 assert_eq!(positions, vec![0]);
497 }
498
499 #[test]
500 fn search_banana_a() {
501 let text = b"banana";
502 let sa = SuffixArray::build(text);
503 let mut positions = sa.search(text, b"a");
504 positions.sort();
505 assert_eq!(positions, vec![1, 3, 5]);
506 }
507
508 #[test]
509 fn search_no_match() {
510 let text = b"banana";
511 let sa = SuffixArray::build(text);
512 let positions = sa.search(text, b"xyz");
513 assert!(positions.is_empty());
514 }
515
516 #[test]
517 fn search_full_text() {
518 let text = b"banana";
519 let sa = SuffixArray::build(text);
520 let positions = sa.search(text, b"banana");
521 assert_eq!(positions, vec![0]);
522 }
523
524 #[test]
525 fn search_empty_pattern() {
526 let text = b"banana";
527 let sa = SuffixArray::build(text);
528 let positions = sa.search(text, b"");
529 assert!(positions.is_empty());
530 }
531
532 #[test]
533 fn build_single_char() {
534 let text = b"a";
535 let sa = SuffixArray::build(text);
536 assert_eq!(sa.len(), 2); let positions = sa.search(text, b"a");
538 assert_eq!(positions, vec![0]);
539 }
540
541 #[test]
542 fn build_repeated_chars() {
543 let text = b"aaaa";
544 let sa = SuffixArray::build(text);
545 assert_eq!(sa.len(), 5);
546
547 let suffixes: Vec<&[u8]> = sa
549 .sa
550 .iter()
551 .map(|&pos| {
552 if pos < text.len() {
553 &text[pos..]
554 } else {
555 b"" as &[u8]
556 }
557 })
558 .collect();
559
560 for i in 1..suffixes.len() {
561 assert!(suffixes[i - 1] <= suffixes[i]);
562 }
563
564 let mut positions = sa.search(text, b"aa");
565 positions.sort();
566 assert_eq!(positions, vec![0, 1, 2]);
567 }
568
569 #[test]
570 fn build_dna_sequence() {
571 let text = b"ACGTACGT";
572 let sa = SuffixArray::build(text);
573
574 let mut positions = sa.search(text, b"ACG");
575 positions.sort();
576 assert_eq!(positions, vec![0, 4]);
577
578 let positions = sa.search(text, b"TACG");
579 assert_eq!(positions, vec![3]);
580 }
581
582 #[test]
583 fn build_empty_text() {
584 let text = b"";
585 let sa = SuffixArray::build(text);
586 assert_eq!(sa.len(), 1); let positions = sa.search(text, b"a");
588 assert!(positions.is_empty());
589 }
590
591 #[test]
592 fn search_longer_than_text() {
593 let text = b"abc";
594 let sa = SuffixArray::build(text);
595 let positions = sa.search(text, b"abcdefg");
596 assert!(positions.is_empty());
597 }
598
599 #[test]
600 fn build_mississippi() {
601 let text = b"mississippi";
602 let sa = SuffixArray::build(text);
603
604 let suffixes: Vec<&[u8]> = sa
606 .sa
607 .iter()
608 .map(|&pos| {
609 if pos < text.len() {
610 &text[pos..]
611 } else {
612 b"" as &[u8]
613 }
614 })
615 .collect();
616 for i in 1..suffixes.len() {
617 assert!(
618 suffixes[i - 1] <= suffixes[i],
619 "not sorted at {}: {:?} vs {:?}",
620 i,
621 std::str::from_utf8(suffixes[i - 1]),
622 std::str::from_utf8(suffixes[i])
623 );
624 }
625
626 let mut positions = sa.search(text, b"issi");
627 positions.sort();
628 assert_eq!(positions, vec![1, 4]);
629
630 let mut positions = sa.search(text, b"ss");
631 positions.sort();
632 assert_eq!(positions, vec![2, 5]);
633 }
634}