Skip to main content

libsais_rs/
libsais64.rs

1//! Rust translation of upstream [libsais](https://github.com/IlyaGrebnov/libsais)
2//! 2.10.4 by Ilya Grebnov.
3//!
4//! This module exposes the 64-bit suffix array, BWT, unBWT, PLCP and LCP entry
5//! points (mirroring `libsais64.h`).
6
7use std::marker::PhantomData;
8use std::mem;
9#[cfg(feature = "upstream-c")]
10use std::mem::MaybeUninit;
11
12use crate::run_rayon_with_threads;
13use rayon::prelude::*;
14
/// Signed 64-bit integer used for suffix array entries and text symbols.
pub type SaSint = i64;
/// Unsigned 64-bit counterpart of [`SaSint`].
pub type SaUint = u64;
/// Pointer-sized signed integer used for positions, block bounds and counters.
pub type FastSint = isize;
/// Pointer-sized unsigned integer used for bucket indices.
pub type FastUint = usize;

/// Bit width of [`SaSint`].
pub const SAINT_BIT: u32 = 64;
/// Largest [`SaSint`] value; as a mask it clears the sign bit.
pub const SAINT_MAX: SaSint = i64::MAX;
/// Smallest [`SaSint`] value (only the sign bit set); XOR-ing with it toggles the sign bit.
pub const SAINT_MIN: SaSint = i64::MIN;

/// Number of distinct byte symbols (256).
pub const ALPHABET_SIZE: usize = 1usize << 8;
/// Exponent sizing the `fastbits` table of [`UnbwtContext`] (`1 + 2^17` entries).
pub const UNBWT_FASTBITS: usize = 17;

/// Bit index derived from the integer width (`SAINT_BIT - 1`, i.e. 63).
pub const SUFFIX_GROUP_BIT: u32 = SAINT_BIT - 1;
/// Marker with bit `SUFFIX_GROUP_BIT - 1` (bit 62) set.
pub const SUFFIX_GROUP_MARKER: SaSint = 1_i64 << (SUFFIX_GROUP_BIT - 1);

/// NOTE(review): not referenced in the visible part of this file; presumably a scratch-buffer length — confirm.
pub const LIBSAIS_LOCAL_BUFFER_SIZE: usize = 1000;
/// Number of [`ThreadCache`] entries allocated for each [`ThreadState`].
pub const LIBSAIS_PER_THREAD_CACHE_SIZE: usize = 24_576;

/// Flag value 0: no optional behaviour requested.
pub const LIBSAIS_FLAGS_NONE: SaSint = 0;
/// Flag value 1; presumably selects BWT output — confirm against the users of this flag.
pub const LIBSAIS_FLAGS_BWT: SaSint = 1;
/// Flag value 2; presumably selects generalized suffix array mode — confirm against the users of this flag.
pub const LIBSAIS_FLAGS_GSA: SaSint = 2;
36
/// One deferred suffix-array store.
///
/// `place_cached_suffixes` performs `sa[symbol] = index` for every entry, and
/// `compact_and_place_cached_suffixes` first discards entries whose `symbol` is negative.
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
pub struct ThreadCache {
    /// Destination slot in the suffix array; negative values are skipped by compaction.
    pub symbol: SaSint,
    /// Value written into the destination slot.
    pub index: SaSint,
}
42
/// Scratch state owned by one worker of the block-parallel routines.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct ThreadState {
    /// Exclusive end of the text block handled by this worker
    /// (set by `count_and_gather_lms_suffixes_8u_omp`).
    pub position: FastSint,
    /// NOTE(review): not written by any code visible in this part of the file.
    pub count: FastSint,
    /// Number of LMS suffixes this worker found in its block.
    pub m: FastSint,
    /// Copy of `sa[position - 1]` taken right after the worker's gather pass, when `m > 0`.
    pub last_lms_suffix: FastSint,
    /// Per-worker bucket counters; `4 * ALPHABET_SIZE` entries when built by `ThreadState::new`.
    pub buckets: Vec<SaSint>,
    /// Per-worker store cache; `LIBSAIS_PER_THREAD_CACHE_SIZE` entries when built by `ThreadState::new`.
    pub cache: Vec<ThreadCache>,
}
52
53impl ThreadState {
54    fn new() -> Self {
55        Self {
56            position: 0,
57            count: 0,
58            m: 0,
59            last_lms_suffix: 0,
60            buckets: vec![0; 4 * ALPHABET_SIZE],
61            cache: vec![ThreadCache::default(); LIBSAIS_PER_THREAD_CACHE_SIZE],
62        }
63    }
64}
65
/// Reusable working memory for the suffix array / BWT construction entry points.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct Context {
    /// Shared bucket counters; `create_ctx_main` allocates `8 * ALPHABET_SIZE` entries.
    pub buckets: Vec<SaSint>,
    /// One [`ThreadState`] per worker; `create_ctx_main` leaves this `None` for a single thread.
    pub thread_state: Option<Vec<ThreadState>>,
    /// Number of threads the context was created for.
    pub threads: FastSint,
}
72
/// Reusable working memory for the reverse-BWT entry points.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct UnbwtContext {
    /// Table with `ALPHABET_SIZE * ALPHABET_SIZE` entries (one per ordered symbol pair).
    pub bucket2: Vec<SaUint>,
    /// Table with `1 + 2^UNBWT_FASTBITS` entries.
    pub fastbits: Vec<u16>,
    /// Per-thread tables, `ALPHABET_SIZE + ALPHABET_SIZE * ALPHABET_SIZE` entries per thread;
    /// `unbwt_create_ctx_main` leaves this `None` for a single thread.
    pub buckets: Option<Vec<SaUint>>,
    /// Number of threads the context was created for.
    pub threads: FastSint,
}
80
81/// Internal helper: buckets index2.
82#[doc(hidden)]
83pub fn buckets_index2(c: FastUint, s: FastUint) -> FastUint {
84    (c << 1) + s
85}
86
87/// Internal helper: buckets index4.
88#[doc(hidden)]
89pub fn buckets_index4(c: FastUint, s: FastUint) -> FastUint {
90    (c << 2) + s
91}
92
/// Internal helper: rounds `value` up to the next multiple of `alignment`.
///
/// `alignment` must be a power of two (checked in debug builds only).
#[doc(hidden)]
pub fn align_up(value: usize, alignment: usize) -> usize {
    debug_assert!(alignment.is_power_of_two());
    let bumped = value + alignment - 1;
    let keep_mask = !(alignment - 1);
    bumped & keep_mask
}
99
100/// Internal helper: alloc thread state.
101#[doc(hidden)]
102pub fn alloc_thread_state(threads: SaSint) -> Option<Vec<ThreadState>> {
103    if threads <= 0 {
104        return None;
105    }
106
107    let len = usize::try_from(threads).ok()?;
108    Some((0..len).map(|_| ThreadState::new()).collect())
109}
110
111/// Internal helper: create ctx main.
112#[doc(hidden)]
113pub fn create_ctx_main(threads: SaSint) -> Option<Context> {
114    if threads <= 0 {
115        return None;
116    }
117
118    let thread_state = if threads > 1 {
119        Some(alloc_thread_state(threads)?)
120    } else {
121        None
122    };
123
124    Some(Context {
125        buckets: vec![0; 8 * ALPHABET_SIZE],
126        thread_state,
127        threads: threads as FastSint,
128    })
129}
130
131/// Creates the libsais64 context that allows reusing allocated memory with each libsais64 operation.
132///
133/// In multi-threaded environments, use one context per thread for parallel executions.
134///
135/// Returns the context, or `None` on allocation failure.
136pub fn create_ctx() -> Option<Context> {
137    create_ctx_main(1)
138}
139
140/// Destroys the libsais64 context and frees previously allocated memory.
141pub fn free_ctx(_ctx: Context) {}
142
143/// Internal helper: unbwt create ctx main.
144#[doc(hidden)]
145pub fn unbwt_create_ctx_main(threads: SaSint) -> Option<UnbwtContext> {
146    if threads <= 0 {
147        return None;
148    }
149
150    let buckets = if threads > 1 {
151        let len = usize::try_from(threads).ok()? * (ALPHABET_SIZE + ALPHABET_SIZE * ALPHABET_SIZE);
152        Some(vec![0; len])
153    } else {
154        None
155    };
156
157    Some(UnbwtContext {
158        bucket2: vec![0; ALPHABET_SIZE * ALPHABET_SIZE],
159        fastbits: vec![0; 1 + (1 << UNBWT_FASTBITS)],
160        buckets,
161        threads: threads as FastSint,
162    })
163}
164
165/// Internal helper: unbwt free ctx main.
166#[doc(hidden)]
167pub fn unbwt_free_ctx_main(_ctx: UnbwtContext) {}
168
169/// Creates the libsais64 reverse-BWT context that allows reusing allocated memory with each `libsais64_unbwt_*` operation.
170///
171/// In multi-threaded environments, use one context per thread for parallel executions.
172///
173/// Returns the context, or `None` on allocation failure.
174pub fn unbwt_create_ctx() -> Option<UnbwtContext> {
175    unbwt_create_ctx_main(1)
176}
177
178/// Destroys the libsais64 reverse-BWT context and frees previously allocated memory.
179pub fn unbwt_free_ctx(_ctx: UnbwtContext) {}
180
181/// Internal helper: count negative marked suffixes.
182#[doc(hidden)]
183pub fn count_negative_marked_suffixes(
184    sa: &[SaSint],
185    block_start: FastSint,
186    block_size: FastSint,
187) -> SaSint {
188    block_slice(sa, block_start, block_size)
189        .iter()
190        .map(|&value| SaSint::from(value < 0))
191        .sum()
192}
193
194/// Internal helper: count zero marked suffixes.
195#[doc(hidden)]
196pub fn count_zero_marked_suffixes(
197    sa: &[SaSint],
198    block_start: FastSint,
199    block_size: FastSint,
200) -> SaSint {
201    block_slice(sa, block_start, block_size)
202        .iter()
203        .map(|&value| SaSint::from(value == 0))
204        .sum()
205}
206
207/// Internal helper: place cached suffixes.
208#[doc(hidden)]
209pub fn place_cached_suffixes(
210    sa: &mut [SaSint],
211    cache: &[ThreadCache],
212    block_start: FastSint,
213    block_size: FastSint,
214) {
215    let start = usize::try_from(block_start).expect("block_start must be non-negative");
216    let len = usize::try_from(block_size).expect("block_size must be non-negative");
217    let entries = if cache.len() >= start + len {
218        &cache[start..start + len]
219    } else {
220        &cache[..len]
221    };
222
223    for entry in entries {
224        let slot = usize::try_from(entry.symbol).expect("cache symbol must be non-negative");
225        sa[slot] = entry.index;
226    }
227}
228
229/// Internal helper: compact and place cached suffixes.
230#[doc(hidden)]
231pub fn compact_and_place_cached_suffixes(
232    sa: &mut [SaSint],
233    cache: &mut [ThreadCache],
234    block_start: FastSint,
235    block_size: FastSint,
236) {
237    let start = usize::try_from(block_start).expect("block_start must be non-negative");
238    let len = usize::try_from(block_size).expect("block_size must be non-negative");
239    let read_start = if cache.len() >= start + len { start } else { 0 };
240    let read_end = read_start + len;
241
242    let mut write = read_start;
243    for read in read_start..read_end {
244        let entry = cache[read];
245        if entry.symbol >= 0 {
246            cache[write] = entry;
247            write += 1;
248        }
249    }
250
251    place_cached_suffixes(sa, cache, block_start, (write - read_start) as FastSint);
252}
253
254/// Internal helper: flip suffix markers (OpenMP variant).
255#[doc(hidden)]
256pub fn flip_suffix_markers_omp(sa: &mut [SaSint], l: SaSint, threads: SaSint) {
257    let len = usize::try_from(l).expect("l must be non-negative");
258    let omp_num_threads = if threads > 1 && l >= 65_536 {
259        usize::try_from(threads).expect("threads must be non-negative")
260    } else {
261        1
262    };
263    if omp_num_threads > 1 {
264        let chunk_size = ((len / omp_num_threads) & !15usize).max(16);
265        run_rayon_with_threads(omp_num_threads, || {
266            sa[..len].par_chunks_mut(chunk_size).for_each(|chunk| {
267                for value in chunk {
268                    *value ^= SAINT_MIN;
269                }
270            });
271        });
272        return;
273    }
274
275    let omp_block_stride = (len / omp_num_threads) & !15usize;
276    for omp_thread_num in 0..omp_num_threads {
277        let omp_block_start = omp_thread_num * omp_block_stride;
278        let omp_block_size = if omp_thread_num + 1 < omp_num_threads {
279            omp_block_stride
280        } else {
281            len - omp_block_start
282        };
283        for value in &mut sa[omp_block_start..omp_block_start + omp_block_size] {
284            *value ^= SAINT_MIN;
285        }
286    }
287}
288
289/// Internal helper: gather lms suffixes 8u.
290#[doc(hidden)]
291pub fn gather_lms_suffixes_8u(
292    t: &[u8],
293    sa: &mut [SaSint],
294    n: SaSint,
295    mut m: FastSint,
296    omp_block_start: FastSint,
297    omp_block_size: FastSint,
298) {
299    if omp_block_size <= 0 {
300        return;
301    }
302
303    let n = usize::try_from(n).expect("n must be non-negative");
304    let block_start =
305        usize::try_from(omp_block_start).expect("omp_block_start must be non-negative");
306    let block_size = usize::try_from(omp_block_size).expect("omp_block_size must be non-negative");
307
308    let mut j = block_start + block_size;
309    let mut c0 = t[block_start + block_size - 1] as FastSint;
310    let mut c1 = -1;
311    while j < n {
312        c1 = t[j] as FastSint;
313        if c1 != c0 {
314            break;
315        }
316        j += 1;
317    }
318
319    let mut f0 = usize::from(c0 >= c1);
320    let mut f1: usize;
321    let mut i = block_start + block_size - 2;
322    let limit = block_start + 3;
323
324    while i >= limit {
325        c1 = t[i] as FastSint;
326        f1 = usize::from(c1 > (c0 - f0 as FastSint));
327        sa[usize::try_from(m).expect("m must be non-negative")] = (i + 1) as SaSint;
328        m -= (f1 & !f0) as FastSint;
329
330        c0 = t[i - 1] as FastSint;
331        f0 = usize::from(c0 > (c1 - f1 as FastSint));
332        sa[usize::try_from(m).expect("m must be non-negative")] = i as SaSint;
333        m -= (f0 & !f1) as FastSint;
334
335        c1 = t[i - 2] as FastSint;
336        f1 = usize::from(c1 > (c0 - f0 as FastSint));
337        sa[usize::try_from(m).expect("m must be non-negative")] = (i - 1) as SaSint;
338        m -= (f1 & !f0) as FastSint;
339
340        c0 = t[i - 3] as FastSint;
341        f0 = usize::from(c0 > (c1 - f1 as FastSint));
342        sa[usize::try_from(m).expect("m must be non-negative")] = (i - 2) as SaSint;
343        m -= (f0 & !f1) as FastSint;
344
345        if i < 4 {
346            break;
347        }
348        i -= 4;
349    }
350
351    let tail_limit = limit - 3;
352    while i >= tail_limit {
353        c1 = c0;
354        c0 = t[i] as FastSint;
355        f1 = f0;
356        f0 = usize::from(c0 > (c1 - f1 as FastSint));
357        sa[usize::try_from(m).expect("m must be non-negative")] = (i + 1) as SaSint;
358        m -= (f0 & !f1) as FastSint;
359        if i == 0 {
360            break;
361        }
362        i -= 1;
363    }
364
365    sa[usize::try_from(m).expect("m must be non-negative")] = (i + 1) as SaSint;
366}
367
368/// Internal helper: gather lms suffixes 8u (OpenMP variant).
369#[doc(hidden)]
370pub fn gather_lms_suffixes_8u_omp(
371    t: &[u8],
372    sa: &mut [SaSint],
373    n: SaSint,
374    threads: SaSint,
375    thread_state: &mut [ThreadState],
376) {
377    let n_usize = usize::try_from(n).expect("n must be non-negative");
378    let omp_num_threads = if threads > 1 && n >= 65_536 {
379        usize::try_from(threads)
380            .expect("threads must be non-negative")
381            .min(thread_state.len())
382            .max(1)
383    } else {
384        1
385    };
386    if omp_num_threads == 1 {
387        gather_lms_suffixes_8u(t, sa, n, n as FastSint - 1, 0, n as FastSint);
388        return;
389    }
390
391    let omp_block_stride = (n_usize / omp_num_threads) & !15usize;
392    let mut suffix_counts_after = vec![0 as FastSint; omp_num_threads];
393    let mut m = 0 as FastSint;
394    for omp_thread_num in (0..omp_num_threads).rev() {
395        suffix_counts_after[omp_thread_num] = m;
396        m += thread_state[omp_thread_num].m;
397    }
398
399    for omp_thread_num in 0..omp_num_threads {
400        let omp_block_start = omp_thread_num * omp_block_stride;
401        let omp_block_size = if omp_thread_num + 1 < omp_num_threads {
402            omp_block_stride
403        } else {
404            n_usize - omp_block_start
405        };
406        gather_lms_suffixes_8u(
407            t,
408            sa,
409            n,
410            n as FastSint - 1 - suffix_counts_after[omp_thread_num],
411            omp_block_start as FastSint,
412            omp_block_size as FastSint,
413        );
414    }
415
416    for omp_thread_num in 0..omp_num_threads {
417        if thread_state[omp_thread_num].m > 0 {
418            let dst = usize::try_from(n as FastSint - 1 - suffix_counts_after[omp_thread_num])
419                .expect("destination must be non-negative");
420            sa[dst] = thread_state[omp_thread_num].last_lms_suffix as SaSint;
421        }
422    }
423}
424
425/// Internal helper: gather lms suffixes 32s.
426#[doc(hidden)]
427pub fn gather_lms_suffixes_32s(t: &[SaSint], sa: &mut [SaSint], n: SaSint) -> SaSint {
428    let n_usize = usize::try_from(n).expect("n must be non-negative");
429    let mut i = n as FastSint - 2;
430    let mut m = n_usize - 1;
431    let mut f0 = 1usize;
432    let mut f1: usize;
433    let mut c0 = t[n_usize - 1] as FastSint;
434    let mut c1: FastSint;
435
436    while i >= 3 {
437        c1 = t[i as usize] as FastSint;
438        f1 = usize::from(c1 > (c0 - f0 as FastSint));
439        sa[m] = (i + 1) as SaSint;
440        m -= f1 & !f0;
441
442        c0 = t[(i - 1) as usize] as FastSint;
443        f0 = usize::from(c0 > (c1 - f1 as FastSint));
444        sa[m] = i as SaSint;
445        m -= f0 & !f1;
446
447        c1 = t[(i - 2) as usize] as FastSint;
448        f1 = usize::from(c1 > (c0 - f0 as FastSint));
449        sa[m] = (i - 1) as SaSint;
450        m -= f1 & !f0;
451
452        c0 = t[(i - 3) as usize] as FastSint;
453        f0 = usize::from(c0 > (c1 - f1 as FastSint));
454        sa[m] = (i - 2) as SaSint;
455        m -= f0 & !f1;
456
457        i -= 4;
458    }
459
460    while i >= 0 {
461        c1 = c0;
462        c0 = t[i as usize] as FastSint;
463        f1 = f0;
464        f0 = usize::from(c0 > (c1 - f1 as FastSint));
465        sa[m] = (i + 1) as SaSint;
466        m -= f0 & !f1;
467        i -= 1;
468    }
469
470    (n_usize - 1 - m) as SaSint
471}
472
473/// Internal helper: gather compacted lms suffixes 32s.
474#[doc(hidden)]
475pub fn gather_compacted_lms_suffixes_32s(t: &[SaSint], sa: &mut [SaSint], n: SaSint) -> SaSint {
476    let n_usize = usize::try_from(n).expect("n must be non-negative");
477    let mut i = n as FastSint - 2;
478    let mut m = n_usize - 1;
479    let mut f0 = 1usize;
480    let mut f1: usize;
481    let mut c0 = t[n_usize - 1] as FastSint;
482    let mut c1: FastSint;
483
484    while i >= 3 {
485        c1 = t[i as usize] as FastSint;
486        f1 = usize::from(c1 > (c0 - f0 as FastSint));
487        sa[m] = (i + 1) as SaSint;
488        m -= f1 & !f0 & usize::from(c0 >= 0);
489
490        c0 = t[(i - 1) as usize] as FastSint;
491        f0 = usize::from(c0 > (c1 - f1 as FastSint));
492        sa[m] = i as SaSint;
493        m -= f0 & !f1 & usize::from(c1 >= 0);
494
495        c1 = t[(i - 2) as usize] as FastSint;
496        f1 = usize::from(c1 > (c0 - f0 as FastSint));
497        sa[m] = (i - 1) as SaSint;
498        m -= f1 & !f0 & usize::from(c0 >= 0);
499
500        c0 = t[(i - 3) as usize] as FastSint;
501        f0 = usize::from(c0 > (c1 - f1 as FastSint));
502        sa[m] = (i - 2) as SaSint;
503        m -= f0 & !f1 & usize::from(c1 >= 0);
504
505        i -= 4;
506    }
507
508    while i >= 0 {
509        c1 = c0;
510        c0 = t[i as usize] as FastSint;
511        f1 = f0;
512        f0 = usize::from(c0 > (c1 - f1 as FastSint));
513        sa[m] = (i + 1) as SaSint;
514        m -= f0 & !f1 & usize::from(c1 >= 0);
515        i -= 1;
516    }
517
518    (n_usize - 1 - m) as SaSint
519}
520
521/// Internal helper: count lms suffixes 32s 4k.
522#[doc(hidden)]
523pub fn count_lms_suffixes_32s_4k(t: &[SaSint], n: SaSint, k: SaSint, buckets: &mut [SaSint]) {
524    buckets.fill(0);
525    let n_usize = usize::try_from(n).expect("n must be non-negative");
526    let _k_usize = usize::try_from(k).expect("k must be non-negative");
527    let mut i = n as FastSint - 2;
528    let mut f0 = 1usize;
529    let mut f1: usize;
530    let mut c0 = t[n_usize - 1] as FastSint;
531    let mut c1: FastSint;
532
533    while i >= 3 {
534        c1 = t[i as usize] as FastSint;
535        f1 = usize::from(c1 > (c0 - f0 as FastSint));
536        buckets[buckets_index4(c0 as usize, f0 + f0 + f1)] += 1;
537
538        c0 = t[(i - 1) as usize] as FastSint;
539        f0 = usize::from(c0 > (c1 - f1 as FastSint));
540        buckets[buckets_index4(c1 as usize, f1 + f1 + f0)] += 1;
541
542        c1 = t[(i - 2) as usize] as FastSint;
543        f1 = usize::from(c1 > (c0 - f0 as FastSint));
544        buckets[buckets_index4(c0 as usize, f0 + f0 + f1)] += 1;
545
546        c0 = t[(i - 3) as usize] as FastSint;
547        f0 = usize::from(c0 > (c1 - f1 as FastSint));
548        buckets[buckets_index4(c1 as usize, f1 + f1 + f0)] += 1;
549
550        i -= 4;
551    }
552
553    while i >= 0 {
554        c1 = c0;
555        c0 = t[i as usize] as FastSint;
556        f1 = f0;
557        f0 = usize::from(c0 > (c1 - f1 as FastSint));
558        buckets[buckets_index4(c1 as usize, f1 + f1 + f0)] += 1;
559        i -= 1;
560    }
561
562    buckets[buckets_index4(c0 as usize, f0 + f0)] += 1;
563}
564
565/// Internal helper: count lms suffixes 32s 2k.
566#[doc(hidden)]
567pub fn count_lms_suffixes_32s_2k(t: &[SaSint], n: SaSint, k: SaSint, buckets: &mut [SaSint]) {
568    buckets.fill(0);
569    let n_usize = usize::try_from(n).expect("n must be non-negative");
570    let _k_usize = usize::try_from(k).expect("k must be non-negative");
571    let mut i = n as FastSint - 2;
572    let mut f0 = 1usize;
573    let mut f1: usize;
574    let mut c0 = t[n_usize - 1] as FastSint;
575    let mut c1: FastSint;
576
577    while i >= 3 {
578        c1 = t[i as usize] as FastSint;
579        f1 = usize::from(c1 > (c0 - f0 as FastSint));
580        buckets[buckets_index2(c0 as usize, f1 & !f0)] += 1;
581
582        c0 = t[(i - 1) as usize] as FastSint;
583        f0 = usize::from(c0 > (c1 - f1 as FastSint));
584        buckets[buckets_index2(c1 as usize, f0 & !f1)] += 1;
585
586        c1 = t[(i - 2) as usize] as FastSint;
587        f1 = usize::from(c1 > (c0 - f0 as FastSint));
588        buckets[buckets_index2(c0 as usize, f1 & !f0)] += 1;
589
590        c0 = t[(i - 3) as usize] as FastSint;
591        f0 = usize::from(c0 > (c1 - f1 as FastSint));
592        buckets[buckets_index2(c1 as usize, f0 & !f1)] += 1;
593
594        i -= 4;
595    }
596
597    while i >= 0 {
598        c1 = c0;
599        c0 = t[i as usize] as FastSint;
600        f1 = f0;
601        f0 = usize::from(c0 > (c1 - f1 as FastSint));
602        buckets[buckets_index2(c1 as usize, f0 & !f1)] += 1;
603        i -= 1;
604    }
605
606    buckets[buckets_index2(c0 as usize, 0)] += 1;
607}
608
609/// Internal helper: count compacted lms suffixes 32s 2k.
610#[doc(hidden)]
611pub fn count_compacted_lms_suffixes_32s_2k(
612    t: &[SaSint],
613    n: SaSint,
614    k: SaSint,
615    buckets: &mut [SaSint],
616) {
617    buckets.fill(0);
618    let n_usize = usize::try_from(n).expect("n must be non-negative");
619    let _k_usize = usize::try_from(k).expect("k must be non-negative");
620    let mut i = n as FastSint - 2;
621    let mut f0 = 1usize;
622    let mut f1: usize;
623    let mut c0 = t[n_usize - 1] as FastSint;
624    let mut c1: FastSint;
625
626    while i >= 3 {
627        c1 = t[i as usize] as FastSint;
628        f1 = usize::from(c1 > (c0 - f0 as FastSint));
629        buckets[buckets_index2((c0 as SaSint & SAINT_MAX) as usize, f1 & !f0)] += 1;
630
631        c0 = t[(i - 1) as usize] as FastSint;
632        f0 = usize::from(c0 > (c1 - f1 as FastSint));
633        buckets[buckets_index2((c1 as SaSint & SAINT_MAX) as usize, f0 & !f1)] += 1;
634
635        c1 = t[(i - 2) as usize] as FastSint;
636        f1 = usize::from(c1 > (c0 - f0 as FastSint));
637        buckets[buckets_index2((c0 as SaSint & SAINT_MAX) as usize, f1 & !f0)] += 1;
638
639        c0 = t[(i - 3) as usize] as FastSint;
640        f0 = usize::from(c0 > (c1 - f1 as FastSint));
641        buckets[buckets_index2((c1 as SaSint & SAINT_MAX) as usize, f0 & !f1)] += 1;
642
643        i -= 4;
644    }
645
646    while i >= 0 {
647        c1 = c0;
648        c0 = t[i as usize] as FastSint;
649        f1 = f0;
650        f0 = usize::from(c0 > (c1 - f1 as FastSint));
651        buckets[buckets_index2((c1 as SaSint & SAINT_MAX) as usize, f0 & !f1)] += 1;
652        i -= 1;
653    }
654
655    buckets[buckets_index2((c0 as SaSint & SAINT_MAX) as usize, 0)] += 1;
656}
657
658/// Internal helper: count and gather lms suffixes 8u.
659#[doc(hidden)]
660pub fn count_and_gather_lms_suffixes_8u(
661    t: &[u8],
662    sa: &mut [SaSint],
663    n: SaSint,
664    buckets: &mut [SaSint],
665    omp_block_start: FastSint,
666    omp_block_size: FastSint,
667) -> SaSint {
668    buckets.fill(0);
669    let n = n as FastSint;
670    let mut m = omp_block_start + omp_block_size - 1;
671
672    if omp_block_size > 0 {
673        let prefetch_distance = 256 as FastSint;
674        let mut j = m + 1;
675        let mut c0 = t[m as usize] as FastSint;
676        let mut c1 = -1;
677        while j < n {
678            c1 = t[j as usize] as FastSint;
679            if c1 != c0 {
680                break;
681            }
682            j += 1;
683        }
684
685        let mut f0 = usize::from(c0 >= c1);
686        let mut f1: usize;
687        let mut i = m - 1;
688        let limit = omp_block_start + 3;
689
690        while i >= limit {
691            let _prefetch_index = i - prefetch_distance;
692            c1 = t[i as usize] as FastSint;
693            f1 = usize::from(c1 > (c0 - f0 as FastSint));
694            sa[m as usize] = (i + 1) as SaSint;
695            m -= (f1 & !f0) as FastSint;
696            buckets[buckets_index4(c0 as usize, f0 + f0 + f1)] += 1;
697
698            c0 = t[(i - 1) as usize] as FastSint;
699            f0 = usize::from(c0 > (c1 - f1 as FastSint));
700            sa[m as usize] = i as SaSint;
701            m -= (f0 & !f1) as FastSint;
702            buckets[buckets_index4(c1 as usize, f1 + f1 + f0)] += 1;
703
704            c1 = t[(i - 2) as usize] as FastSint;
705            f1 = usize::from(c1 > (c0 - f0 as FastSint));
706            sa[m as usize] = (i - 1) as SaSint;
707            m -= (f1 & !f0) as FastSint;
708            buckets[buckets_index4(c0 as usize, f0 + f0 + f1)] += 1;
709
710            c0 = t[(i - 3) as usize] as FastSint;
711            f0 = usize::from(c0 > (c1 - f1 as FastSint));
712            sa[m as usize] = (i - 2) as SaSint;
713            m -= (f0 & !f1) as FastSint;
714            buckets[buckets_index4(c1 as usize, f1 + f1 + f0)] += 1;
715
716            i -= 4;
717        }
718
719        let tail_limit = limit - 3;
720        while i >= tail_limit {
721            c1 = c0;
722            c0 = t[i as usize] as FastSint;
723            f1 = f0;
724            f0 = usize::from(c0 > (c1 - f1 as FastSint));
725            sa[m as usize] = (i + 1) as SaSint;
726            m -= (f0 & !f1) as FastSint;
727            buckets[buckets_index4(c1 as usize, f1 + f1 + f0)] += 1;
728            i -= 1;
729        }
730
731        c1 = if i >= 0 {
732            t[i as usize] as FastSint
733        } else {
734            -1
735        };
736        f1 = usize::from(c1 > (c0 - f0 as FastSint));
737        sa[m as usize] = (i + 1) as SaSint;
738        m -= (f1 & !f0) as FastSint;
739        buckets[buckets_index4(c0 as usize, f0 + f0 + f1)] += 1;
740    }
741
742    (omp_block_start + omp_block_size - 1 - m) as SaSint
743}
744
745/// Internal helper: count and gather lms suffixes 8u (OpenMP variant).
746#[doc(hidden)]
747pub fn count_and_gather_lms_suffixes_8u_omp(
748    t: &[u8],
749    sa: &mut [SaSint],
750    n: SaSint,
751    buckets: &mut [SaSint],
752    threads: SaSint,
753    thread_state: &mut [ThreadState],
754) -> SaSint {
755    let mut m = 0;
756    let n_usize = usize::try_from(n).expect("n must be non-negative");
757    let omp_num_threads = if threads > 1 && n >= 65_536 {
758        usize::try_from(threads)
759            .expect("threads must be non-negative")
760            .min(thread_state.len())
761            .max(1)
762    } else {
763        1
764    };
765    let omp_block_stride = (n_usize / omp_num_threads) & !15usize;
766
767    if omp_num_threads == 1 {
768        return count_and_gather_lms_suffixes_8u(t, sa, n, buckets, 0, n as FastSint);
769    }
770
771    for omp_thread_num in 0..omp_num_threads {
772        let omp_block_start = omp_thread_num * omp_block_stride;
773        let omp_block_size = if omp_thread_num + 1 < omp_num_threads {
774            omp_block_stride
775        } else {
776            n_usize - omp_block_start
777        };
778
779        let state = &mut thread_state[omp_thread_num];
780        state.position = FastSint::try_from(omp_block_start + omp_block_size)
781            .expect("position must fit FastSint");
782        state.m = FastSint::try_from(count_and_gather_lms_suffixes_8u(
783            t,
784            sa,
785            n,
786            &mut state.buckets,
787            FastSint::try_from(omp_block_start).expect("block start must fit FastSint"),
788            FastSint::try_from(omp_block_size).expect("block size must fit FastSint"),
789        ))
790        .expect("m must fit FastSint");
791
792        if state.m > 0 {
793            let position = usize::try_from(state.position).expect("position must be non-negative");
794            state.last_lms_suffix =
795                FastSint::try_from(sa[position - 1]).expect("suffix must fit FastSint");
796        }
797    }
798
799    buckets.fill(0);
800
801    for tnum in (0..omp_num_threads).rev() {
802        let state = &mut thread_state[tnum];
803        m += SaSint::try_from(state.m).expect("m must fit SaSint");
804
805        if tnum + 1 < omp_num_threads && state.m > 0 {
806            let position = usize::try_from(state.position).expect("position must be non-negative");
807            let count = usize::try_from(state.m).expect("m must be non-negative");
808            let dst = n_usize - usize::try_from(m).expect("m must be non-negative");
809            sa.copy_within(position - count..position, dst);
810        }
811
812        for s in 0..4 * ALPHABET_SIZE {
813            let a = buckets[s];
814            let b = state.buckets[s];
815            buckets[s] = a + b;
816            state.buckets[s] = a;
817        }
818    }
819
820    m
821}
822
823/// Internal helper: count and gather lms suffixes 32s 4k.
824#[doc(hidden)]
825pub fn count_and_gather_lms_suffixes_32s_4k(
826    t: &[SaSint],
827    sa: &mut [SaSint],
828    n: SaSint,
829    k: SaSint,
830    buckets: &mut [SaSint],
831    omp_block_start: FastSint,
832    omp_block_size: FastSint,
833) -> SaSint {
834    buckets.fill(0);
835    let n = n as FastSint;
836    let _k = k as FastSint;
837    let mut m = omp_block_start + omp_block_size - 1;
838
839    if omp_block_size > 0 {
840        let prefetch_distance = 64 as FastSint;
841        let mut j = m + 1;
842        let mut c0 = t[m as usize] as FastSint;
843        let mut c1 = -1;
844
845        while j < n {
846            c1 = t[j as usize] as FastSint;
847            if c1 != c0 {
848                break;
849            }
850            j += 1;
851        }
852
853        let mut f0 = usize::from(c0 >= c1);
854        let mut f1: usize;
855        let mut i = m - 1;
856        let limit = omp_block_start + prefetch_distance + 3;
857
858        while i >= limit {
859            let _prefetch_index = i - 2 * prefetch_distance;
860            c1 = t[i as usize] as FastSint;
861            f1 = usize::from(c1 > (c0 - f0 as FastSint));
862            sa[m as usize] = (i + 1) as SaSint;
863            m -= (f1 & !f0) as FastSint;
864            buckets[buckets_index4(c0 as usize, f0 + f0 + f1)] += 1;
865
866            c0 = t[(i - 1) as usize] as FastSint;
867            f0 = usize::from(c0 > (c1 - f1 as FastSint));
868            sa[m as usize] = i as SaSint;
869            m -= (f0 & !f1) as FastSint;
870            buckets[buckets_index4(c1 as usize, f1 + f1 + f0)] += 1;
871
872            c1 = t[(i - 2) as usize] as FastSint;
873            f1 = usize::from(c1 > (c0 - f0 as FastSint));
874            sa[m as usize] = (i - 1) as SaSint;
875            m -= (f1 & !f0) as FastSint;
876            buckets[buckets_index4(c0 as usize, f0 + f0 + f1)] += 1;
877
878            c0 = t[(i - 3) as usize] as FastSint;
879            f0 = usize::from(c0 > (c1 - f1 as FastSint));
880            sa[m as usize] = (i - 2) as SaSint;
881            m -= (f0 & !f1) as FastSint;
882            buckets[buckets_index4(c1 as usize, f1 + f1 + f0)] += 1;
883
884            i -= 4;
885        }
886
887        let tail_limit = omp_block_start;
888        while i >= tail_limit {
889            c1 = c0;
890            c0 = t[i as usize] as FastSint;
891            f1 = f0;
892            f0 = usize::from(c0 > (c1 - f1 as FastSint));
893            sa[m as usize] = (i + 1) as SaSint;
894            m -= (f0 & !f1) as FastSint;
895            buckets[buckets_index4(c1 as usize, f1 + f1 + f0)] += 1;
896            i -= 1;
897        }
898
899        c1 = if i >= 0 {
900            t[i as usize] as FastSint
901        } else {
902            -1
903        };
904        f1 = usize::from(c1 > (c0 - f0 as FastSint));
905        sa[m as usize] = (i + 1) as SaSint;
906        m -= (f1 & !f0) as FastSint;
907        buckets[buckets_index4(c0 as usize, f0 + f0 + f1)] += 1;
908    }
909
910    (omp_block_start + omp_block_size - 1 - m) as SaSint
911}
912
913/// Internal helper: count and gather lms suffixes 32s 2k.
914#[doc(hidden)]
915pub fn count_and_gather_lms_suffixes_32s_2k(
916    t: &[SaSint],
917    sa: &mut [SaSint],
918    n: SaSint,
919    k: SaSint,
920    buckets: &mut [SaSint],
921    omp_block_start: FastSint,
922    omp_block_size: FastSint,
923) -> SaSint {
924    buckets.fill(0);
925    let n = n as FastSint;
926    let _k = k as FastSint;
927    let mut m = omp_block_start + omp_block_size - 1;
928
929    if omp_block_size > 0 {
930        let prefetch_distance = 64 as FastSint;
931        let mut j = m + 1;
932        let mut c0 = t[m as usize] as FastSint;
933        let mut c1 = -1;
934
935        while j < n {
936            c1 = t[j as usize] as FastSint;
937            if c1 != c0 {
938                break;
939            }
940            j += 1;
941        }
942
943        let mut f0 = usize::from(c0 >= c1);
944        let mut f1: usize;
945        let mut i = m - 1;
946        let limit = omp_block_start + prefetch_distance + 3;
947
948        while i >= limit {
949            let _prefetch_index = i - 2 * prefetch_distance;
950            c1 = t[i as usize] as FastSint;
951            f1 = usize::from(c1 > (c0 - f0 as FastSint));
952            sa[m as usize] = (i + 1) as SaSint;
953            m -= (f1 & !f0) as FastSint;
954            buckets[buckets_index2(c0 as usize, f1 & !f0)] += 1;
955
956            c0 = t[(i - 1) as usize] as FastSint;
957            f0 = usize::from(c0 > (c1 - f1 as FastSint));
958            sa[m as usize] = i as SaSint;
959            m -= (f0 & !f1) as FastSint;
960            buckets[buckets_index2(c1 as usize, f0 & !f1)] += 1;
961
962            c1 = t[(i - 2) as usize] as FastSint;
963            f1 = usize::from(c1 > (c0 - f0 as FastSint));
964            sa[m as usize] = (i - 1) as SaSint;
965            m -= (f1 & !f0) as FastSint;
966            buckets[buckets_index2(c0 as usize, f1 & !f0)] += 1;
967
968            c0 = t[(i - 3) as usize] as FastSint;
969            f0 = usize::from(c0 > (c1 - f1 as FastSint));
970            sa[m as usize] = (i - 2) as SaSint;
971            m -= (f0 & !f1) as FastSint;
972            buckets[buckets_index2(c1 as usize, f0 & !f1)] += 1;
973
974            i -= 4;
975        }
976
977        let tail_limit = omp_block_start;
978        while i >= tail_limit {
979            c1 = c0;
980            c0 = t[i as usize] as FastSint;
981            f1 = f0;
982            f0 = usize::from(c0 > (c1 - f1 as FastSint));
983            sa[m as usize] = (i + 1) as SaSint;
984            m -= (f0 & !f1) as FastSint;
985            buckets[buckets_index2(c1 as usize, f0 & !f1)] += 1;
986            i -= 1;
987        }
988
989        c1 = if i >= 0 {
990            t[i as usize] as FastSint
991        } else {
992            -1
993        };
994        f1 = usize::from(c1 > (c0 - f0 as FastSint));
995        sa[m as usize] = (i + 1) as SaSint;
996        m -= (f1 & !f0) as FastSint;
997        buckets[buckets_index2(c0 as usize, f1 & !f0)] += 1;
998    }
999
1000    (omp_block_start + omp_block_size - 1 - m) as SaSint
1001}
1002
1003/// Internal helper: count and gather compacted lms suffixes 32s 2k.
1004#[doc(hidden)]
1005pub fn count_and_gather_compacted_lms_suffixes_32s_2k(
1006    t: &[SaSint],
1007    sa: &mut [SaSint],
1008    n: SaSint,
1009    k: SaSint,
1010    buckets: &mut [SaSint],
1011    omp_block_start: FastSint,
1012    omp_block_size: FastSint,
1013) -> SaSint {
1014    buckets.fill(0);
1015    let n_usize = usize::try_from(n).expect("n must be non-negative");
1016    let _k_usize = usize::try_from(k).expect("k must be non-negative");
1017    let block_start =
1018        usize::try_from(omp_block_start).expect("omp_block_start must be non-negative");
1019    let block_size = usize::try_from(omp_block_size).expect("omp_block_size must be non-negative");
1020    let mut m = block_start + block_size - 1;
1021
1022    if omp_block_size > 0 {
1023        let mut j = m + 1;
1024        let mut c0 = t[m] as FastSint;
1025        let mut c1 = -1;
1026
1027        while j < n_usize {
1028            c1 = t[j] as FastSint;
1029            if c1 != c0 {
1030                break;
1031            }
1032            j += 1;
1033        }
1034
1035        let mut f0 = usize::from(c0 >= c1);
1036        let mut f1: usize;
1037        let mut i = m as FastSint - 1;
1038        let limit = block_start as FastSint + 3;
1039
1040        while i >= limit {
1041            c1 = t[i as usize] as FastSint;
1042            f1 = usize::from(c1 > (c0 - f0 as FastSint));
1043            sa[m] = (i + 1) as SaSint;
1044            m -= f1 & !f0 & usize::from(c0 >= 0);
1045            buckets[buckets_index2((c0 as SaSint & SAINT_MAX) as usize, f1 & !f0)] += 1;
1046
1047            c0 = t[(i - 1) as usize] as FastSint;
1048            f0 = usize::from(c0 > (c1 - f1 as FastSint));
1049            sa[m] = i as SaSint;
1050            m -= f0 & !f1 & usize::from(c1 >= 0);
1051            buckets[buckets_index2((c1 as SaSint & SAINT_MAX) as usize, f0 & !f1)] += 1;
1052
1053            c1 = t[(i - 2) as usize] as FastSint;
1054            f1 = usize::from(c1 > (c0 - f0 as FastSint));
1055            sa[m] = (i - 1) as SaSint;
1056            m -= f1 & !f0 & usize::from(c0 >= 0);
1057            buckets[buckets_index2((c0 as SaSint & SAINT_MAX) as usize, f1 & !f0)] += 1;
1058
1059            c0 = t[(i - 3) as usize] as FastSint;
1060            f0 = usize::from(c0 > (c1 - f1 as FastSint));
1061            sa[m] = (i - 2) as SaSint;
1062            m -= f0 & !f1 & usize::from(c1 >= 0);
1063            buckets[buckets_index2((c1 as SaSint & SAINT_MAX) as usize, f0 & !f1)] += 1;
1064
1065            i -= 4;
1066        }
1067
1068        let tail_limit = block_start as FastSint;
1069        while i >= tail_limit {
1070            c1 = c0;
1071            c0 = t[i as usize] as FastSint;
1072            f1 = f0;
1073            f0 = usize::from(c0 > (c1 - f1 as FastSint));
1074            sa[m] = (i + 1) as SaSint;
1075            m -= f0 & !f1 & usize::from(c1 >= 0);
1076            buckets[buckets_index2((c1 as SaSint & SAINT_MAX) as usize, f0 & !f1)] += 1;
1077            i -= 1;
1078        }
1079
1080        c1 = if i >= 0 {
1081            t[i as usize] as FastSint
1082        } else {
1083            -1
1084        };
1085        f1 = usize::from(c1 > (c0 - f0 as FastSint));
1086        sa[m] = (i + 1) as SaSint;
1087        m -= f1 & !f0 & usize::from(c0 >= 0);
1088        buckets[buckets_index2((c0 as SaSint & SAINT_MAX) as usize, f1 & !f0)] += 1;
1089    }
1090
1091    (block_start + block_size - 1 - m) as SaSint
1092}
1093
1094/// Internal helper: get bucket stride.
1095#[doc(hidden)]
1096pub fn get_bucket_stride(
1097    free_space: FastSint,
1098    bucket_size: FastSint,
1099    num_buckets: FastSint,
1100) -> FastSint {
1101    let bucket_size_1024 = (bucket_size + 1023) & (-1024);
1102    if free_space / (num_buckets - 1) >= bucket_size_1024 {
1103        return bucket_size_1024;
1104    }
1105    let bucket_size_16 = (bucket_size + 15) & (-16);
1106    if free_space / (num_buckets - 1) >= bucket_size_16 {
1107        return bucket_size_16;
1108    }
1109    bucket_size
1110}
1111
1112/// Internal helper: count and gather lms suffixes 32s 4k nofs (OpenMP variant).
1113#[doc(hidden)]
1114pub fn count_and_gather_lms_suffixes_32s_4k_nofs_omp(
1115    t: &[SaSint],
1116    sa: &mut [SaSint],
1117    n: SaSint,
1118    k: SaSint,
1119    buckets: &mut [SaSint],
1120    threads: SaSint,
1121) -> SaSint {
1122    let m;
1123    let omp_num_threads = if threads > 1 && n >= 65_536 { 2 } else { 1 };
1124
1125    if omp_num_threads == 1 {
1126        m = count_and_gather_lms_suffixes_32s_4k(t, sa, n, k, buckets, 0, n as FastSint);
1127    } else {
1128        count_lms_suffixes_32s_4k(t, n, k, buckets);
1129        m = gather_lms_suffixes_32s(t, sa, n);
1130    }
1131
1132    m
1133}
1134
1135/// Internal helper: count and gather lms suffixes 32s 2k nofs (OpenMP variant).
1136#[doc(hidden)]
1137pub fn count_and_gather_lms_suffixes_32s_2k_nofs_omp(
1138    t: &[SaSint],
1139    sa: &mut [SaSint],
1140    n: SaSint,
1141    k: SaSint,
1142    buckets: &mut [SaSint],
1143    threads: SaSint,
1144) -> SaSint {
1145    let m;
1146    let omp_num_threads = if threads > 1 && n >= 65_536 { 2 } else { 1 };
1147
1148    if omp_num_threads == 1 {
1149        m = count_and_gather_lms_suffixes_32s_2k(t, sa, n, k, buckets, 0, n as FastSint);
1150    } else {
1151        count_lms_suffixes_32s_2k(t, n, k, buckets);
1152        m = gather_lms_suffixes_32s(t, sa, n);
1153    }
1154
1155    m
1156}
1157
1158/// Internal helper: count and gather compacted lms suffixes 32s 2k nofs (OpenMP variant).
1159#[doc(hidden)]
1160pub fn count_and_gather_compacted_lms_suffixes_32s_2k_nofs_omp(
1161    t: &[SaSint],
1162    sa: &mut [SaSint],
1163    n: SaSint,
1164    k: SaSint,
1165    buckets: &mut [SaSint],
1166    threads: SaSint,
1167) -> SaSint {
1168    let m;
1169    let omp_num_threads = if threads > 1 && n >= 65_536 { 2 } else { 1 };
1170
1171    if omp_num_threads == 1 {
1172        m = count_and_gather_compacted_lms_suffixes_32s_2k(t, sa, n, k, buckets, 0, n as FastSint);
1173    } else {
1174        count_compacted_lms_suffixes_32s_2k(t, n, k, buckets);
1175        m = gather_compacted_lms_suffixes_32s(t, sa, n);
1176    }
1177
1178    m
1179}
1180
1181/// Internal helper: count and gather lms suffixes 32s 4k fs (OpenMP variant).
1182#[doc(hidden)]
1183pub fn count_and_gather_lms_suffixes_32s_4k_fs_omp(
1184    t: &[SaSint],
1185    sa: &mut [SaSint],
1186    n: SaSint,
1187    k: SaSint,
1188    buckets: &mut [SaSint],
1189    local_buckets: SaSint,
1190    threads: SaSint,
1191    thread_state: &mut [ThreadState],
1192) -> SaSint {
1193    let n_usize = usize::try_from(n).expect("n must be non-negative");
1194    let k_usize = usize::try_from(k).expect("k must be non-negative");
1195    let omp_num_threads = usize::try_from(threads).expect("threads must be non-negative");
1196    let bucket_size = FastSint::try_from(4 * k_usize).expect("bucket size must fit FastSint");
1197
1198    if omp_num_threads <= 1 || n < 65_536 {
1199        return count_and_gather_lms_suffixes_32s_4k(t, sa, n, k, buckets, 0, n as FastSint);
1200    }
1201
1202    let omp_block_stride = (n_usize / omp_num_threads) & !15usize;
1203    let free_space = if local_buckets == 1 {
1204        FastSint::try_from(LIBSAIS_LOCAL_BUFFER_SIZE).expect("free space must fit FastSint")
1205    } else if local_buckets > 1 {
1206        FastSint::try_from(local_buckets).expect("free space must fit FastSint")
1207    } else {
1208        FastSint::try_from(buckets.len()).expect("free space must fit FastSint")
1209    };
1210    let bucket_stride = get_bucket_stride(
1211        free_space,
1212        bucket_size,
1213        FastSint::try_from(omp_num_threads).expect("thread count must fit FastSint"),
1214    );
1215    let bucket_size_usize = usize::try_from(bucket_size).expect("bucket size must be non-negative");
1216    let bucket_stride_usize =
1217        usize::try_from(bucket_stride).expect("bucket stride must be non-negative");
1218    let workspace_len =
1219        bucket_size_usize + bucket_stride_usize.saturating_mul(omp_num_threads.saturating_sub(1));
1220    let mut workspace = vec![0; workspace_len];
1221
1222    for omp_thread_num in 0..omp_num_threads {
1223        let omp_block_start = omp_thread_num * omp_block_stride;
1224        let omp_block_size = if omp_thread_num + 1 < omp_num_threads {
1225            omp_block_stride
1226        } else {
1227            n_usize - omp_block_start
1228        };
1229        let workspace_end = workspace_len - omp_thread_num * bucket_stride_usize;
1230        let workspace_start = workspace_end - bucket_size_usize;
1231        let count = count_and_gather_lms_suffixes_32s_4k(
1232            t,
1233            sa,
1234            n,
1235            k,
1236            &mut workspace[workspace_start..workspace_end],
1237            omp_block_start as FastSint,
1238            omp_block_size as FastSint,
1239        );
1240
1241        thread_state[omp_thread_num].position = (omp_block_start + omp_block_size) as FastSint;
1242        thread_state[omp_thread_num].count = count as FastSint;
1243    }
1244
1245    let mut m = 0;
1246    for t in (0..omp_num_threads).rev() {
1247        m += thread_state[t].count as SaSint;
1248
1249        if t + 1 != omp_num_threads && thread_state[t].count > 0 {
1250            let src_end =
1251                usize::try_from(thread_state[t].position).expect("position must be non-negative");
1252            let src_start = src_end
1253                - usize::try_from(thread_state[t].count).expect("count must be non-negative");
1254            let dst_start = usize::try_from(n - m).expect("destination must be non-negative");
1255            sa.copy_within(src_start..src_end, dst_start);
1256        }
1257    }
1258
1259    let omp_num_threads = omp_num_threads - 1;
1260    let omp_block_stride = (bucket_size_usize / omp_num_threads) & !15usize;
1261    for omp_thread_num in 0..omp_num_threads {
1262        let omp_block_start = omp_thread_num * omp_block_stride;
1263        let omp_block_size = if omp_thread_num + 1 < omp_num_threads {
1264            omp_block_stride
1265        } else {
1266            bucket_size_usize - omp_block_start
1267        };
1268        accumulate_counts_s32(
1269            &mut workspace[omp_block_start..],
1270            omp_block_size as FastSint,
1271            bucket_stride,
1272            FastSint::try_from(omp_num_threads + 1).expect("thread count must fit FastSint"),
1273        );
1274    }
1275
1276    let accumulated_start = omp_num_threads * bucket_stride_usize;
1277    buckets[..bucket_size_usize]
1278        .copy_from_slice(&workspace[accumulated_start..accumulated_start + bucket_size_usize]);
1279    m
1280}
1281
1282/// Internal helper: count and gather lms suffixes 32s 2k fs (OpenMP variant).
1283#[doc(hidden)]
1284pub fn count_and_gather_lms_suffixes_32s_2k_fs_omp(
1285    t: &[SaSint],
1286    sa: &mut [SaSint],
1287    n: SaSint,
1288    k: SaSint,
1289    buckets: &mut [SaSint],
1290    local_buckets: SaSint,
1291    threads: SaSint,
1292    thread_state: &mut [ThreadState],
1293) -> SaSint {
1294    let n_usize = usize::try_from(n).expect("n must be non-negative");
1295    let k_usize = usize::try_from(k).expect("k must be non-negative");
1296    let omp_num_threads = usize::try_from(threads).expect("threads must be non-negative");
1297    let bucket_size = FastSint::try_from(2 * k_usize).expect("bucket size must fit FastSint");
1298
1299    if omp_num_threads <= 1 || n < 65_536 {
1300        return count_and_gather_lms_suffixes_32s_2k(t, sa, n, k, buckets, 0, n as FastSint);
1301    }
1302
1303    let omp_block_stride = (n_usize / omp_num_threads) & !15usize;
1304    let free_space = if local_buckets == 1 {
1305        FastSint::try_from(LIBSAIS_LOCAL_BUFFER_SIZE).expect("free space must fit FastSint")
1306    } else if local_buckets > 1 {
1307        FastSint::try_from(local_buckets).expect("free space must fit FastSint")
1308    } else {
1309        FastSint::try_from(buckets.len()).expect("free space must fit FastSint")
1310    };
1311    let bucket_stride = get_bucket_stride(
1312        free_space,
1313        bucket_size,
1314        FastSint::try_from(omp_num_threads).expect("thread count must fit FastSint"),
1315    );
1316    let bucket_size_usize = usize::try_from(bucket_size).expect("bucket size must be non-negative");
1317    let bucket_stride_usize =
1318        usize::try_from(bucket_stride).expect("bucket stride must be non-negative");
1319    let workspace_len =
1320        bucket_size_usize + bucket_stride_usize.saturating_mul(omp_num_threads.saturating_sub(1));
1321    let mut workspace = vec![0; workspace_len];
1322
1323    for omp_thread_num in 0..omp_num_threads {
1324        let omp_block_start = omp_thread_num * omp_block_stride;
1325        let omp_block_size = if omp_thread_num + 1 < omp_num_threads {
1326            omp_block_stride
1327        } else {
1328            n_usize - omp_block_start
1329        };
1330        let workspace_end = workspace_len - omp_thread_num * bucket_stride_usize;
1331        let workspace_start = workspace_end - bucket_size_usize;
1332        let count = count_and_gather_lms_suffixes_32s_2k(
1333            t,
1334            sa,
1335            n,
1336            k,
1337            &mut workspace[workspace_start..workspace_end],
1338            omp_block_start as FastSint,
1339            omp_block_size as FastSint,
1340        );
1341
1342        thread_state[omp_thread_num].position = (omp_block_start + omp_block_size) as FastSint;
1343        thread_state[omp_thread_num].count = count as FastSint;
1344    }
1345
1346    let mut m = 0;
1347    for t in (0..omp_num_threads).rev() {
1348        m += thread_state[t].count as SaSint;
1349        if t + 1 != omp_num_threads && thread_state[t].count > 0 {
1350            let src_end =
1351                usize::try_from(thread_state[t].position).expect("position must be non-negative");
1352            let src_start = src_end
1353                - usize::try_from(thread_state[t].count).expect("count must be non-negative");
1354            let dst_start = usize::try_from(n - m).expect("destination must be non-negative");
1355            sa.copy_within(src_start..src_end, dst_start);
1356        }
1357    }
1358
1359    let omp_num_threads = omp_num_threads - 1;
1360    let omp_block_stride = (bucket_size_usize / omp_num_threads) & !15usize;
1361    for omp_thread_num in 0..omp_num_threads {
1362        let omp_block_start = omp_thread_num * omp_block_stride;
1363        let omp_block_size = if omp_thread_num + 1 < omp_num_threads {
1364            omp_block_stride
1365        } else {
1366            bucket_size_usize - omp_block_start
1367        };
1368        accumulate_counts_s32(
1369            &mut workspace[omp_block_start..],
1370            omp_block_size as FastSint,
1371            bucket_stride,
1372            FastSint::try_from(omp_num_threads + 1).expect("thread count must fit FastSint"),
1373        );
1374    }
1375
1376    let accumulated_start = omp_num_threads * bucket_stride_usize;
1377    buckets[..bucket_size_usize]
1378        .copy_from_slice(&workspace[accumulated_start..accumulated_start + bucket_size_usize]);
1379    m
1380}
1381
1382/// Internal helper: count and gather compacted lms suffixes 32s 2k fs (OpenMP variant).
1383#[doc(hidden)]
1384pub fn count_and_gather_compacted_lms_suffixes_32s_2k_fs_omp(
1385    t: &[SaSint],
1386    sa: &mut [SaSint],
1387    n: SaSint,
1388    k: SaSint,
1389    buckets: &mut [SaSint],
1390    _local_buckets: SaSint,
1391    threads: SaSint,
1392    thread_state: &mut [ThreadState],
1393) {
1394    let n_usize = usize::try_from(n).expect("n must be non-negative");
1395    let k_usize = usize::try_from(k).expect("k must be non-negative");
1396    let thread_count = usize::try_from(threads).expect("threads must be non-negative");
1397    let bucket_size = 2 * k_usize;
1398
1399    if thread_count <= 1 || n < 65_536 {
1400        let _ =
1401            count_and_gather_compacted_lms_suffixes_32s_2k(t, sa, n, k, buckets, 0, n as FastSint);
1402        return;
1403    }
1404
1405    if thread_state.len() < thread_count || sa.len() < 2 * n_usize {
1406        let _ =
1407            count_and_gather_compacted_lms_suffixes_32s_2k_nofs_omp(t, sa, n, k, buckets, threads);
1408        return;
1409    }
1410
1411    let omp_block_stride = (n_usize / thread_count) & !15usize;
1412    let free_space = if _local_buckets != 0 {
1413        FastSint::try_from(LIBSAIS_LOCAL_BUFFER_SIZE).expect("free space must fit FastSint")
1414    } else {
1415        FastSint::try_from(buckets.len()).expect("free space must fit FastSint")
1416    };
1417    let bucket_stride = get_bucket_stride(
1418        free_space,
1419        FastSint::try_from(bucket_size).expect("bucket size must fit FastSint"),
1420        FastSint::try_from(thread_count).expect("thread count must fit FastSint"),
1421    );
1422    let bucket_stride_usize =
1423        usize::try_from(bucket_stride).expect("bucket stride must be non-negative");
1424    let workspace_len =
1425        bucket_size + bucket_stride_usize.saturating_mul(thread_count.saturating_sub(1));
1426    let mut workspace = vec![0; workspace_len];
1427
1428    for omp_thread_num in 0..thread_count {
1429        let omp_block_start = omp_thread_num * omp_block_stride;
1430        let omp_block_size = if omp_thread_num + 1 < thread_count {
1431            omp_block_stride
1432        } else {
1433            n_usize - omp_block_start
1434        };
1435
1436        let workspace_end = workspace_len - omp_thread_num * bucket_stride_usize;
1437        let workspace_start = workspace_end - bucket_size;
1438        let count = count_and_gather_compacted_lms_suffixes_32s_2k(
1439            t,
1440            &mut sa[n_usize..],
1441            n,
1442            k,
1443            &mut workspace[workspace_start..workspace_end],
1444            omp_block_start as FastSint,
1445            omp_block_size as FastSint,
1446        );
1447
1448        if omp_thread_num < thread_state.len() {
1449            thread_state[omp_thread_num].position = (omp_block_start + omp_block_size) as FastSint;
1450            thread_state[omp_thread_num].count = count as FastSint;
1451        }
1452    }
1453
1454    let mut m = 0usize;
1455    for omp_thread_num in (0..thread_count).rev() {
1456        let count = usize::try_from(thread_state[omp_thread_num].count)
1457            .expect("count must be non-negative");
1458        m += count;
1459        if count > 0 {
1460            let position = usize::try_from(thread_state[omp_thread_num].position)
1461                .expect("position must be non-negative");
1462            let src_start = n_usize + position - count;
1463            let src_end = n_usize + position;
1464            let dst_start = n_usize - m;
1465            sa.copy_within(src_start..src_end, dst_start);
1466        }
1467    }
1468
1469    let accumulation_threads = thread_count;
1470    let omp_block_stride = (bucket_size / accumulation_threads) & !15usize;
1471    for omp_thread_num in 0..accumulation_threads {
1472        let omp_block_start = omp_thread_num * omp_block_stride;
1473        let omp_block_size = if omp_thread_num + 1 < accumulation_threads {
1474            omp_block_stride
1475        } else {
1476            bucket_size - omp_block_start
1477        };
1478        accumulate_counts_s32(
1479            &mut workspace[omp_block_start..],
1480            omp_block_size as FastSint,
1481            bucket_stride,
1482            FastSint::try_from(thread_count).expect("thread count must fit FastSint"),
1483        );
1484    }
1485    let accumulated_start = (accumulation_threads - 1) * bucket_stride_usize;
1486    buckets[..bucket_size]
1487        .copy_from_slice(&workspace[accumulated_start..accumulated_start + bucket_size]);
1488}
1489
1490/// Internal helper: count and gather lms suffixes 32s 4k (OpenMP variant).
1491#[doc(hidden)]
1492pub fn count_and_gather_lms_suffixes_32s_4k_omp(
1493    t: &[SaSint],
1494    sa: &mut [SaSint],
1495    n: SaSint,
1496    k: SaSint,
1497    buckets: &mut [SaSint],
1498    local_buckets: SaSint,
1499    threads: SaSint,
1500    thread_state: &mut [ThreadState],
1501) -> SaSint {
1502    let free_space = if local_buckets != 0 {
1503        LIBSAIS_LOCAL_BUFFER_SIZE as FastSint
1504    } else {
1505        FastSint::try_from(buckets.len()).expect("bucket length must fit FastSint")
1506    };
1507    let threads_fast = threads as FastSint;
1508    let mut max_threads = (free_space / (((4 * k as FastSint) + 15) & -16)).min(threads_fast);
1509
1510    if max_threads > 1 && n >= 65_536 && n / k >= 2 {
1511        let thread_cap = (n / (16 * k)) as FastSint;
1512        if max_threads > thread_cap {
1513            max_threads = thread_cap;
1514        }
1515        return count_and_gather_lms_suffixes_32s_4k_fs_omp(
1516            t,
1517            sa,
1518            n,
1519            k,
1520            buckets,
1521            local_buckets,
1522            max_threads.max(2) as SaSint,
1523            thread_state,
1524        );
1525    }
1526
1527    if threads > 1 && n >= 65_536 {
1528        count_lms_suffixes_32s_4k(t, n, k, buckets);
1529        gather_lms_suffixes_32s(t, sa, n)
1530    } else {
1531        count_and_gather_lms_suffixes_32s_4k(t, sa, n, k, buckets, 0, n as FastSint)
1532    }
1533}
1534
1535/// Internal helper: count and gather lms suffixes 32s 2k (OpenMP variant).
1536#[doc(hidden)]
1537pub fn count_and_gather_lms_suffixes_32s_2k_omp(
1538    t: &[SaSint],
1539    sa: &mut [SaSint],
1540    n: SaSint,
1541    k: SaSint,
1542    buckets: &mut [SaSint],
1543    local_buckets: SaSint,
1544    threads: SaSint,
1545    thread_state: &mut [ThreadState],
1546) -> SaSint {
1547    let free_space = if local_buckets != 0 {
1548        LIBSAIS_LOCAL_BUFFER_SIZE as FastSint
1549    } else {
1550        FastSint::try_from(buckets.len()).expect("bucket length must fit FastSint")
1551    };
1552    let threads_fast = threads as FastSint;
1553    let mut max_threads = (free_space / (((2 * k as FastSint) + 15) & -16)).min(threads_fast);
1554
1555    if max_threads > 1 && n >= 65_536 && n / k >= 2 {
1556        let thread_cap = (n / (8 * k)) as FastSint;
1557        if max_threads > thread_cap {
1558            max_threads = thread_cap;
1559        }
1560        return count_and_gather_lms_suffixes_32s_2k_fs_omp(
1561            t,
1562            sa,
1563            n,
1564            k,
1565            buckets,
1566            local_buckets,
1567            max_threads.max(2) as SaSint,
1568            thread_state,
1569        );
1570    }
1571
1572    if threads > 1 && n >= 65_536 {
1573        count_lms_suffixes_32s_2k(t, n, k, buckets);
1574        gather_lms_suffixes_32s(t, sa, n)
1575    } else {
1576        count_and_gather_lms_suffixes_32s_2k(t, sa, n, k, buckets, 0, n as FastSint)
1577    }
1578}
1579
1580/// Internal helper: count and gather compacted lms suffixes 32s 2k (OpenMP variant).
1581#[doc(hidden)]
1582pub fn count_and_gather_compacted_lms_suffixes_32s_2k_omp(
1583    t: &[SaSint],
1584    sa: &mut [SaSint],
1585    n: SaSint,
1586    k: SaSint,
1587    buckets: &mut [SaSint],
1588    local_buckets: SaSint,
1589    threads: SaSint,
1590    thread_state: &mut [ThreadState],
1591) {
1592    let free_space = if local_buckets != 0 {
1593        LIBSAIS_LOCAL_BUFFER_SIZE as FastSint
1594    } else {
1595        FastSint::try_from(buckets.len()).expect("bucket length must fit FastSint")
1596    };
1597    let threads_fast = threads as FastSint;
1598    let mut max_threads = (free_space / (((2 * k as FastSint) + 15) & -16)).min(threads_fast);
1599
1600    if local_buckets == 0 && max_threads > 1 && n >= 65_536 && n / k >= 2 {
1601        let thread_cap = (n / (8 * k)) as FastSint;
1602        if max_threads > thread_cap {
1603            max_threads = thread_cap;
1604        }
1605        count_and_gather_compacted_lms_suffixes_32s_2k_fs_omp(
1606            t,
1607            sa,
1608            n,
1609            k,
1610            buckets,
1611            local_buckets,
1612            max_threads.max(2) as SaSint,
1613            thread_state,
1614        );
1615        return;
1616    }
1617
1618    let _ = count_and_gather_compacted_lms_suffixes_32s_2k_nofs_omp(t, sa, n, k, buckets, threads);
1619}
1620
1621/// Internal helper: count suffixes 32s.
1622#[doc(hidden)]
1623pub fn count_suffixes_32s(t: &[SaSint], n: SaSint, k: SaSint, buckets: &mut [SaSint]) {
1624    let n_usize = usize::try_from(n).expect("n must be non-negative");
1625    let k_usize = usize::try_from(k).expect("k must be non-negative");
1626    buckets[..k_usize].fill(0);
1627
1628    let mut i = 0usize;
1629    let mut j = n_usize.saturating_sub(7);
1630    while i < j {
1631        buckets[t[i] as usize] += 1;
1632        buckets[t[i + 1] as usize] += 1;
1633        buckets[t[i + 2] as usize] += 1;
1634        buckets[t[i + 3] as usize] += 1;
1635        buckets[t[i + 4] as usize] += 1;
1636        buckets[t[i + 5] as usize] += 1;
1637        buckets[t[i + 6] as usize] += 1;
1638        buckets[t[i + 7] as usize] += 1;
1639        i += 8;
1640    }
1641
1642    j += 7;
1643    while i < j {
1644        buckets[t[i] as usize] += 1;
1645        i += 1;
1646    }
1647}
1648
1649/// Internal helper: initialize buckets start and end 8u.
1650#[doc(hidden)]
1651pub fn initialize_buckets_start_and_end_8u(
1652    buckets: &mut [SaSint],
1653    freq: Option<&mut [SaSint]>,
1654) -> SaSint {
1655    let start_offset = 6 * ALPHABET_SIZE;
1656    let end_offset = 7 * ALPHABET_SIZE;
1657    let mut k = -1isize;
1658    let mut sum = 0;
1659
1660    match freq {
1661        Some(freq) => {
1662            for j in 0..ALPHABET_SIZE {
1663                let i = buckets_index4(j, 0);
1664                let total = buckets[i] + buckets[i + 1] + buckets[i + 2] + buckets[i + 3];
1665                buckets[start_offset + j] = sum;
1666                sum += total;
1667                buckets[end_offset + j] = sum;
1668                if total > 0 {
1669                    k = j as isize;
1670                }
1671                freq[j] = total;
1672            }
1673        }
1674        None => {
1675            for j in 0..ALPHABET_SIZE {
1676                let i = buckets_index4(j, 0);
1677                let total = buckets[i] + buckets[i + 1] + buckets[i + 2] + buckets[i + 3];
1678                buckets[start_offset + j] = sum;
1679                sum += total;
1680                buckets[end_offset + j] = sum;
1681                if total > 0 {
1682                    k = j as isize;
1683                }
1684            }
1685        }
1686    }
1687
1688    (k + 1) as SaSint
1689}
1690
1691/// Internal helper: initialize buckets start and end 32s 6k.
1692#[doc(hidden)]
1693pub fn initialize_buckets_start_and_end_32s_6k(k: SaSint, buckets: &mut [SaSint]) {
1694    let k_usize = usize::try_from(k).expect("k must be non-negative");
1695    let start_offset = 4 * k_usize;
1696    let end_offset = 5 * k_usize;
1697    let mut sum = 0;
1698    for j in 0..k_usize {
1699        let i = buckets_index4(j, 0);
1700        buckets[start_offset + j] = sum;
1701        sum += buckets[i] + buckets[i + 1] + buckets[i + 2] + buckets[i + 3];
1702        buckets[end_offset + j] = sum;
1703    }
1704}
1705
1706/// Internal helper: initialize buckets start and end 32s 4k.
1707#[doc(hidden)]
1708pub fn initialize_buckets_start_and_end_32s_4k(k: SaSint, buckets: &mut [SaSint]) {
1709    let k_usize = usize::try_from(k).expect("k must be non-negative");
1710    let start_offset = 2 * k_usize;
1711    let end_offset = 3 * k_usize;
1712    let mut sum = 0;
1713    for j in 0..k_usize {
1714        let i = buckets_index2(j, 0);
1715        buckets[start_offset + j] = sum;
1716        sum += buckets[i] + buckets[i + 1];
1717        buckets[end_offset + j] = sum;
1718    }
1719}
1720
1721/// Internal helper: initialize buckets end 32s 2k.
1722#[doc(hidden)]
1723pub fn initialize_buckets_end_32s_2k(k: SaSint, buckets: &mut [SaSint]) {
1724    let k_usize = usize::try_from(k).expect("k must be non-negative");
1725    let mut sum0 = 0;
1726    for j in 0..k_usize {
1727        let i = buckets_index2(j, 0);
1728        sum0 += buckets[i] + buckets[i + 1];
1729        buckets[i] = sum0;
1730    }
1731}
1732
1733/// Internal helper: initialize buckets start and end 32s 2k.
1734#[doc(hidden)]
1735pub fn initialize_buckets_start_and_end_32s_2k(k: SaSint, buckets: &mut [SaSint]) {
1736    let k_usize = usize::try_from(k).expect("k must be non-negative");
1737    for j in 0..k_usize {
1738        let i = buckets_index2(j, 0);
1739        buckets[j] = buckets[i];
1740    }
1741    buckets[k_usize] = 0;
1742    for j in 1..k_usize {
1743        buckets[k_usize + j] = buckets[j - 1];
1744    }
1745}
1746
1747/// Internal helper: initialize buckets start 32s 1k.
1748#[doc(hidden)]
1749pub fn initialize_buckets_start_32s_1k(k: SaSint, buckets: &mut [SaSint]) {
1750    let k_usize = usize::try_from(k).expect("k must be non-negative");
1751    let mut sum = 0;
1752    for bucket in buckets.iter_mut().take(k_usize) {
1753        let tmp = *bucket;
1754        *bucket = sum;
1755        sum += tmp;
1756    }
1757}
1758
1759/// Internal helper: initialize buckets end 32s 1k.
1760#[doc(hidden)]
1761pub fn initialize_buckets_end_32s_1k(k: SaSint, buckets: &mut [SaSint]) {
1762    let k_usize = usize::try_from(k).expect("k must be non-negative");
1763    let mut sum = 0;
1764    for bucket in buckets.iter_mut().take(k_usize) {
1765        sum += *bucket;
1766        *bucket = sum;
1767    }
1768}
1769
1770/// Internal helper: initialize buckets for lms suffixes radix sort 8u.
1771#[doc(hidden)]
1772pub fn initialize_buckets_for_lms_suffixes_radix_sort_8u(
1773    t: &[u8],
1774    buckets: &mut [SaSint],
1775    mut first_lms_suffix: SaSint,
1776) -> SaSint {
1777    let mut f0 = 0usize;
1778    let mut f1: usize;
1779    let mut c0 = t[first_lms_suffix as usize] as FastSint;
1780    let mut c1: FastSint;
1781
1782    while {
1783        first_lms_suffix -= 1;
1784        first_lms_suffix >= 0
1785    } {
1786        c1 = c0;
1787        c0 = t[first_lms_suffix as usize] as FastSint;
1788        f1 = f0;
1789        f0 = usize::from(c0 > (c1 - f1 as FastSint));
1790        let idx = 4 * c1 as usize + (f1 + f1 + f0);
1791        buckets[idx] -= 1;
1792    }
1793    buckets[4 * c0 as usize + (f0 + f0)] -= 1;
1794
1795    let temp_offset = 4 * ALPHABET_SIZE;
1796    let mut sum = 0;
1797    for j in 0..ALPHABET_SIZE {
1798        let i = 4 * j;
1799        let tj = 2 * j;
1800        buckets[temp_offset + tj + 1] = sum;
1801        sum += buckets[i + 1] + buckets[i + 3];
1802        buckets[temp_offset + tj] = sum;
1803    }
1804    sum
1805}
1806
1807/// Internal helper: initialize buckets for lms suffixes radix sort 32s 2k.
1808#[doc(hidden)]
1809pub fn initialize_buckets_for_lms_suffixes_radix_sort_32s_2k(
1810    t: &[SaSint],
1811    k: SaSint,
1812    buckets: &mut [SaSint],
1813    first_lms_suffix: SaSint,
1814) {
1815    let _k_usize = usize::try_from(k).expect("k must be non-negative");
1816    buckets[buckets_index2(t[first_lms_suffix as usize] as usize, 0)] += 1;
1817    buckets[buckets_index2(t[first_lms_suffix as usize] as usize, 1)] -= 1;
1818
1819    let mut sum0 = 0;
1820    let mut sum1 = 0;
1821    for j in 0..usize::try_from(k).unwrap() {
1822        let i = buckets_index2(j, 0);
1823        sum0 += buckets[i] + buckets[i + 1];
1824        sum1 += buckets[i + 1];
1825        buckets[i] = sum0;
1826        buckets[i + 1] = sum1;
1827    }
1828}
1829
1830/// Internal helper: initialize buckets for lms suffixes radix sort 32s 6k.
1831#[doc(hidden)]
1832pub fn initialize_buckets_for_lms_suffixes_radix_sort_32s_6k(
1833    t: &[SaSint],
1834    k: SaSint,
1835    buckets: &mut [SaSint],
1836    mut first_lms_suffix: SaSint,
1837) -> SaSint {
1838    let mut f0 = 0usize;
1839    let mut f1: usize;
1840    let mut c0 = t[first_lms_suffix as usize] as FastSint;
1841    let mut c1: FastSint;
1842
1843    while {
1844        first_lms_suffix -= 1;
1845        first_lms_suffix >= 0
1846    } {
1847        c1 = c0;
1848        c0 = t[first_lms_suffix as usize] as FastSint;
1849        f1 = f0;
1850        f0 = usize::from(c0 > (c1 - f1 as FastSint));
1851        buckets[4 * c1 as usize + (f1 + f1 + f0)] -= 1;
1852    }
1853    buckets[4 * c0 as usize + (f0 + f0)] -= 1;
1854
1855    let temp_offset = 4 * usize::try_from(k).unwrap();
1856    let mut sum = 0;
1857    for j in 0..usize::try_from(k).unwrap() {
1858        let i = 4 * j;
1859        sum += buckets[i + 1] + buckets[i + 3];
1860        buckets[temp_offset + j] = sum;
1861    }
1862    sum
1863}
1864
1865/// Internal helper: initialize buckets for radix and partial sorting 32s 4k.
1866#[doc(hidden)]
1867pub fn initialize_buckets_for_radix_and_partial_sorting_32s_4k(
1868    t: &[SaSint],
1869    k: SaSint,
1870    buckets: &mut [SaSint],
1871    first_lms_suffix: SaSint,
1872) {
1873    let k_usize = usize::try_from(k).expect("k must be non-negative");
1874    let start_offset = 2 * k_usize;
1875    let end_offset = 3 * k_usize;
1876
1877    buckets[buckets_index2(t[first_lms_suffix as usize] as usize, 0)] += 1;
1878    buckets[buckets_index2(t[first_lms_suffix as usize] as usize, 1)] -= 1;
1879
1880    let mut sum0 = 0;
1881    let mut sum1 = 0;
1882    for j in 0..k_usize {
1883        let i = buckets_index2(j, 0);
1884        buckets[start_offset + j] = sum1;
1885        sum0 += buckets[i + 1];
1886        sum1 += buckets[i] + buckets[i + 1];
1887        buckets[i + 1] = sum0;
1888        buckets[end_offset + j] = sum1;
1889    }
1890}
1891
1892/// Internal helper: radix sort lms suffixes 8u.
1893#[doc(hidden)]
1894pub fn radix_sort_lms_suffixes_8u(
1895    t: &[u8],
1896    sa: &mut [SaSint],
1897    induction_bucket: &mut [SaSint],
1898    omp_block_start: FastSint,
1899    omp_block_size: FastSint,
1900) {
1901    let prefetch_distance = 64 as FastSint;
1902    let mut i = omp_block_start + omp_block_size - 1;
1903    let mut j = omp_block_start + prefetch_distance + 3;
1904
1905    while i >= j {
1906        let p0 = sa[i as usize];
1907        let idx0 = buckets_index2(t[p0 as usize] as usize, 0);
1908        induction_bucket[idx0] -= 1;
1909        sa[induction_bucket[idx0] as usize] = p0;
1910
1911        let p1 = sa[(i - 1) as usize];
1912        let idx1 = buckets_index2(t[p1 as usize] as usize, 0);
1913        induction_bucket[idx1] -= 1;
1914        sa[induction_bucket[idx1] as usize] = p1;
1915
1916        let p2 = sa[(i - 2) as usize];
1917        let idx2 = buckets_index2(t[p2 as usize] as usize, 0);
1918        induction_bucket[idx2] -= 1;
1919        sa[induction_bucket[idx2] as usize] = p2;
1920
1921        let p3 = sa[(i - 3) as usize];
1922        let idx3 = buckets_index2(t[p3 as usize] as usize, 0);
1923        induction_bucket[idx3] -= 1;
1924        sa[induction_bucket[idx3] as usize] = p3;
1925
1926        i -= 4;
1927    }
1928
1929    j -= prefetch_distance + 3;
1930    while i >= j {
1931        let p = sa[i as usize];
1932        let idx = buckets_index2(t[p as usize] as usize, 0);
1933        induction_bucket[idx] -= 1;
1934        sa[induction_bucket[idx] as usize] = p;
1935        i -= 1;
1936    }
1937}
1938
1939/// Internal helper: radix sort lms suffixes 8u (OpenMP variant).
1940#[doc(hidden)]
1941pub fn radix_sort_lms_suffixes_8u_omp(
1942    t: &[u8],
1943    sa: &mut [SaSint],
1944    n: SaSint,
1945    m: SaSint,
1946    flags: SaSint,
1947    buckets: &mut [SaSint],
1948    threads: SaSint,
1949    thread_state: &mut [ThreadState],
1950) {
1951    if (flags & LIBSAIS_FLAGS_GSA) != 0 {
1952        buckets[4 * ALPHABET_SIZE] -= 1;
1953    }
1954
1955    let omp_num_threads = if threads > 1 && n >= 65_536 && m >= 65_536 {
1956        usize::try_from(threads)
1957            .expect("threads must be non-negative")
1958            .min(thread_state.len())
1959            .max(1)
1960    } else {
1961        1
1962    };
1963
1964    if omp_num_threads == 1 {
1965        radix_sort_lms_suffixes_8u(
1966            t,
1967            sa,
1968            &mut buckets[4 * ALPHABET_SIZE..],
1969            n as FastSint - m as FastSint + 1,
1970            m as FastSint - 1,
1971        );
1972        return;
1973    }
1974
1975    let (_, src_bucket) = buckets.split_at_mut(4 * ALPHABET_SIZE);
1976
1977    for state in thread_state.iter_mut().take(omp_num_threads) {
1978        for (i, j) in (0..=buckets_index2(ALPHABET_SIZE - 1, 0))
1979            .step_by(buckets_index2(1, 0))
1980            .zip((buckets_index4(0, 1)..).step_by(buckets_index4(1, 0)))
1981        {
1982            state.buckets[i] = src_bucket[i] - state.buckets[j];
1983        }
1984    }
1985
1986    for thread_num in 0..omp_num_threads {
1987        let mut omp_block_start = 0;
1988        for state in thread_state
1989            .iter()
1990            .take(omp_num_threads)
1991            .skip(thread_num)
1992            .rev()
1993        {
1994            omp_block_start += state.m;
1995        }
1996
1997        let mut omp_block_size = thread_state[thread_num].m;
1998        if omp_block_start == m as FastSint && omp_block_size > 0 {
1999            omp_block_start -= 1;
2000            omp_block_size -= 1;
2001        }
2002
2003        radix_sort_lms_suffixes_8u(
2004            t,
2005            sa,
2006            &mut thread_state[thread_num].buckets,
2007            n as FastSint - omp_block_start,
2008            omp_block_size,
2009        );
2010    }
2011}
2012
2013/// Internal helper: radix sort lms suffixes 32s 6k.
2014#[doc(hidden)]
2015pub fn radix_sort_lms_suffixes_32s_6k(
2016    t: &[SaSint],
2017    sa: &mut [SaSint],
2018    induction_bucket: &mut [SaSint],
2019    omp_block_start: FastSint,
2020    omp_block_size: FastSint,
2021) {
2022    let prefetch_distance = 64 as FastSint;
2023    let mut i = omp_block_start + omp_block_size - 1;
2024    let mut j = omp_block_start + 2 * prefetch_distance + 3;
2025
2026    while i >= j {
2027        let p0 = sa[i as usize];
2028        let idx0 = t[p0 as usize] as usize;
2029        induction_bucket[idx0] -= 1;
2030        sa[induction_bucket[idx0] as usize] = p0;
2031
2032        let p1 = sa[(i - 1) as usize];
2033        let idx1 = t[p1 as usize] as usize;
2034        induction_bucket[idx1] -= 1;
2035        sa[induction_bucket[idx1] as usize] = p1;
2036
2037        let p2 = sa[(i - 2) as usize];
2038        let idx2 = t[p2 as usize] as usize;
2039        induction_bucket[idx2] -= 1;
2040        sa[induction_bucket[idx2] as usize] = p2;
2041
2042        let p3 = sa[(i - 3) as usize];
2043        let idx3 = t[p3 as usize] as usize;
2044        induction_bucket[idx3] -= 1;
2045        sa[induction_bucket[idx3] as usize] = p3;
2046
2047        i -= 4;
2048    }
2049
2050    j -= 2 * prefetch_distance + 3;
2051    while i >= j {
2052        let p = sa[i as usize];
2053        let idx = t[p as usize] as usize;
2054        induction_bucket[idx] -= 1;
2055        sa[induction_bucket[idx] as usize] = p;
2056        i -= 1;
2057    }
2058}
2059
2060/// Internal helper: radix sort lms suffixes 32s 2k.
2061#[doc(hidden)]
2062pub fn radix_sort_lms_suffixes_32s_2k(
2063    t: &[SaSint],
2064    sa: &mut [SaSint],
2065    induction_bucket: &mut [SaSint],
2066    omp_block_start: FastSint,
2067    omp_block_size: FastSint,
2068) {
2069    let prefetch_distance = 64 as FastSint;
2070    let mut i = omp_block_start + omp_block_size - 1;
2071    let mut j = omp_block_start + 2 * prefetch_distance + 3;
2072
2073    while i >= j {
2074        let p0 = sa[i as usize];
2075        let idx0 = buckets_index2(t[p0 as usize] as usize, 0);
2076        induction_bucket[idx0] -= 1;
2077        sa[induction_bucket[idx0] as usize] = p0;
2078
2079        let p1 = sa[(i - 1) as usize];
2080        let idx1 = buckets_index2(t[p1 as usize] as usize, 0);
2081        induction_bucket[idx1] -= 1;
2082        sa[induction_bucket[idx1] as usize] = p1;
2083
2084        let p2 = sa[(i - 2) as usize];
2085        let idx2 = buckets_index2(t[p2 as usize] as usize, 0);
2086        induction_bucket[idx2] -= 1;
2087        sa[induction_bucket[idx2] as usize] = p2;
2088
2089        let p3 = sa[(i - 3) as usize];
2090        let idx3 = buckets_index2(t[p3 as usize] as usize, 0);
2091        induction_bucket[idx3] -= 1;
2092        sa[induction_bucket[idx3] as usize] = p3;
2093
2094        i -= 4;
2095    }
2096
2097    j -= 2 * prefetch_distance + 3;
2098    while i >= j {
2099        let p = sa[i as usize];
2100        let idx = buckets_index2(t[p as usize] as usize, 0);
2101        induction_bucket[idx] -= 1;
2102        sa[induction_bucket[idx] as usize] = p;
2103        i -= 1;
2104    }
2105}
2106
2107/// Internal helper: radix sort lms suffixes 32s block gather.
2108#[doc(hidden)]
2109pub fn radix_sort_lms_suffixes_32s_block_gather(
2110    t: &[SaSint],
2111    sa: &[SaSint],
2112    cache: &mut [ThreadCache],
2113    omp_block_start: FastSint,
2114    omp_block_size: FastSint,
2115) {
2116    if omp_block_size <= 0 {
2117        return;
2118    }
2119
2120    let start = usize::try_from(omp_block_start).expect("omp_block_start must be non-negative");
2121    let size = usize::try_from(omp_block_size).expect("omp_block_size must be non-negative");
2122    let mut i = start;
2123    let mut j = if size > 67 { start + size - 67 } else { start };
2124
2125    while i < j {
2126        for current in [i, i + 1, i + 2, i + 3] {
2127            let ci = current - start;
2128            let index = sa[current];
2129            cache[ci].index = index;
2130            cache[ci].symbol = t[index as usize];
2131        }
2132        i += 4;
2133    }
2134
2135    j = if size > 67 { j + 67 } else { start + size };
2136    while i < j {
2137        let ci = i - start;
2138        let index = sa[i];
2139        cache[ci].index = index;
2140        cache[ci].symbol = t[index as usize];
2141        i += 1;
2142    }
2143}
2144
2145/// Internal helper: radix sort lms suffixes 32s 6k block sort.
2146#[doc(hidden)]
2147pub fn radix_sort_lms_suffixes_32s_6k_block_sort(
2148    induction_bucket: &mut [SaSint],
2149    cache: &mut [ThreadCache],
2150    omp_block_start: FastSint,
2151    omp_block_size: FastSint,
2152) {
2153    if omp_block_size <= 0 {
2154        return;
2155    }
2156
2157    let start = usize::try_from(omp_block_start).expect("omp_block_start must be non-negative");
2158    let size = usize::try_from(omp_block_size).expect("omp_block_size must be non-negative");
2159    let mut i = start + size - 1;
2160    let mut j = start + 64 + 3;
2161
2162    while i >= j {
2163        for current in [i, i - 1, i - 2, i - 3] {
2164            let ci = current - start;
2165            let v = cache[ci].symbol as usize;
2166            induction_bucket[v] -= 1;
2167            cache[ci].symbol = induction_bucket[v];
2168        }
2169        i -= 4;
2170    }
2171
2172    j -= 64 + 3;
2173    while i >= j {
2174        let ci = i - start;
2175        let v = cache[ci].symbol as usize;
2176        induction_bucket[v] -= 1;
2177        cache[ci].symbol = induction_bucket[v];
2178        if i == 0 {
2179            break;
2180        }
2181        i -= 1;
2182    }
2183}
2184
2185/// Internal helper: radix sort lms suffixes 32s 2k block sort.
2186#[doc(hidden)]
2187pub fn radix_sort_lms_suffixes_32s_2k_block_sort(
2188    induction_bucket: &mut [SaSint],
2189    cache: &mut [ThreadCache],
2190    omp_block_start: FastSint,
2191    omp_block_size: FastSint,
2192) {
2193    if omp_block_size <= 0 {
2194        return;
2195    }
2196
2197    let start = usize::try_from(omp_block_start).expect("omp_block_start must be non-negative");
2198    let size = usize::try_from(omp_block_size).expect("omp_block_size must be non-negative");
2199    let mut i = start + size - 1;
2200    let mut j = start + 64 + 3;
2201
2202    while i >= j {
2203        for current in [i, i - 1, i - 2, i - 3] {
2204            let ci = current - start;
2205            let v = buckets_index2(cache[ci].symbol as usize, 0);
2206            induction_bucket[v] -= 1;
2207            cache[ci].symbol = induction_bucket[v];
2208        }
2209        i -= 4;
2210    }
2211
2212    j -= 64 + 3;
2213    while i >= j {
2214        let ci = i - start;
2215        let v = buckets_index2(cache[ci].symbol as usize, 0);
2216        induction_bucket[v] -= 1;
2217        cache[ci].symbol = induction_bucket[v];
2218        if i == 0 {
2219            break;
2220        }
2221        i -= 1;
2222    }
2223}
2224
2225/// Internal helper: radix sort lms suffixes 32s 6k block (OpenMP variant).
2226#[doc(hidden)]
2227pub fn radix_sort_lms_suffixes_32s_6k_block_omp(
2228    t: &[SaSint],
2229    sa: &mut [SaSint],
2230    induction_bucket: &mut [SaSint],
2231    cache: &mut [ThreadCache],
2232    block_start: FastSint,
2233    block_size: FastSint,
2234    threads: SaSint,
2235) {
2236    if threads <= 1 || block_size < 16_384 {
2237        radix_sort_lms_suffixes_32s_6k(t, sa, induction_bucket, block_start, block_size);
2238        return;
2239    }
2240
2241    let block_size_usize = usize::try_from(block_size).expect("block_size must be non-negative");
2242    let threads_usize = usize::try_from(threads)
2243        .expect("threads must be positive")
2244        .min(block_size_usize.max(1));
2245    let omp_block_stride = (block_size_usize / threads_usize) & !15usize;
2246
2247    for omp_thread_num in 0..threads_usize {
2248        let omp_block_start = omp_thread_num * omp_block_stride;
2249        let omp_block_size = if omp_thread_num + 1 < threads_usize {
2250            omp_block_stride
2251        } else {
2252            block_size_usize - omp_block_start
2253        };
2254        if omp_block_size > 0 {
2255            radix_sort_lms_suffixes_32s_block_gather(
2256                t,
2257                sa,
2258                &mut cache[omp_block_start..],
2259                block_start + omp_block_start as FastSint,
2260                omp_block_size as FastSint,
2261            );
2262        }
2263    }
2264
2265    radix_sort_lms_suffixes_32s_6k_block_sort(induction_bucket, cache, block_start, block_size);
2266
2267    for omp_thread_num in 0..threads_usize {
2268        let omp_block_start = omp_thread_num * omp_block_stride;
2269        let omp_block_size = if omp_thread_num + 1 < threads_usize {
2270            omp_block_stride
2271        } else {
2272            block_size_usize - omp_block_start
2273        };
2274        if omp_block_size > 0 {
2275            place_cached_suffixes(sa, &cache[omp_block_start..], 0, omp_block_size as FastSint);
2276        }
2277    }
2278}
2279
2280/// Internal helper: radix sort lms suffixes 32s 2k block (OpenMP variant).
2281#[doc(hidden)]
2282pub fn radix_sort_lms_suffixes_32s_2k_block_omp(
2283    t: &[SaSint],
2284    sa: &mut [SaSint],
2285    induction_bucket: &mut [SaSint],
2286    cache: &mut [ThreadCache],
2287    block_start: FastSint,
2288    block_size: FastSint,
2289    threads: SaSint,
2290) {
2291    if threads <= 1 || block_size < 16_384 {
2292        radix_sort_lms_suffixes_32s_2k(t, sa, induction_bucket, block_start, block_size);
2293        return;
2294    }
2295
2296    let block_size_usize = usize::try_from(block_size).expect("block_size must be non-negative");
2297    let threads_usize = usize::try_from(threads)
2298        .expect("threads must be positive")
2299        .min(block_size_usize.max(1));
2300    let omp_block_stride = (block_size_usize / threads_usize) & !15usize;
2301
2302    for omp_thread_num in 0..threads_usize {
2303        let omp_block_start = omp_thread_num * omp_block_stride;
2304        let omp_block_size = if omp_thread_num + 1 < threads_usize {
2305            omp_block_stride
2306        } else {
2307            block_size_usize - omp_block_start
2308        };
2309        if omp_block_size > 0 {
2310            radix_sort_lms_suffixes_32s_block_gather(
2311                t,
2312                sa,
2313                &mut cache[omp_block_start..],
2314                block_start + omp_block_start as FastSint,
2315                omp_block_size as FastSint,
2316            );
2317        }
2318    }
2319
2320    radix_sort_lms_suffixes_32s_2k_block_sort(induction_bucket, cache, block_start, block_size);
2321
2322    for omp_thread_num in 0..threads_usize {
2323        let omp_block_start = omp_thread_num * omp_block_stride;
2324        let omp_block_size = if omp_thread_num + 1 < threads_usize {
2325            omp_block_stride
2326        } else {
2327            block_size_usize - omp_block_start
2328        };
2329        if omp_block_size > 0 {
2330            place_cached_suffixes(sa, &cache[omp_block_start..], 0, omp_block_size as FastSint);
2331        }
2332    }
2333}
2334
2335/// Internal helper: radix sort lms suffixes 32s 6k (OpenMP variant).
2336#[doc(hidden)]
2337pub fn radix_sort_lms_suffixes_32s_6k_omp(
2338    t: &[SaSint],
2339    sa: &mut [SaSint],
2340    n: SaSint,
2341    m: SaSint,
2342    induction_bucket: &mut [SaSint],
2343    threads: SaSint,
2344    _thread_state: &mut [ThreadState],
2345) {
2346    if threads <= 1 || m < 65_536 {
2347        radix_sort_lms_suffixes_32s_6k(
2348            t,
2349            sa,
2350            induction_bucket,
2351            n as FastSint - m as FastSint + 1,
2352            m as FastSint - 1,
2353        );
2354        return;
2355    }
2356
2357    let threads_usize = usize::try_from(threads).expect("threads must be positive");
2358    let mut cache = vec![ThreadCache::default(); threads_usize * LIBSAIS_PER_THREAD_CACHE_SIZE];
2359    let mut block_start = 0usize;
2360    let m_usize = usize::try_from(m).expect("m must be non-negative");
2361    let n_usize = usize::try_from(n).expect("n must be non-negative");
2362    let last = m_usize - 1;
2363
2364    while block_start < last {
2365        let block_end = (block_start + threads_usize * LIBSAIS_PER_THREAD_CACHE_SIZE).min(last);
2366        radix_sort_lms_suffixes_32s_6k_block_omp(
2367            t,
2368            sa,
2369            induction_bucket,
2370            &mut cache,
2371            (n_usize - block_end) as FastSint,
2372            (block_end - block_start) as FastSint,
2373            threads,
2374        );
2375        block_start = block_end;
2376    }
2377}
2378
2379/// Internal helper: radix sort lms suffixes 32s 2k (OpenMP variant).
2380#[doc(hidden)]
2381pub fn radix_sort_lms_suffixes_32s_2k_omp(
2382    t: &[SaSint],
2383    sa: &mut [SaSint],
2384    n: SaSint,
2385    m: SaSint,
2386    induction_bucket: &mut [SaSint],
2387    threads: SaSint,
2388    _thread_state: &mut [ThreadState],
2389) {
2390    if threads <= 1 || m < 65_536 {
2391        radix_sort_lms_suffixes_32s_2k(
2392            t,
2393            sa,
2394            induction_bucket,
2395            n as FastSint - m as FastSint + 1,
2396            m as FastSint - 1,
2397        );
2398        return;
2399    }
2400
2401    let threads_usize = usize::try_from(threads).expect("threads must be positive");
2402    let mut cache = vec![ThreadCache::default(); threads_usize * LIBSAIS_PER_THREAD_CACHE_SIZE];
2403    let mut block_start = 0usize;
2404    let m_usize = usize::try_from(m).expect("m must be non-negative");
2405    let n_usize = usize::try_from(n).expect("n must be non-negative");
2406    let last = m_usize - 1;
2407
2408    while block_start < last {
2409        let block_end = (block_start + threads_usize * LIBSAIS_PER_THREAD_CACHE_SIZE).min(last);
2410        radix_sort_lms_suffixes_32s_2k_block_omp(
2411            t,
2412            sa,
2413            induction_bucket,
2414            &mut cache,
2415            (n_usize - block_end) as FastSint,
2416            (block_end - block_start) as FastSint,
2417            threads,
2418        );
2419        block_start = block_end;
2420    }
2421}
2422
2423/// Internal helper: radix sort lms suffixes 32s 1k.
2424#[doc(hidden)]
2425pub fn radix_sort_lms_suffixes_32s_1k(
2426    t: &[SaSint],
2427    sa: &mut [SaSint],
2428    n: SaSint,
2429    buckets: &mut [SaSint],
2430) -> SaSint {
2431    let n_usize = usize::try_from(n).expect("n must be non-negative");
2432    let mut i = n as FastSint - 2;
2433    let mut m = 0;
2434    let mut f0 = 1usize;
2435    let mut f1: usize;
2436    let mut c0 = t[n_usize - 1] as FastSint;
2437    let mut c1: FastSint;
2438    let mut c2 = 0 as FastSint;
2439
2440    while i >= 67 {
2441        c1 = t[i as usize] as FastSint;
2442        f1 = usize::from(c1 > (c0 - f0 as FastSint));
2443        if (f1 & !f0) != 0 {
2444            c2 = c0;
2445            buckets[c2 as usize] -= 1;
2446            sa[buckets[c2 as usize] as usize] = (i + 1) as SaSint;
2447            m += 1;
2448        }
2449
2450        c0 = t[(i - 1) as usize] as FastSint;
2451        f0 = usize::from(c0 > (c1 - f1 as FastSint));
2452        if (f0 & !f1) != 0 {
2453            c2 = c1;
2454            buckets[c2 as usize] -= 1;
2455            sa[buckets[c2 as usize] as usize] = i as SaSint;
2456            m += 1;
2457        }
2458
2459        c1 = t[(i - 2) as usize] as FastSint;
2460        f1 = usize::from(c1 > (c0 - f0 as FastSint));
2461        if (f1 & !f0) != 0 {
2462            c2 = c0;
2463            buckets[c2 as usize] -= 1;
2464            sa[buckets[c2 as usize] as usize] = (i - 1) as SaSint;
2465            m += 1;
2466        }
2467
2468        c0 = t[(i - 3) as usize] as FastSint;
2469        f0 = usize::from(c0 > (c1 - f1 as FastSint));
2470        if (f0 & !f1) != 0 {
2471            c2 = c1;
2472            buckets[c2 as usize] -= 1;
2473            sa[buckets[c2 as usize] as usize] = (i - 2) as SaSint;
2474            m += 1;
2475        }
2476
2477        i -= 4;
2478    }
2479
2480    while i >= 0 {
2481        c1 = c0;
2482        c0 = t[i as usize] as FastSint;
2483        f1 = f0;
2484        f0 = usize::from(c0 > (c1 - f1 as FastSint));
2485        if (f0 & !f1) != 0 {
2486            c2 = c1;
2487            buckets[c2 as usize] -= 1;
2488            sa[buckets[c2 as usize] as usize] = (i + 1) as SaSint;
2489            m += 1;
2490        }
2491        i -= 1;
2492    }
2493
2494    if m > 1 {
2495        sa[buckets[c2 as usize] as usize] = 0;
2496    }
2497
2498    m
2499}
2500
2501/// Internal helper: radix sort set markers 32s 6k.
2502#[doc(hidden)]
2503pub fn radix_sort_set_markers_32s_6k(
2504    sa: &mut [SaSint],
2505    induction_bucket: &[SaSint],
2506    omp_block_start: FastSint,
2507    omp_block_size: FastSint,
2508) {
2509    let mut i = omp_block_start;
2510    let mut j = omp_block_start + omp_block_size - 67;
2511
2512    while i < j {
2513        sa[induction_bucket[i as usize] as usize] |= SAINT_MIN;
2514        sa[induction_bucket[(i + 1) as usize] as usize] |= SAINT_MIN;
2515        sa[induction_bucket[(i + 2) as usize] as usize] |= SAINT_MIN;
2516        sa[induction_bucket[(i + 3) as usize] as usize] |= SAINT_MIN;
2517        i += 4;
2518    }
2519
2520    j += 67;
2521    while i < j {
2522        sa[induction_bucket[i as usize] as usize] |= SAINT_MIN;
2523        i += 1;
2524    }
2525}
2526
2527/// Internal helper: radix sort set markers 32s 4k.
2528#[doc(hidden)]
2529pub fn radix_sort_set_markers_32s_4k(
2530    sa: &mut [SaSint],
2531    induction_bucket: &[SaSint],
2532    omp_block_start: FastSint,
2533    omp_block_size: FastSint,
2534) {
2535    let mut i = omp_block_start;
2536    let mut j = omp_block_start + omp_block_size - 67;
2537
2538    while i < j {
2539        sa[induction_bucket[buckets_index2(i as usize, 0)] as usize] |= SUFFIX_GROUP_MARKER;
2540        sa[induction_bucket[buckets_index2((i + 1) as usize, 0)] as usize] |= SUFFIX_GROUP_MARKER;
2541        sa[induction_bucket[buckets_index2((i + 2) as usize, 0)] as usize] |= SUFFIX_GROUP_MARKER;
2542        sa[induction_bucket[buckets_index2((i + 3) as usize, 0)] as usize] |= SUFFIX_GROUP_MARKER;
2543        i += 4;
2544    }
2545
2546    j += 67;
2547    while i < j {
2548        sa[induction_bucket[buckets_index2(i as usize, 0)] as usize] |= SUFFIX_GROUP_MARKER;
2549        i += 1;
2550    }
2551}
2552
2553/// Internal helper: radix sort set markers 32s 6k (OpenMP variant).
2554#[doc(hidden)]
2555pub fn radix_sort_set_markers_32s_6k_omp(
2556    sa: &mut [SaSint],
2557    k: SaSint,
2558    induction_bucket: &[SaSint],
2559    threads: SaSint,
2560) {
2561    if k <= 1 {
2562        return;
2563    }
2564
2565    if threads <= 1 || k < 65_536 {
2566        radix_sort_set_markers_32s_6k(sa, induction_bucket, 0, k as FastSint - 1);
2567        return;
2568    }
2569
2570    let threads_usize = usize::try_from(threads).expect("threads must be positive");
2571    let last = usize::try_from(k - 1).expect("k must be positive");
2572    let stride = (last / threads_usize) & !15usize;
2573    let mut start = 0usize;
2574
2575    for thread in 0..threads_usize {
2576        let end = if thread + 1 == threads_usize {
2577            last
2578        } else {
2579            start + stride
2580        };
2581        if end > start {
2582            radix_sort_set_markers_32s_6k(
2583                sa,
2584                induction_bucket,
2585                start as FastSint,
2586                (end - start) as FastSint,
2587            );
2588        }
2589        start = end;
2590    }
2591}
2592
2593/// Internal helper: radix sort set markers 32s 4k (OpenMP variant).
2594#[doc(hidden)]
2595pub fn radix_sort_set_markers_32s_4k_omp(
2596    sa: &mut [SaSint],
2597    k: SaSint,
2598    induction_bucket: &[SaSint],
2599    threads: SaSint,
2600) {
2601    if k <= 1 {
2602        return;
2603    }
2604
2605    if threads <= 1 || k < 65_536 {
2606        radix_sort_set_markers_32s_4k(sa, induction_bucket, 0, k as FastSint - 1);
2607        return;
2608    }
2609
2610    let threads_usize = usize::try_from(threads).expect("threads must be positive");
2611    let last = usize::try_from(k - 1).expect("k must be positive");
2612    let stride = (last / threads_usize) & !15usize;
2613    let mut start = 0usize;
2614
2615    for thread in 0..threads_usize {
2616        let end = if thread + 1 == threads_usize {
2617            last
2618        } else {
2619            start + stride
2620        };
2621        if end > start {
2622            radix_sort_set_markers_32s_4k(
2623                sa,
2624                induction_bucket,
2625                start as FastSint,
2626                (end - start) as FastSint,
2627            );
2628        }
2629        start = end;
2630    }
2631}
2632
2633/// Internal helper: initialize buckets for partial sorting 8u.
2634#[doc(hidden)]
2635pub fn initialize_buckets_for_partial_sorting_8u(
2636    t: &[u8],
2637    buckets: &mut [SaSint],
2638    first_lms_suffix: SaSint,
2639    left_suffixes_count: SaSint,
2640) {
2641    let temp_offset = 4 * ALPHABET_SIZE;
2642    buckets[buckets_index4(t[first_lms_suffix as usize] as usize, 1)] += 1;
2643
2644    let mut sum0 = left_suffixes_count + 1;
2645    let mut sum1 = 0;
2646    for j in 0..ALPHABET_SIZE {
2647        let i = buckets_index4(j, 0);
2648        let tj = buckets_index2(j, 0);
2649        buckets[temp_offset + tj] = sum0;
2650        sum0 += buckets[i] + buckets[i + 2];
2651        sum1 += buckets[i + 1];
2652        buckets[tj] = sum0;
2653        buckets[tj + 1] = sum1;
2654    }
2655}
2656
2657/// Internal helper: initialize buckets for partial sorting 32s 6k.
2658#[doc(hidden)]
2659pub fn initialize_buckets_for_partial_sorting_32s_6k(
2660    t: &[SaSint],
2661    k: SaSint,
2662    buckets: &mut [SaSint],
2663    first_lms_suffix: SaSint,
2664    left_suffixes_count: SaSint,
2665) {
2666    let k_usize = usize::try_from(k).expect("k must be non-negative");
2667    let temp_offset = 4 * k_usize;
2668    let first_symbol = t[first_lms_suffix as usize] as usize;
2669    let mut sum0 = left_suffixes_count + 1;
2670    let mut sum1 = 0;
2671    let mut sum2 = 0;
2672
2673    for j in 0..first_symbol {
2674        let i = buckets_index4(j, 0);
2675        let tj = buckets_index2(j, 0);
2676        let ss = buckets[i];
2677        let ls = buckets[i + 1];
2678        let sl = buckets[i + 2];
2679        let ll = buckets[i + 3];
2680
2681        buckets[i] = sum0;
2682        buckets[i + 1] = sum2;
2683        buckets[i + 2] = 0;
2684        buckets[i + 3] = 0;
2685
2686        sum0 += ss + sl;
2687        sum1 += ls;
2688        sum2 += ls + ll;
2689
2690        buckets[temp_offset + tj] = sum0;
2691        buckets[temp_offset + tj + 1] = sum1;
2692    }
2693
2694    sum1 += 1;
2695    for j in first_symbol..k_usize {
2696        let i = buckets_index4(j, 0);
2697        let tj = buckets_index2(j, 0);
2698        let ss = buckets[i];
2699        let ls = buckets[i + 1];
2700        let sl = buckets[i + 2];
2701        let ll = buckets[i + 3];
2702
2703        buckets[i] = sum0;
2704        buckets[i + 1] = sum2;
2705        buckets[i + 2] = 0;
2706        buckets[i + 3] = 0;
2707
2708        sum0 += ss + sl;
2709        sum1 += ls;
2710        sum2 += ls + ll;
2711
2712        buckets[temp_offset + tj] = sum0;
2713        buckets[temp_offset + tj + 1] = sum1;
2714    }
2715}
2716
2717/// Internal helper: partial sorting scan left to right 8u.
2718#[doc(hidden)]
2719pub fn partial_sorting_scan_left_to_right_8u(
2720    t: &[u8],
2721    sa: &mut [SaSint],
2722    buckets: &mut [SaSint],
2723    mut d: SaSint,
2724    omp_block_start: FastSint,
2725    omp_block_size: FastSint,
2726) -> SaSint {
2727    let induction_offset = 4 * ALPHABET_SIZE;
2728    let distinct_offset = 2 * ALPHABET_SIZE;
2729    let prefetch_distance = 64 as FastSint;
2730    let mut i = omp_block_start;
2731    let mut j = if omp_block_size > prefetch_distance + 1 {
2732        omp_block_start + omp_block_size - prefetch_distance - 1
2733    } else {
2734        omp_block_start
2735    };
2736
2737    while i < j {
2738        let mut p0 = sa[i as usize];
2739        d += SaSint::from(p0 < 0);
2740        p0 &= SAINT_MAX;
2741        let v0 = buckets_index2(
2742            t[(p0 - 1) as usize] as usize,
2743            usize::from(t[(p0 - 2) as usize] >= t[(p0 - 1) as usize]),
2744        );
2745        let pos0 = buckets[induction_offset + v0] as usize;
2746        sa[pos0] = (p0 - 1) | (((buckets[distinct_offset + v0] != d) as SaSint) << (SAINT_BIT - 1));
2747        buckets[induction_offset + v0] += 1;
2748        buckets[distinct_offset + v0] = d;
2749
2750        let mut p1 = sa[(i + 1) as usize];
2751        d += SaSint::from(p1 < 0);
2752        p1 &= SAINT_MAX;
2753        let v1 = buckets_index2(
2754            t[(p1 - 1) as usize] as usize,
2755            usize::from(t[(p1 - 2) as usize] >= t[(p1 - 1) as usize]),
2756        );
2757        let pos1 = buckets[induction_offset + v1] as usize;
2758        sa[pos1] = (p1 - 1) | (((buckets[distinct_offset + v1] != d) as SaSint) << (SAINT_BIT - 1));
2759        buckets[induction_offset + v1] += 1;
2760        buckets[distinct_offset + v1] = d;
2761
2762        i += 2;
2763    }
2764
2765    j = omp_block_start + omp_block_size;
2766    while i < j {
2767        let mut p = sa[i as usize];
2768        d += SaSint::from(p < 0);
2769        p &= SAINT_MAX;
2770        let v = buckets_index2(
2771            t[(p - 1) as usize] as usize,
2772            usize::from(t[(p - 2) as usize] >= t[(p - 1) as usize]),
2773        );
2774        let pos = buckets[induction_offset + v] as usize;
2775        sa[pos] = (p - 1) | (((buckets[distinct_offset + v] != d) as SaSint) << (SAINT_BIT - 1));
2776        buckets[induction_offset + v] += 1;
2777        buckets[distinct_offset + v] = d;
2778        i += 1;
2779    }
2780
2781    d
2782}
2783
2784/// Internal helper: partial sorting scan left to right 8u (OpenMP variant).
2785#[doc(hidden)]
2786pub fn partial_sorting_scan_left_to_right_8u_omp(
2787    t: &[u8],
2788    sa: &mut [SaSint],
2789    n: SaSint,
2790    k: SaSint,
2791    buckets: &mut [SaSint],
2792    left_suffixes_count: SaSint,
2793    mut d: SaSint,
2794    threads: SaSint,
2795    thread_state: &mut [ThreadState],
2796) -> SaSint {
2797    let v = buckets_index2(
2798        t[(n - 1) as usize] as usize,
2799        usize::from(t[(n - 2) as usize] >= t[(n - 1) as usize]),
2800    );
2801    let induction_offset = 4 * ALPHABET_SIZE;
2802    let distinct_offset = 2 * ALPHABET_SIZE;
2803    let pos = buckets[induction_offset + v] as usize;
2804    sa[pos] = (n - 1) | SAINT_MIN;
2805    buckets[induction_offset + v] += 1;
2806    d += 1;
2807    buckets[distinct_offset + v] = d;
2808
2809    if threads == 1 || left_suffixes_count < 65_536 {
2810        return partial_sorting_scan_left_to_right_8u(
2811            t,
2812            sa,
2813            buckets,
2814            d,
2815            0,
2816            left_suffixes_count as FastSint,
2817        );
2818    }
2819
2820    let mut block_start = 0usize;
2821    let left_suffixes_count =
2822        usize::try_from(left_suffixes_count).expect("left_suffixes_count must be non-negative");
2823    let threads_usize = usize::try_from(threads)
2824        .expect("threads must be non-negative")
2825        .min(thread_state.len())
2826        .max(1);
2827    while block_start < left_suffixes_count {
2828        if sa[block_start] == 0 {
2829            block_start += 1;
2830        } else {
2831            let mut block_max_end =
2832                block_start + threads_usize * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * threads_usize);
2833            if block_max_end > left_suffixes_count {
2834                block_max_end = left_suffixes_count;
2835            }
2836            let mut block_end = block_start + 1;
2837            while block_end < block_max_end && sa[block_end] != 0 {
2838                block_end += 1;
2839            }
2840            let block_size = block_end - block_start;
2841
2842            if block_size < 32 {
2843                while block_start < block_end {
2844                    let p = sa[block_start];
2845                    d += SaSint::from(p < 0);
2846                    let p = p & SAINT_MAX;
2847                    let v = buckets_index2(
2848                        t[(p - 1) as usize] as usize,
2849                        usize::from(t[(p - 2) as usize] >= t[(p - 1) as usize]),
2850                    );
2851                    let pos = buckets[induction_offset + v] as usize;
2852                    sa[pos] = (p - 1)
2853                        | (((buckets[distinct_offset + v] != d) as SaSint) << (SAINT_BIT - 1));
2854                    buckets[induction_offset + v] += 1;
2855                    buckets[distinct_offset + v] = d;
2856                    block_start += 1;
2857                }
2858            } else {
2859                d = partial_sorting_scan_left_to_right_8u_block_omp(
2860                    t,
2861                    sa,
2862                    k,
2863                    buckets,
2864                    d,
2865                    block_start as FastSint,
2866                    block_size as FastSint,
2867                    threads,
2868                    thread_state,
2869                );
2870                block_start = block_end;
2871            }
2872        }
2873    }
2874
2875    d
2876}
2877
2878/// Internal helper: partial sorting scan left to right 32s 6k.
2879#[doc(hidden)]
2880pub fn partial_sorting_scan_left_to_right_32s_6k(
2881    t: &[SaSint],
2882    sa: &mut [SaSint],
2883    buckets: &mut [SaSint],
2884    mut d: SaSint,
2885    omp_block_start: FastSint,
2886    omp_block_size: FastSint,
2887) -> SaSint {
2888    let prefetch_distance: FastSint = 64;
2889
2890    let mut i = omp_block_start;
2891    let mut j = omp_block_start + omp_block_size - 2 * prefetch_distance - 1;
2892    while i < j {
2893        let mut p0 = sa[i as usize];
2894        d += SaSint::from(p0 < 0);
2895        p0 &= SAINT_MAX;
2896        let p0u = p0 as usize;
2897        let v0 = buckets_index4(t[p0u - 1] as usize, usize::from(t[p0u - 2] >= t[p0u - 1]));
2898        let pos0 = buckets[v0] as usize;
2899        sa[pos0] = (p0 - 1) | (((buckets[2 + v0] != d) as SaSint) << (SAINT_BIT - 1));
2900        buckets[v0] += 1;
2901        buckets[2 + v0] = d;
2902
2903        let mut p1 = sa[(i + 1) as usize];
2904        d += SaSint::from(p1 < 0);
2905        p1 &= SAINT_MAX;
2906        let p1u = p1 as usize;
2907        let v1 = buckets_index4(t[p1u - 1] as usize, usize::from(t[p1u - 2] >= t[p1u - 1]));
2908        let pos1 = buckets[v1] as usize;
2909        sa[pos1] = (p1 - 1) | (((buckets[2 + v1] != d) as SaSint) << (SAINT_BIT - 1));
2910        buckets[v1] += 1;
2911        buckets[2 + v1] = d;
2912
2913        i += 2;
2914    }
2915
2916    j += 2 * prefetch_distance + 1;
2917    while i < j {
2918        let mut p = sa[i as usize];
2919        d += SaSint::from(p < 0);
2920        p &= SAINT_MAX;
2921        let pu = p as usize;
2922        let v = buckets_index4(t[pu - 1] as usize, usize::from(t[pu - 2] >= t[pu - 1]));
2923        let pos = buckets[v] as usize;
2924        sa[pos] = (p - 1) | (((buckets[2 + v] != d) as SaSint) << (SAINT_BIT - 1));
2925        buckets[v] += 1;
2926        buckets[2 + v] = d;
2927        i += 1;
2928    }
2929
2930    d
2931}
2932
2933/// Internal helper: partial sorting scan left to right 32s 4k.
2934#[doc(hidden)]
2935pub fn partial_sorting_scan_left_to_right_32s_4k(
2936    t: &[SaSint],
2937    sa: &mut [SaSint],
2938    k: SaSint,
2939    buckets: &mut [SaSint],
2940    mut d: SaSint,
2941    omp_block_start: FastSint,
2942    omp_block_size: FastSint,
2943) -> SaSint {
2944    let k_usize = usize::try_from(k).expect("k must be non-negative");
2945    let prefetch_distance: FastSint = 64;
2946    let induction_offset = 2 * k_usize;
2947    let mut i = omp_block_start;
2948    let mut j = omp_block_start + omp_block_size - 2 * prefetch_distance - 1;
2949
2950    while i < j {
2951        let i0 = i as usize;
2952        let mut p0 = sa[i0];
2953        sa[i0] = p0 & SAINT_MAX;
2954        if p0 > 0 {
2955            sa[i0] = 0;
2956            d += p0 >> (SUFFIX_GROUP_BIT - 1);
2957            p0 &= !SUFFIX_GROUP_MARKER;
2958            let p0u = p0 as usize;
2959            let c0 = t[p0u - 1];
2960            let f0 = usize::from(t[p0u - 2] < c0);
2961            let v0 = buckets_index2(c0 as usize, f0);
2962            let c0u = c0 as usize;
2963            let pos0 = buckets[induction_offset + c0u] as usize;
2964            sa[pos0] = (p0 - 1)
2965                | ((f0 as SaSint) << (SAINT_BIT - 1))
2966                | (((buckets[v0] != d) as SaSint) << (SUFFIX_GROUP_BIT - 1));
2967            buckets[induction_offset + c0u] += 1;
2968            buckets[v0] = d;
2969        }
2970
2971        let i1 = (i + 1) as usize;
2972        let mut p1 = sa[i1];
2973        sa[i1] = p1 & SAINT_MAX;
2974        if p1 > 0 {
2975            sa[i1] = 0;
2976            d += p1 >> (SUFFIX_GROUP_BIT - 1);
2977            p1 &= !SUFFIX_GROUP_MARKER;
2978            let p1u = p1 as usize;
2979            let c1 = t[p1u - 1];
2980            let f1 = usize::from(t[p1u - 2] < c1);
2981            let v1 = buckets_index2(c1 as usize, f1);
2982            let c1u = c1 as usize;
2983            let pos1 = buckets[induction_offset + c1u] as usize;
2984            sa[pos1] = (p1 - 1)
2985                | ((f1 as SaSint) << (SAINT_BIT - 1))
2986                | (((buckets[v1] != d) as SaSint) << (SUFFIX_GROUP_BIT - 1));
2987            buckets[induction_offset + c1u] += 1;
2988            buckets[v1] = d;
2989        }
2990
2991        i += 2;
2992    }
2993
2994    j += 2 * prefetch_distance + 1;
2995    while i < j {
2996        let iu = i as usize;
2997        let mut p = sa[iu];
2998        sa[iu] = p & SAINT_MAX;
2999        if p > 0 {
3000            sa[iu] = 0;
3001            d += p >> (SUFFIX_GROUP_BIT - 1);
3002            p &= !SUFFIX_GROUP_MARKER;
3003            let pu = p as usize;
3004            let c = t[pu - 1];
3005            let f = usize::from(t[pu - 2] < c);
3006            let v = buckets_index2(c as usize, f);
3007            let cu = c as usize;
3008            let pos = buckets[induction_offset + cu] as usize;
3009            sa[pos] = (p - 1)
3010                | ((f as SaSint) << (SAINT_BIT - 1))
3011                | (((buckets[v] != d) as SaSint) << (SUFFIX_GROUP_BIT - 1));
3012            buckets[induction_offset + cu] += 1;
3013            buckets[v] = d;
3014        }
3015        i += 1;
3016    }
3017
3018    d
3019}
3020
3021/// Internal helper: partial sorting scan left to right 32s 1k.
3022#[doc(hidden)]
3023pub fn partial_sorting_scan_left_to_right_32s_1k(
3024    t: &[SaSint],
3025    sa: &mut [SaSint],
3026    induction_bucket: &mut [SaSint],
3027    omp_block_start: FastSint,
3028    omp_block_size: FastSint,
3029) {
3030    let prefetch_distance = 64 as FastSint;
3031    let mut i = omp_block_start;
3032    let mut j = omp_block_start + omp_block_size - 2 * prefetch_distance - 1;
3033
3034    while i < j {
3035        let p0 = sa[i as usize];
3036        sa[i as usize] = p0 & SAINT_MAX;
3037        if p0 > 0 {
3038            sa[i as usize] = 0;
3039            let c0 = t[(p0 - 1) as usize] as usize;
3040            let pos0 = induction_bucket[c0] as usize;
3041            induction_bucket[c0] += 1;
3042            sa[pos0] = (p0 - 1)
3043                | ((usize::from(t[(p0 - 2) as usize] < t[(p0 - 1) as usize]) as SaSint)
3044                    << (SAINT_BIT - 1));
3045        }
3046
3047        let p1 = sa[(i + 1) as usize];
3048        sa[(i + 1) as usize] = p1 & SAINT_MAX;
3049        if p1 > 0 {
3050            sa[(i + 1) as usize] = 0;
3051            let c1 = t[(p1 - 1) as usize] as usize;
3052            let pos1 = induction_bucket[c1] as usize;
3053            induction_bucket[c1] += 1;
3054            sa[pos1] = (p1 - 1)
3055                | ((usize::from(t[(p1 - 2) as usize] < t[(p1 - 1) as usize]) as SaSint)
3056                    << (SAINT_BIT - 1));
3057        }
3058
3059        i += 2;
3060    }
3061
3062    j += 2 * prefetch_distance + 1;
3063    while i < j {
3064        let p = sa[i as usize];
3065        sa[i as usize] = p & SAINT_MAX;
3066        if p > 0 {
3067            sa[i as usize] = 0;
3068            let c = t[(p - 1) as usize] as usize;
3069            let pos = induction_bucket[c] as usize;
3070            induction_bucket[c] += 1;
3071            sa[pos] = (p - 1)
3072                | ((usize::from(t[(p - 2) as usize] < t[(p - 1) as usize]) as SaSint)
3073                    << (SAINT_BIT - 1));
3074        }
3075        i += 1;
3076    }
3077}
3078
3079/// Internal helper: partial sorting scan left to right 32s 6k (OpenMP variant).
3080#[doc(hidden)]
3081pub fn partial_sorting_scan_left_to_right_32s_6k_omp(
3082    t: &[SaSint],
3083    sa: &mut [SaSint],
3084    n: SaSint,
3085    buckets: &mut [SaSint],
3086    left_suffixes_count: SaSint,
3087    mut d: SaSint,
3088    threads: SaSint,
3089    thread_state: &mut [ThreadState],
3090) -> SaSint {
3091    let v = buckets_index4(
3092        t[(n - 1) as usize] as usize,
3093        usize::from(t[(n - 2) as usize] >= t[(n - 1) as usize]),
3094    );
3095    let pos = buckets[v] as usize;
3096    sa[pos] = (n - 1) | SAINT_MIN;
3097    buckets[v] += 1;
3098    d += 1;
3099    buckets[2 + v] = d;
3100    if threads == 1 || left_suffixes_count < 65_536 {
3101        return partial_sorting_scan_left_to_right_32s_6k(
3102            t,
3103            sa,
3104            buckets,
3105            d,
3106            0,
3107            left_suffixes_count as FastSint,
3108        );
3109    }
3110    if thread_state.is_empty() {
3111        return partial_sorting_scan_left_to_right_32s_6k(
3112            t,
3113            sa,
3114            buckets,
3115            d,
3116            0,
3117            left_suffixes_count as FastSint,
3118        );
3119    }
3120
3121    let left_suffixes_count =
3122        usize::try_from(left_suffixes_count).expect("left_suffixes_count must be non-negative");
3123    let threads_usize = usize::try_from(threads)
3124        .expect("threads must be non-negative")
3125        .max(1);
3126    let mut block_start = 0usize;
3127    let block_span = threads_usize * LIBSAIS_PER_THREAD_CACHE_SIZE;
3128    let mut cache = vec![ThreadCache::default(); block_span];
3129    while block_start < left_suffixes_count {
3130        let mut block_end = block_start + block_span;
3131        if block_end > left_suffixes_count {
3132            block_end = left_suffixes_count;
3133        }
3134
3135        d = partial_sorting_scan_left_to_right_32s_6k_block_omp(
3136            t,
3137            sa,
3138            buckets,
3139            d,
3140            &mut cache,
3141            block_start as FastSint,
3142            (block_end - block_start) as FastSint,
3143            threads,
3144        );
3145
3146        block_start = block_end;
3147    }
3148
3149    d
3150}
3151
3152/// Internal helper: partial sorting scan left to right 32s 4k (OpenMP variant).
3153#[doc(hidden)]
3154pub fn partial_sorting_scan_left_to_right_32s_4k_omp(
3155    t: &[SaSint],
3156    sa: &mut [SaSint],
3157    n: SaSint,
3158    k: SaSint,
3159    buckets: &mut [SaSint],
3160    mut d: SaSint,
3161    threads: SaSint,
3162    thread_state: &mut [ThreadState],
3163) -> SaSint {
3164    let k_usize = usize::try_from(k).expect("k must be non-negative");
3165    let induction_offset = 2 * k_usize;
3166    let distinct_offset = 0usize;
3167    let symbol = t[(n - 1) as usize] as usize;
3168    let is_s = usize::from(t[(n - 2) as usize] < t[(n - 1) as usize]);
3169    let pos = buckets[induction_offset + symbol] as usize;
3170    sa[pos] = (n - 1) | ((is_s as SaSint) << (SAINT_BIT - 1)) | SUFFIX_GROUP_MARKER;
3171    buckets[induction_offset + symbol] += 1;
3172    d += 1;
3173    buckets[distinct_offset + buckets_index2(symbol, is_s)] = d;
3174
3175    if threads == 1 || n < 65_536 {
3176        d = partial_sorting_scan_left_to_right_32s_4k(t, sa, k, buckets, d, 0, n as FastSint);
3177    } else {
3178        if thread_state.is_empty() {
3179            return partial_sorting_scan_left_to_right_32s_4k(
3180                t,
3181                sa,
3182                k,
3183                buckets,
3184                d,
3185                0,
3186                n as FastSint,
3187            );
3188        }
3189        let mut block_start = 0usize;
3190        let n_usize = usize::try_from(n).expect("n must be non-negative");
3191        let threads_usize = usize::try_from(threads)
3192            .expect("threads must be non-negative")
3193            .max(1);
3194        let chunk_capacity = threads_usize * LIBSAIS_PER_THREAD_CACHE_SIZE;
3195        let mut cache = vec![ThreadCache::default(); chunk_capacity];
3196
3197        while block_start < n_usize {
3198            let mut block_end = block_start + chunk_capacity;
3199            if block_end > n_usize {
3200                block_end = n_usize;
3201            }
3202
3203            d = partial_sorting_scan_left_to_right_32s_4k_block_omp(
3204                t,
3205                sa,
3206                k,
3207                buckets,
3208                d,
3209                &mut cache,
3210                block_start as FastSint,
3211                (block_end - block_start) as FastSint,
3212                threads,
3213            );
3214
3215            block_start = block_end;
3216        }
3217    }
3218
3219    d
3220}
3221
3222/// Internal helper: partial sorting scan left to right 32s 1k (OpenMP variant).
3223#[doc(hidden)]
3224pub fn partial_sorting_scan_left_to_right_32s_1k_omp(
3225    t: &[SaSint],
3226    sa: &mut [SaSint],
3227    n: SaSint,
3228    buckets: &mut [SaSint],
3229    threads: SaSint,
3230    thread_state: &mut [ThreadState],
3231) {
3232    let symbol = t[(n - 1) as usize] as usize;
3233    let pos = buckets[symbol] as usize;
3234    sa[pos] = (n - 1)
3235        | ((usize::from(t[(n - 2) as usize] < t[(n - 1) as usize]) as SaSint) << (SAINT_BIT - 1));
3236    buckets[symbol] += 1;
3237    if threads == 1 || n < 65_536 {
3238        partial_sorting_scan_left_to_right_32s_1k(t, sa, buckets, 0, n as FastSint);
3239    } else {
3240        if thread_state.is_empty() {
3241            partial_sorting_scan_left_to_right_32s_1k(t, sa, buckets, 0, n as FastSint);
3242            return;
3243        }
3244        let n_usize = usize::try_from(n).expect("n must be non-negative");
3245        let threads_usize = usize::try_from(threads)
3246            .expect("threads must be non-negative")
3247            .max(1);
3248        let mut block_start = 0usize;
3249        let block_span = threads_usize * LIBSAIS_PER_THREAD_CACHE_SIZE;
3250        let mut cache = vec![ThreadCache::default(); block_span];
3251
3252        while block_start < n_usize {
3253            let mut block_end = block_start + block_span;
3254            if block_end > n_usize {
3255                block_end = n_usize;
3256            }
3257
3258            partial_sorting_scan_left_to_right_32s_1k_block_omp(
3259                t,
3260                sa,
3261                buckets,
3262                &mut cache,
3263                block_start as FastSint,
3264                (block_end - block_start) as FastSint,
3265                threads,
3266            );
3267
3268            block_start = block_end;
3269        }
3270    }
3271}
3272
3273/// Internal helper: partial sorting scan left to right 8u block prepare.
3274#[doc(hidden)]
3275pub fn partial_sorting_scan_left_to_right_8u_block_prepare(
3276    t: &[u8],
3277    sa: &[SaSint],
3278    k: SaSint,
3279    buckets: &mut [SaSint],
3280    cache: &mut [ThreadCache],
3281    omp_block_start: FastSint,
3282    omp_block_size: FastSint,
3283) -> (FastSint, FastSint) {
3284    let k_usize = usize::try_from(k).expect("k must be non-negative");
3285    buckets[..2 * k_usize].fill(0);
3286    buckets[2 * k_usize..4 * k_usize].fill(0);
3287
3288    let mut i = omp_block_start;
3289    let mut j = omp_block_start + omp_block_size - 65;
3290    let mut count = 0usize;
3291    let mut d: SaSint = 1;
3292
3293    while i < j {
3294        let mut p0 = sa[i as usize];
3295        cache[count].index = p0;
3296        d += SaSint::from(p0 < 0);
3297        p0 &= SAINT_MAX;
3298        let v0 = buckets_index2(
3299            t[(p0 - 1) as usize] as usize,
3300            usize::from(t[(p0 - 2) as usize] >= t[(p0 - 1) as usize]),
3301        );
3302        cache[count].symbol = v0 as SaSint;
3303        count += 1;
3304        buckets[v0] += 1;
3305        buckets[2 * k_usize + v0] = d;
3306
3307        let mut p1 = sa[(i + 1) as usize];
3308        cache[count].index = p1;
3309        d += SaSint::from(p1 < 0);
3310        p1 &= SAINT_MAX;
3311        let v1 = buckets_index2(
3312            t[(p1 - 1) as usize] as usize,
3313            usize::from(t[(p1 - 2) as usize] >= t[(p1 - 1) as usize]),
3314        );
3315        cache[count].symbol = v1 as SaSint;
3316        count += 1;
3317        buckets[v1] += 1;
3318        buckets[2 * k_usize + v1] = d;
3319
3320        i += 2;
3321    }
3322
3323    j += 65;
3324    while i < j {
3325        let mut p = sa[i as usize];
3326        cache[count].index = p;
3327        d += SaSint::from(p < 0);
3328        p &= SAINT_MAX;
3329        let v = buckets_index2(
3330            t[(p - 1) as usize] as usize,
3331            usize::from(t[(p - 2) as usize] >= t[(p - 1) as usize]),
3332        );
3333        cache[count].symbol = v as SaSint;
3334        count += 1;
3335        buckets[v] += 1;
3336        buckets[2 * k_usize + v] = d;
3337        i += 1;
3338    }
3339
3340    (d as FastSint - 1, count as FastSint)
3341}
3342
3343/// Internal helper: partial sorting scan left to right 8u block place.
3344#[doc(hidden)]
3345pub fn partial_sorting_scan_left_to_right_8u_block_place(
3346    sa: &mut [SaSint],
3347    buckets: &mut [SaSint],
3348    k: SaSint,
3349    cache: &[ThreadCache],
3350    count: FastSint,
3351    mut d: SaSint,
3352) {
3353    let split = 2 * usize::try_from(k).expect("k must be non-negative");
3354    let (induction_bucket, distinct_names) = buckets.split_at_mut(split);
3355
3356    let mut i = 0usize;
3357    let mut j = usize::try_from(count)
3358        .expect("count must be non-negative")
3359        .saturating_sub(1);
3360    while i < j {
3361        let p0 = cache[i].index;
3362        d += SaSint::from(p0 < 0);
3363        let v0 = cache[i].symbol as usize;
3364        let pos0 = induction_bucket[v0] as usize;
3365        sa[pos0] = (p0 - 1) | (((distinct_names[v0] != d) as SaSint) << (SAINT_BIT - 1));
3366        induction_bucket[v0] += 1;
3367        distinct_names[v0] = d;
3368
3369        let p1 = cache[i + 1].index;
3370        d += SaSint::from(p1 < 0);
3371        let v1 = cache[i + 1].symbol as usize;
3372        let pos1 = induction_bucket[v1] as usize;
3373        sa[pos1] = (p1 - 1) | (((distinct_names[v1] != d) as SaSint) << (SAINT_BIT - 1));
3374        induction_bucket[v1] += 1;
3375        distinct_names[v1] = d;
3376
3377        i += 2;
3378    }
3379
3380    j += 1;
3381    while i < j {
3382        let p = cache[i].index;
3383        d += SaSint::from(p < 0);
3384        let v = cache[i].symbol as usize;
3385        let pos = induction_bucket[v] as usize;
3386        sa[pos] = (p - 1) | (((distinct_names[v] != d) as SaSint) << (SAINT_BIT - 1));
3387        induction_bucket[v] += 1;
3388        distinct_names[v] = d;
3389        i += 1;
3390    }
3391}
3392
3393/// Internal helper: partial sorting scan left to right 8u block (OpenMP variant).
3394#[doc(hidden)]
3395pub fn partial_sorting_scan_left_to_right_8u_block_omp(
3396    t: &[u8],
3397    sa: &mut [SaSint],
3398    k: SaSint,
3399    buckets: &mut [SaSint],
3400    d: SaSint,
3401    block_start: FastSint,
3402    block_size: FastSint,
3403    threads: SaSint,
3404    thread_state: &mut [ThreadState],
3405) -> SaSint {
3406    let mut d = d;
3407    let block_size_usize = usize::try_from(block_size).expect("block_size must be non-negative");
3408    let k_usize = usize::try_from(k).expect("k must be non-negative");
3409    let omp_num_threads = if threads > 1 && block_size_usize >= 64 * k_usize.max(256) {
3410        usize::try_from(threads)
3411            .expect("threads must be non-negative")
3412            .min(thread_state.len())
3413            .max(1)
3414    } else {
3415        1
3416    };
3417    let omp_block_stride = (block_size_usize / omp_num_threads) & !15usize;
3418
3419    if omp_num_threads == 1 {
3420        return partial_sorting_scan_left_to_right_8u(t, sa, buckets, d, block_start, block_size);
3421    }
3422
3423    for omp_thread_num in 0..omp_num_threads {
3424        let mut omp_block_start = omp_thread_num * omp_block_stride;
3425        let omp_block_size = if omp_thread_num + 1 < omp_num_threads {
3426            omp_block_stride
3427        } else {
3428            block_size_usize - omp_block_start
3429        };
3430        omp_block_start += usize::try_from(block_start).expect("block_start must be non-negative");
3431
3432        let state = &mut thread_state[omp_thread_num];
3433        let (position, count) = partial_sorting_scan_left_to_right_8u_block_prepare(
3434            t,
3435            sa,
3436            k,
3437            &mut state.buckets,
3438            &mut state.cache,
3439            FastSint::try_from(omp_block_start).expect("block start must fit FastSint"),
3440            FastSint::try_from(omp_block_size).expect("block size must fit FastSint"),
3441        );
3442        state.position = position;
3443        state.count = count;
3444    }
3445
3446    let induction_offset = 4 * ALPHABET_SIZE;
3447    let distinct_offset = 2 * ALPHABET_SIZE;
3448    let (prefix, induction_tail) = buckets.split_at_mut(induction_offset);
3449    let induction_bucket = &mut induction_tail[..2 * k_usize];
3450    let distinct_names = &mut prefix[distinct_offset..distinct_offset + 2 * k_usize];
3451
3452    for tnum in 0..omp_num_threads {
3453        let state = &mut thread_state[tnum];
3454        let (temp_induction_bucket, temp_tail) = state.buckets.split_at_mut(2 * k_usize);
3455        let temp_distinct_names = &mut temp_tail[..2 * k_usize];
3456
3457        for c in 0..2 * k_usize {
3458            let a = induction_bucket[c];
3459            let b = temp_induction_bucket[c];
3460            induction_bucket[c] = a + b;
3461            temp_induction_bucket[c] = a;
3462        }
3463
3464        d -= 1;
3465        for c in 0..2 * k_usize {
3466            let a = distinct_names[c];
3467            let b = temp_distinct_names[c];
3468            let next_d = b + d;
3469            distinct_names[c] = if b > 0 { next_d } else { a };
3470            temp_distinct_names[c] = a;
3471        }
3472        d += 1 + SaSint::try_from(state.position).expect("position must fit SaSint");
3473        state.position = FastSint::try_from(d).expect("d must fit FastSint") - state.position;
3474    }
3475
3476    for tnum in 0..omp_num_threads {
3477        let state = &mut thread_state[tnum];
3478        partial_sorting_scan_left_to_right_8u_block_place(
3479            sa,
3480            &mut state.buckets,
3481            k,
3482            &state.cache,
3483            state.count,
3484            state.position as SaSint,
3485        );
3486    }
3487
3488    d
3489}
3490
3491/// Internal helper: partial sorting shift markers 8u (OpenMP variant).
3492#[doc(hidden)]
3493pub fn partial_sorting_shift_markers_8u_omp(
3494    sa: &mut [SaSint],
3495    n: SaSint,
3496    buckets: &[SaSint],
3497    threads: SaSint,
3498) {
3499    let temp_bucket = &buckets[4 * ALPHABET_SIZE..];
3500    let thread_count = if threads > 1 && n >= 65536 {
3501        usize::try_from(threads).expect("threads must be positive")
3502    } else {
3503        1
3504    };
3505    let c_step = buckets_index2(1, 0) as isize;
3506    let c_min = buckets_index2(1, 0) as isize;
3507    let c_max = buckets_index2(ALPHABET_SIZE - 1, 0) as isize;
3508    for t in 0..thread_count {
3509        let mut c = c_max - (t as isize * c_step);
3510        while c >= c_min {
3511            let c_usize = c as usize;
3512            let mut i = temp_bucket[c_usize] as isize - 1;
3513            let mut j = buckets[c_usize - buckets_index2(1, 0)] as isize + 3;
3514            let mut s = SAINT_MIN;
3515
3516            while i >= j {
3517                let p0 = sa[i as usize];
3518                let q0 = (p0 & SAINT_MIN) ^ s;
3519                s ^= q0;
3520                sa[i as usize] = p0 ^ q0;
3521
3522                let p1 = sa[(i - 1) as usize];
3523                let q1 = (p1 & SAINT_MIN) ^ s;
3524                s ^= q1;
3525                sa[(i - 1) as usize] = p1 ^ q1;
3526
3527                let p2 = sa[(i - 2) as usize];
3528                let q2 = (p2 & SAINT_MIN) ^ s;
3529                s ^= q2;
3530                sa[(i - 2) as usize] = p2 ^ q2;
3531
3532                let p3 = sa[(i - 3) as usize];
3533                let q3 = (p3 & SAINT_MIN) ^ s;
3534                s ^= q3;
3535                sa[(i - 3) as usize] = p3 ^ q3;
3536
3537                i -= 4;
3538            }
3539
3540            j -= 3;
3541            while i >= j {
3542                let p = sa[i as usize];
3543                let q = (p & SAINT_MIN) ^ s;
3544                s ^= q;
3545                sa[i as usize] = p ^ q;
3546                i -= 1;
3547            }
3548
3549            c -= c_step * thread_count as isize;
3550        }
3551    }
3552}
3553
3554/// Internal helper: partial sorting shift markers 32s 6k (OpenMP variant).
3555#[doc(hidden)]
3556pub fn partial_sorting_shift_markers_32s_6k_omp(
3557    sa: &mut [SaSint],
3558    k: SaSint,
3559    buckets: &[SaSint],
3560    threads: SaSint,
3561) {
3562    let k_usize = usize::try_from(k).expect("k must be non-negative");
3563    let temp_bucket = &buckets[4 * k_usize..];
3564    let thread_count = if threads > 1 && k >= 65536 {
3565        usize::try_from(threads).expect("threads must be positive")
3566    } else {
3567        1
3568    };
3569    for t in 0..thread_count {
3570        let mut c = k_usize as isize - 1 - t as isize;
3571        while c >= 1 {
3572            let c_usize = c as usize;
3573            let mut i = buckets[buckets_index4(c_usize, 0)] as isize - 1;
3574            let mut j = temp_bucket[buckets_index2(c_usize - 1, 0)] as isize + 3;
3575            let mut s = SAINT_MIN;
3576
3577            while i >= j {
3578                let p0 = sa[i as usize];
3579                let q0 = (p0 & SAINT_MIN) ^ s;
3580                s ^= q0;
3581                sa[i as usize] = p0 ^ q0;
3582
3583                let p1 = sa[(i - 1) as usize];
3584                let q1 = (p1 & SAINT_MIN) ^ s;
3585                s ^= q1;
3586                sa[(i - 1) as usize] = p1 ^ q1;
3587
3588                let p2 = sa[(i - 2) as usize];
3589                let q2 = (p2 & SAINT_MIN) ^ s;
3590                s ^= q2;
3591                sa[(i - 2) as usize] = p2 ^ q2;
3592
3593                let p3 = sa[(i - 3) as usize];
3594                let q3 = (p3 & SAINT_MIN) ^ s;
3595                s ^= q3;
3596                sa[(i - 3) as usize] = p3 ^ q3;
3597
3598                i -= 4;
3599            }
3600
3601            j -= 3;
3602            while i >= j {
3603                let p = sa[i as usize];
3604                let q = (p & SAINT_MIN) ^ s;
3605                s ^= q;
3606                sa[i as usize] = p ^ q;
3607                i -= 1;
3608            }
3609
3610            c -= thread_count as isize;
3611        }
3612    }
3613}
3614
3615/// Internal helper: partial sorting shift markers 32s 4k.
3616#[doc(hidden)]
3617pub fn partial_sorting_shift_markers_32s_4k(sa: &mut [SaSint], n: SaSint) {
3618    let mut i = n as isize - 1;
3619    let mut s = SUFFIX_GROUP_MARKER;
3620    while i >= 3 {
3621        let p0 = sa[i as usize];
3622        let q0 =
3623            ((p0 & SUFFIX_GROUP_MARKER) ^ s) & (((p0 > 0) as SaSint) << (SUFFIX_GROUP_BIT - 1));
3624        s ^= q0;
3625        sa[i as usize] = p0 ^ q0;
3626
3627        let p1 = sa[(i - 1) as usize];
3628        let q1 =
3629            ((p1 & SUFFIX_GROUP_MARKER) ^ s) & (((p1 > 0) as SaSint) << (SUFFIX_GROUP_BIT - 1));
3630        s ^= q1;
3631        sa[(i - 1) as usize] = p1 ^ q1;
3632
3633        let p2 = sa[(i - 2) as usize];
3634        let q2 =
3635            ((p2 & SUFFIX_GROUP_MARKER) ^ s) & (((p2 > 0) as SaSint) << (SUFFIX_GROUP_BIT - 1));
3636        s ^= q2;
3637        sa[(i - 2) as usize] = p2 ^ q2;
3638
3639        let p3 = sa[(i - 3) as usize];
3640        let q3 =
3641            ((p3 & SUFFIX_GROUP_MARKER) ^ s) & (((p3 > 0) as SaSint) << (SUFFIX_GROUP_BIT - 1));
3642        s ^= q3;
3643        sa[(i - 3) as usize] = p3 ^ q3;
3644
3645        i -= 4;
3646    }
3647
3648    while i >= 0 {
3649        let p = sa[i as usize];
3650        let q = ((p & SUFFIX_GROUP_MARKER) ^ s) & (((p > 0) as SaSint) << (SUFFIX_GROUP_BIT - 1));
3651        s ^= q;
3652        sa[i as usize] = p ^ q;
3653        i -= 1;
3654    }
3655}
3656
3657/// Internal helper: partial sorting shift buckets 32s 6k.
3658#[doc(hidden)]
3659pub fn partial_sorting_shift_buckets_32s_6k(k: SaSint, buckets: &mut [SaSint]) {
3660    let k_usize = usize::try_from(k).expect("k must be non-negative");
3661    let temp_offset = 4 * k_usize;
3662    for i in 0..k_usize {
3663        let src = buckets_index2(i, 0);
3664        let dst = 2 * src;
3665        buckets[dst] = buckets[temp_offset + src];
3666        buckets[dst + 1] = buckets[temp_offset + src + 1];
3667    }
3668}
3669
3670/// Internal helper: partial sorting scan right to left 8u.
3671#[doc(hidden)]
3672pub fn partial_sorting_scan_right_to_left_8u(
3673    t: &[u8],
3674    sa: &mut [SaSint],
3675    buckets: &mut [SaSint],
3676    mut d: SaSint,
3677    omp_block_start: FastSint,
3678    omp_block_size: FastSint,
3679) -> SaSint {
3680    if omp_block_size <= 0 {
3681        return d;
3682    }
3683
3684    let prefetch_distance = 64usize;
3685    let (induction_bucket, distinct_names_all) = buckets.split_at_mut(2 * ALPHABET_SIZE);
3686    let distinct_names = &mut distinct_names_all[..2 * ALPHABET_SIZE];
3687
3688    let start = usize::try_from(omp_block_start).expect("omp_block_start must be non-negative");
3689    let size = usize::try_from(omp_block_size).expect("omp_block_size must be non-negative");
3690    let mut i = start + size - 1;
3691    let mut j = start + prefetch_distance + 1;
3692
3693    while i >= j {
3694        let mut p0 = sa[i];
3695        d += SaSint::from(p0 < 0);
3696        p0 &= SAINT_MAX;
3697
3698        let p0_usize = p0 as usize;
3699        let v0 = buckets_index2(
3700            t[p0_usize - 1] as usize,
3701            usize::from(t[p0_usize - 2] > t[p0_usize - 1]),
3702        );
3703
3704        induction_bucket[v0] -= 1;
3705        let slot0 = induction_bucket[v0] as usize;
3706        sa[slot0] = (p0 - 1) | (((distinct_names[v0] != d) as SaSint) << (SAINT_BIT - 1));
3707        distinct_names[v0] = d;
3708
3709        let mut p1 = sa[i - 1];
3710        d += SaSint::from(p1 < 0);
3711        p1 &= SAINT_MAX;
3712
3713        let p1_usize = p1 as usize;
3714        let v1 = buckets_index2(
3715            t[p1_usize - 1] as usize,
3716            usize::from(t[p1_usize - 2] > t[p1_usize - 1]),
3717        );
3718
3719        induction_bucket[v1] -= 1;
3720        let slot1 = induction_bucket[v1] as usize;
3721        sa[slot1] = (p1 - 1) | (((distinct_names[v1] != d) as SaSint) << (SAINT_BIT - 1));
3722        distinct_names[v1] = d;
3723
3724        i -= 2;
3725    }
3726
3727    j = if start + prefetch_distance < start + size {
3728        start
3729    } else {
3730        start
3731    };
3732    while i >= j {
3733        let mut p = sa[i];
3734        d += SaSint::from(p < 0);
3735        p &= SAINT_MAX;
3736
3737        let p_usize = p as usize;
3738        let v = buckets_index2(
3739            t[p_usize - 1] as usize,
3740            usize::from(t[p_usize - 2] > t[p_usize - 1]),
3741        );
3742
3743        induction_bucket[v] -= 1;
3744        let slot = induction_bucket[v] as usize;
3745        sa[slot] = (p - 1) | (((distinct_names[v] != d) as SaSint) << (SAINT_BIT - 1));
3746        distinct_names[v] = d;
3747
3748        if i == 0 {
3749            break;
3750        }
3751        i -= 1;
3752    }
3753
3754    d
3755}
3756
3757/// Internal helper: partial gsa scan right to left 8u.
3758#[doc(hidden)]
3759pub fn partial_gsa_scan_right_to_left_8u(
3760    t: &[u8],
3761    sa: &mut [SaSint],
3762    buckets: &mut [SaSint],
3763    mut d: SaSint,
3764    omp_block_start: FastSint,
3765    omp_block_size: FastSint,
3766) -> SaSint {
3767    if omp_block_size <= 0 {
3768        return d;
3769    }
3770
3771    let prefetch_distance = 64usize;
3772    let (induction_bucket, distinct_names_all) = buckets.split_at_mut(2 * ALPHABET_SIZE);
3773    let distinct_names = &mut distinct_names_all[..2 * ALPHABET_SIZE];
3774
3775    let start = usize::try_from(omp_block_start).expect("omp_block_start must be non-negative");
3776    let size = usize::try_from(omp_block_size).expect("omp_block_size must be non-negative");
3777    let mut i = start + size - 1;
3778    let mut j = start + prefetch_distance + 1;
3779
3780    while i >= j {
3781        let mut p0 = sa[i];
3782        d += SaSint::from(p0 < 0);
3783        p0 &= SAINT_MAX;
3784
3785        let p0_usize = p0 as usize;
3786        let v0 = buckets_index2(
3787            t[p0_usize - 1] as usize,
3788            usize::from(t[p0_usize - 2] > t[p0_usize - 1]),
3789        );
3790
3791        if v0 != 1 {
3792            induction_bucket[v0] -= 1;
3793            let slot0 = induction_bucket[v0] as usize;
3794            sa[slot0] = (p0 - 1) | (((distinct_names[v0] != d) as SaSint) << (SAINT_BIT - 1));
3795            distinct_names[v0] = d;
3796        }
3797
3798        let mut p1 = sa[i - 1];
3799        d += SaSint::from(p1 < 0);
3800        p1 &= SAINT_MAX;
3801
3802        let p1_usize = p1 as usize;
3803        let v1 = buckets_index2(
3804            t[p1_usize - 1] as usize,
3805            usize::from(t[p1_usize - 2] > t[p1_usize - 1]),
3806        );
3807
3808        if v1 != 1 {
3809            induction_bucket[v1] -= 1;
3810            let slot1 = induction_bucket[v1] as usize;
3811            sa[slot1] = (p1 - 1) | (((distinct_names[v1] != d) as SaSint) << (SAINT_BIT - 1));
3812            distinct_names[v1] = d;
3813        }
3814
3815        i -= 2;
3816    }
3817
3818    j = start;
3819    while i >= j {
3820        let mut p = sa[i];
3821        d += SaSint::from(p < 0);
3822        p &= SAINT_MAX;
3823
3824        let p_usize = p as usize;
3825        let v = buckets_index2(
3826            t[p_usize - 1] as usize,
3827            usize::from(t[p_usize - 2] > t[p_usize - 1]),
3828        );
3829
3830        if v != 1 {
3831            induction_bucket[v] -= 1;
3832            let slot = induction_bucket[v] as usize;
3833            sa[slot] = (p - 1) | (((distinct_names[v] != d) as SaSint) << (SAINT_BIT - 1));
3834            distinct_names[v] = d;
3835        }
3836
3837        if i == 0 {
3838            break;
3839        }
3840        i -= 1;
3841    }
3842
3843    d
3844}
3845
3846/// Internal helper: partial sorting scan right to left 8u block prepare.
3847#[doc(hidden)]
3848pub fn partial_sorting_scan_right_to_left_8u_block_prepare(
3849    t: &[u8],
3850    sa: &[SaSint],
3851    k: SaSint,
3852    buckets: &mut [SaSint],
3853    cache: &mut [ThreadCache],
3854    omp_block_start: FastSint,
3855    omp_block_size: FastSint,
3856) -> (FastSint, FastSint) {
3857    let k_usize = usize::try_from(k).expect("k must be non-negative");
3858    let (induction_bucket, distinct_names_all) = buckets.split_at_mut(2 * k_usize);
3859    let distinct_names = &mut distinct_names_all[..2 * k_usize];
3860    induction_bucket.fill(0);
3861    distinct_names.fill(0);
3862
3863    let start = usize::try_from(omp_block_start).expect("omp_block_start must be non-negative");
3864    let size = usize::try_from(omp_block_size).expect("omp_block_size must be non-negative");
3865    let mut count = 0usize;
3866    let mut d = 1;
3867
3868    let mut i = start + size;
3869    while i > start {
3870        i -= 1;
3871
3872        let mut p = sa[i];
3873        cache[count].index = p;
3874        d += SaSint::from(p < 0);
3875        p &= SAINT_MAX;
3876
3877        let p_usize = usize::try_from(p).expect("suffix index must be non-negative");
3878        let v = buckets_index2(
3879            t[p_usize - 1] as usize,
3880            usize::from(t[p_usize - 2] > t[p_usize - 1]),
3881        );
3882
3883        cache[count].symbol = v as SaSint;
3884        induction_bucket[v] += 1;
3885        distinct_names[v] = d;
3886        count += 1;
3887    }
3888
3889    ((d - 1) as FastSint, count as FastSint)
3890}
3891
3892/// Internal helper: partial sorting scan right to left 8u block place.
3893#[doc(hidden)]
3894pub fn partial_sorting_scan_right_to_left_8u_block_place(
3895    sa: &mut [SaSint],
3896    buckets: &mut [SaSint],
3897    k: SaSint,
3898    cache: &[ThreadCache],
3899    count: FastSint,
3900    mut d: SaSint,
3901) {
3902    let split = 2 * usize::try_from(k).expect("k must be non-negative");
3903    let (induction_bucket, distinct_names) = buckets.split_at_mut(split);
3904
3905    let count = usize::try_from(count).expect("count must be non-negative");
3906    for entry in &cache[..count] {
3907        let p = entry.index;
3908        d += SaSint::from(p < 0);
3909        let v = usize::try_from(entry.symbol).expect("cache symbol must be non-negative");
3910        induction_bucket[v] -= 1;
3911        let slot = usize::try_from(induction_bucket[v]).expect("bucket slot must be non-negative");
3912        sa[slot] = (p - 1) | (((distinct_names[v] != d) as SaSint) << (SAINT_BIT - 1));
3913        distinct_names[v] = d;
3914    }
3915}
3916
3917/// Internal helper: partial gsa scan right to left 8u block place.
3918#[doc(hidden)]
3919pub fn partial_gsa_scan_right_to_left_8u_block_place(
3920    sa: &mut [SaSint],
3921    buckets: &mut [SaSint],
3922    k: SaSint,
3923    cache: &[ThreadCache],
3924    count: FastSint,
3925    mut d: SaSint,
3926) {
3927    let split = 2 * usize::try_from(k).expect("k must be non-negative");
3928    let (induction_bucket, distinct_names) = buckets.split_at_mut(split);
3929
3930    let count = usize::try_from(count).expect("count must be non-negative");
3931    for entry in &cache[..count] {
3932        let p = entry.index;
3933        d += SaSint::from(p < 0);
3934        let v = usize::try_from(entry.symbol).expect("cache symbol must be non-negative");
3935        if v != 1 {
3936            induction_bucket[v] -= 1;
3937            let slot =
3938                usize::try_from(induction_bucket[v]).expect("bucket slot must be non-negative");
3939            sa[slot] = (p - 1) | (((distinct_names[v] != d) as SaSint) << (SAINT_BIT - 1));
3940            distinct_names[v] = d;
3941        }
3942    }
3943}
3944
3945/// Internal helper: partial sorting scan right to left 8u block (OpenMP variant).
3946#[doc(hidden)]
3947pub fn partial_sorting_scan_right_to_left_8u_block_omp(
3948    t: &[u8],
3949    sa: &mut [SaSint],
3950    k: SaSint,
3951    buckets: &mut [SaSint],
3952    d: SaSint,
3953    block_start: FastSint,
3954    block_size: FastSint,
3955    threads: SaSint,
3956    thread_state: &mut [ThreadState],
3957) -> SaSint {
3958    let mut d = d;
3959    let block_size_usize = usize::try_from(block_size).expect("block_size must be non-negative");
3960    let k_usize = usize::try_from(k).expect("k must be non-negative");
3961    let omp_num_threads = if threads > 1 && block_size_usize >= 64 * k_usize.max(256) {
3962        usize::try_from(threads)
3963            .expect("threads must be non-negative")
3964            .min(thread_state.len())
3965            .max(1)
3966    } else {
3967        1
3968    };
3969    let omp_block_stride = (block_size_usize / omp_num_threads) & !15usize;
3970
3971    if omp_num_threads == 1 {
3972        return partial_sorting_scan_right_to_left_8u(t, sa, buckets, d, block_start, block_size);
3973    }
3974
3975    for omp_thread_num in 0..omp_num_threads {
3976        let mut omp_block_start = omp_thread_num * omp_block_stride;
3977        let omp_block_size = if omp_thread_num + 1 < omp_num_threads {
3978            omp_block_stride
3979        } else {
3980            block_size_usize - omp_block_start
3981        };
3982        omp_block_start += usize::try_from(block_start).expect("block_start must be non-negative");
3983
3984        let state = &mut thread_state[omp_thread_num];
3985        let (position, count) = partial_sorting_scan_right_to_left_8u_block_prepare(
3986            t,
3987            sa,
3988            k,
3989            &mut state.buckets,
3990            &mut state.cache,
3991            FastSint::try_from(omp_block_start).expect("block start must fit FastSint"),
3992            FastSint::try_from(omp_block_size).expect("block size must fit FastSint"),
3993        );
3994        state.position = position;
3995        state.count = count;
3996    }
3997
3998    let distinct_offset = 2 * ALPHABET_SIZE;
3999    let (induction_bucket, distinct_tail) = buckets.split_at_mut(distinct_offset);
4000    let distinct_names = &mut distinct_tail[..2 * k_usize];
4001
4002    for tnum in (0..omp_num_threads).rev() {
4003        let state = &mut thread_state[tnum];
4004        let (temp_induction_bucket, temp_tail) = state.buckets.split_at_mut(2 * k_usize);
4005        let temp_distinct_names = &mut temp_tail[..2 * k_usize];
4006
4007        for c in 0..2 * k_usize {
4008            let a = induction_bucket[c];
4009            let b = temp_induction_bucket[c];
4010            induction_bucket[c] = a - b;
4011            temp_induction_bucket[c] = a;
4012        }
4013
4014        d -= 1;
4015        for c in 0..2 * k_usize {
4016            let a = distinct_names[c];
4017            let b = temp_distinct_names[c];
4018            let next_d = b + d;
4019            distinct_names[c] = if b > 0 { next_d } else { a };
4020            temp_distinct_names[c] = a;
4021        }
4022        d += 1 + SaSint::try_from(state.position).expect("position must fit SaSint");
4023        state.position = FastSint::try_from(d).expect("d must fit FastSint") - state.position;
4024    }
4025
4026    for tnum in 0..omp_num_threads {
4027        let state = &mut thread_state[tnum];
4028        partial_sorting_scan_right_to_left_8u_block_place(
4029            sa,
4030            &mut state.buckets,
4031            k,
4032            &state.cache,
4033            state.count,
4034            state.position as SaSint,
4035        );
4036    }
4037
4038    d
4039}
4040
4041/// Internal helper: partial gsa scan right to left 8u block (OpenMP variant).
4042#[doc(hidden)]
4043pub fn partial_gsa_scan_right_to_left_8u_block_omp(
4044    t: &[u8],
4045    sa: &mut [SaSint],
4046    k: SaSint,
4047    buckets: &mut [SaSint],
4048    d: SaSint,
4049    block_start: FastSint,
4050    block_size: FastSint,
4051    threads: SaSint,
4052    thread_state: &mut [ThreadState],
4053) -> SaSint {
4054    let mut d = d;
4055    let block_size_usize = usize::try_from(block_size).expect("block_size must be non-negative");
4056    let k_usize = usize::try_from(k).expect("k must be non-negative");
4057    let omp_num_threads = if threads > 1 && block_size_usize >= 64 * k_usize.max(256) {
4058        usize::try_from(threads)
4059            .expect("threads must be non-negative")
4060            .min(thread_state.len())
4061            .max(1)
4062    } else {
4063        1
4064    };
4065    let omp_block_stride = (block_size_usize / omp_num_threads) & !15usize;
4066
4067    if omp_num_threads == 1 {
4068        return partial_gsa_scan_right_to_left_8u(t, sa, buckets, d, block_start, block_size);
4069    }
4070
4071    for omp_thread_num in 0..omp_num_threads {
4072        let mut omp_block_start = omp_thread_num * omp_block_stride;
4073        let omp_block_size = if omp_thread_num + 1 < omp_num_threads {
4074            omp_block_stride
4075        } else {
4076            block_size_usize - omp_block_start
4077        };
4078        omp_block_start += usize::try_from(block_start).expect("block_start must be non-negative");
4079
4080        let state = &mut thread_state[omp_thread_num];
4081        let (position, count) = partial_sorting_scan_right_to_left_8u_block_prepare(
4082            t,
4083            sa,
4084            k,
4085            &mut state.buckets,
4086            &mut state.cache,
4087            FastSint::try_from(omp_block_start).expect("block start must fit FastSint"),
4088            FastSint::try_from(omp_block_size).expect("block size must fit FastSint"),
4089        );
4090        state.position = position;
4091        state.count = count;
4092    }
4093
4094    let distinct_offset = 2 * ALPHABET_SIZE;
4095    let (induction_bucket, distinct_tail) = buckets.split_at_mut(distinct_offset);
4096    let distinct_names = &mut distinct_tail[..2 * k_usize];
4097
4098    for tnum in (0..omp_num_threads).rev() {
4099        let state = &mut thread_state[tnum];
4100        let (temp_induction_bucket, temp_tail) = state.buckets.split_at_mut(2 * k_usize);
4101        let temp_distinct_names = &mut temp_tail[..2 * k_usize];
4102
4103        for c in 0..2 * k_usize {
4104            let a = induction_bucket[c];
4105            let b = temp_induction_bucket[c];
4106            induction_bucket[c] = a - b;
4107            temp_induction_bucket[c] = a;
4108        }
4109
4110        d -= 1;
4111        for c in 0..2 * k_usize {
4112            let a = distinct_names[c];
4113            let b = temp_distinct_names[c];
4114            let next_d = b + d;
4115            distinct_names[c] = if b > 0 { next_d } else { a };
4116            temp_distinct_names[c] = a;
4117        }
4118        d += 1 + SaSint::try_from(state.position).expect("position must fit SaSint");
4119        state.position = FastSint::try_from(d).expect("d must fit FastSint") - state.position;
4120    }
4121
4122    for tnum in 0..omp_num_threads {
4123        let state = &mut thread_state[tnum];
4124        partial_gsa_scan_right_to_left_8u_block_place(
4125            sa,
4126            &mut state.buckets,
4127            k,
4128            &state.cache,
4129            state.count,
4130            state.position as SaSint,
4131        );
4132    }
4133
4134    d
4135}
4136
4137/// Internal helper: partial sorting scan right to left 8u (OpenMP variant).
4138#[doc(hidden)]
4139pub fn partial_sorting_scan_right_to_left_8u_omp(
4140    t: &[u8],
4141    sa: &mut [SaSint],
4142    n: SaSint,
4143    k: SaSint,
4144    buckets: &mut [SaSint],
4145    first_lms_suffix: SaSint,
4146    left_suffixes_count: SaSint,
4147    mut d: SaSint,
4148    threads: SaSint,
4149    thread_state: &mut [ThreadState],
4150) {
4151    let scan_start = left_suffixes_count as FastSint + 1;
4152    let scan_end = n as FastSint - first_lms_suffix as FastSint;
4153
4154    if threads == 1 || (scan_end - scan_start) < 65_536 {
4155        let _ = partial_sorting_scan_right_to_left_8u(
4156            t,
4157            sa,
4158            buckets,
4159            d,
4160            scan_start,
4161            scan_end - scan_start,
4162        );
4163        return;
4164    }
4165
4166    let distinct_offset = 2 * ALPHABET_SIZE;
4167
4168    let mut block_start = usize::try_from(scan_end - 1).expect("scan end must be positive");
4169    let scan_start_usize = usize::try_from(scan_start).expect("scan_start must be non-negative");
4170    let threads_usize = usize::try_from(threads)
4171        .expect("threads must be non-negative")
4172        .min(thread_state.len())
4173        .max(1);
4174
4175    while block_start >= scan_start_usize {
4176        if sa[block_start] == 0 {
4177            if block_start == 0 {
4178                break;
4179            }
4180            block_start -= 1;
4181        } else {
4182            let mut block_max_end = block_start.saturating_sub(
4183                threads_usize * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * threads_usize),
4184            );
4185            if block_max_end + 1 < scan_start_usize {
4186                block_max_end = scan_start_usize.saturating_sub(1);
4187            }
4188            let mut block_end = block_start - 1;
4189            while block_end > block_max_end && sa[block_end] != 0 {
4190                block_end -= 1;
4191            }
4192            let block_size = block_start - block_end;
4193
4194            if block_size < 32 {
4195                while block_start > block_end {
4196                    let p = sa[block_start];
4197                    d += SaSint::from(p < 0);
4198                    let p = p & SAINT_MAX;
4199                    let v = buckets_index2(
4200                        t[(p - 1) as usize] as usize,
4201                        usize::from(t[(p - 2) as usize] > t[(p - 1) as usize]),
4202                    );
4203                    buckets[v] -= 1;
4204                    let slot =
4205                        usize::try_from(buckets[v]).expect("bucket slot must be non-negative");
4206                    sa[slot] = (p - 1)
4207                        | (((buckets[distinct_offset + v] != d) as SaSint) << (SAINT_BIT - 1));
4208                    buckets[distinct_offset + v] = d;
4209
4210                    if block_start == 0 {
4211                        break;
4212                    }
4213                    block_start -= 1;
4214                }
4215            } else {
4216                d = partial_sorting_scan_right_to_left_8u_block_omp(
4217                    t,
4218                    sa,
4219                    k,
4220                    buckets,
4221                    d,
4222                    FastSint::try_from(block_end + 1).expect("block start must fit FastSint"),
4223                    FastSint::try_from(block_size).expect("block size must fit FastSint"),
4224                    threads,
4225                    thread_state,
4226                );
4227                block_start = block_end;
4228            }
4229        }
4230    }
4231}
4232
4233/// Internal helper: partial gsa scan right to left 8u (OpenMP variant).
4234#[doc(hidden)]
4235pub fn partial_gsa_scan_right_to_left_8u_omp(
4236    t: &[u8],
4237    sa: &mut [SaSint],
4238    n: SaSint,
4239    k: SaSint,
4240    buckets: &mut [SaSint],
4241    first_lms_suffix: SaSint,
4242    left_suffixes_count: SaSint,
4243    mut d: SaSint,
4244    threads: SaSint,
4245    thread_state: &mut [ThreadState],
4246) {
4247    let scan_start = left_suffixes_count as FastSint + 1;
4248    let scan_end = n as FastSint - first_lms_suffix as FastSint;
4249
4250    if threads == 1 || (scan_end - scan_start) < 65_536 {
4251        let _ =
4252            partial_gsa_scan_right_to_left_8u(t, sa, buckets, d, scan_start, scan_end - scan_start);
4253        return;
4254    }
4255
4256    let distinct_offset = 2 * ALPHABET_SIZE;
4257    let mut block_start = usize::try_from(scan_end - 1).expect("scan end must be positive");
4258    let scan_start_usize = usize::try_from(scan_start).expect("scan_start must be non-negative");
4259    let threads_usize = usize::try_from(threads)
4260        .expect("threads must be non-negative")
4261        .min(thread_state.len())
4262        .max(1);
4263
4264    while block_start >= scan_start_usize {
4265        if sa[block_start] == 0 {
4266            if block_start == 0 {
4267                break;
4268            }
4269            block_start -= 1;
4270        } else {
4271            let mut block_max_end = block_start.saturating_sub(
4272                threads_usize * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * threads_usize),
4273            );
4274            if block_max_end + 1 < scan_start_usize {
4275                block_max_end = scan_start_usize.saturating_sub(1);
4276            }
4277            let mut block_end = block_start - 1;
4278            while block_end > block_max_end && sa[block_end] != 0 {
4279                block_end -= 1;
4280            }
4281            let block_size = block_start - block_end;
4282
4283            if block_size < 32 {
4284                while block_start > block_end {
4285                    let p = sa[block_start];
4286                    d += SaSint::from(p < 0);
4287                    let p = p & SAINT_MAX;
4288                    let v = buckets_index2(
4289                        t[(p - 1) as usize] as usize,
4290                        usize::from(t[(p - 2) as usize] > t[(p - 1) as usize]),
4291                    );
4292                    if v != 1 {
4293                        buckets[v] -= 1;
4294                        let slot =
4295                            usize::try_from(buckets[v]).expect("bucket slot must be non-negative");
4296                        sa[slot] = (p - 1)
4297                            | (((buckets[distinct_offset + v] != d) as SaSint) << (SAINT_BIT - 1));
4298                        buckets[distinct_offset + v] = d;
4299                    }
4300
4301                    if block_start == 0 {
4302                        break;
4303                    }
4304                    block_start -= 1;
4305                }
4306            } else {
4307                d = partial_gsa_scan_right_to_left_8u_block_omp(
4308                    t,
4309                    sa,
4310                    k,
4311                    buckets,
4312                    d,
4313                    FastSint::try_from(block_end + 1).expect("block start must fit FastSint"),
4314                    FastSint::try_from(block_size).expect("block size must fit FastSint"),
4315                    threads,
4316                    thread_state,
4317                );
4318                block_start = block_end;
4319            }
4320        }
4321    }
4322}
4323
4324/// Internal helper: partial sorting scan right to left 32s 6k.
4325#[doc(hidden)]
4326pub fn partial_sorting_scan_right_to_left_32s_6k(
4327    t: &[SaSint],
4328    sa: &mut [SaSint],
4329    buckets: &mut [SaSint],
4330    mut d: SaSint,
4331    omp_block_start: FastSint,
4332    omp_block_size: FastSint,
4333) -> SaSint {
4334    if omp_block_size <= 0 {
4335        return d;
4336    }
4337
4338    let prefetch_distance: FastSint = 64;
4339    let mut i = omp_block_start + omp_block_size - 1;
4340    let mut j = omp_block_start + 2 * prefetch_distance + 1;
4341
4342    while i >= j {
4343        let mut p0 = sa[i as usize];
4344        d += SaSint::from(p0 < 0);
4345        p0 &= SAINT_MAX;
4346        let p0u = p0 as usize;
4347        let v0 = buckets_index4(t[p0u - 1] as usize, usize::from(t[p0u - 2] > t[p0u - 1]));
4348        buckets[v0] -= 1;
4349        let slot0 = buckets[v0] as usize;
4350        sa[slot0] = (p0 - 1) | (((buckets[2 + v0] != d) as SaSint) << (SAINT_BIT - 1));
4351        buckets[2 + v0] = d;
4352
4353        let mut p1 = sa[(i - 1) as usize];
4354        d += SaSint::from(p1 < 0);
4355        p1 &= SAINT_MAX;
4356        let p1u = p1 as usize;
4357        let v1 = buckets_index4(t[p1u - 1] as usize, usize::from(t[p1u - 2] > t[p1u - 1]));
4358        buckets[v1] -= 1;
4359        let slot1 = buckets[v1] as usize;
4360        sa[slot1] = (p1 - 1) | (((buckets[2 + v1] != d) as SaSint) << (SAINT_BIT - 1));
4361        buckets[2 + v1] = d;
4362
4363        i -= 2;
4364    }
4365
4366    j -= 2 * prefetch_distance + 1;
4367    while i >= j {
4368        let mut p = sa[i as usize];
4369        d += SaSint::from(p < 0);
4370        p &= SAINT_MAX;
4371        let pu = p as usize;
4372        let v = buckets_index4(t[pu - 1] as usize, usize::from(t[pu - 2] > t[pu - 1]));
4373
4374        buckets[v] -= 1;
4375        let slot = buckets[v] as usize;
4376        sa[slot] = (p - 1) | (((buckets[2 + v] != d) as SaSint) << (SAINT_BIT - 1));
4377        buckets[2 + v] = d;
4378        i -= 1;
4379    }
4380
4381    d
4382}
4383
4384/// Internal helper: partial sorting scan right to left 32s 4k.
4385#[doc(hidden)]
4386pub fn partial_sorting_scan_right_to_left_32s_4k(
4387    t: &[SaSint],
4388    sa: &mut [SaSint],
4389    k: SaSint,
4390    buckets: &mut [SaSint],
4391    mut d: SaSint,
4392    omp_block_start: FastSint,
4393    omp_block_size: FastSint,
4394) -> SaSint {
4395    if omp_block_size <= 0 {
4396        return d;
4397    }
4398
4399    let k_usize = usize::try_from(k).expect("k must be non-negative");
4400    let prefetch_distance: FastSint = 64;
4401    let induction_offset = 3 * k_usize;
4402
4403    let mut i = omp_block_start + omp_block_size - 1;
4404    let mut j = omp_block_start + 2 * prefetch_distance + 1;
4405
4406    while i >= j {
4407        let i0 = i as usize;
4408        let mut p0 = sa[i0];
4409        if p0 > 0 {
4410            sa[i0] = 0;
4411            d += p0 >> (SUFFIX_GROUP_BIT - 1);
4412            p0 &= !SUFFIX_GROUP_MARKER;
4413
4414            let p0u = p0 as usize;
4415            let c0 = t[p0u - 1];
4416            let f0 = usize::from(t[p0u - 2] > c0);
4417            let v0 = buckets_index2(c0 as usize, f0);
4418            let c0u = c0 as usize;
4419            buckets[induction_offset + c0u] -= 1;
4420            let slot0 = buckets[induction_offset + c0u] as usize;
4421            sa[slot0] = (p0 - 1)
4422                | ((f0 as SaSint) << (SAINT_BIT - 1))
4423                | (((buckets[v0] != d) as SaSint) << (SUFFIX_GROUP_BIT - 1));
4424            buckets[v0] = d;
4425        }
4426
4427        let i1 = (i - 1) as usize;
4428        let mut p1 = sa[i1];
4429        if p1 > 0 {
4430            sa[i1] = 0;
4431            d += p1 >> (SUFFIX_GROUP_BIT - 1);
4432            p1 &= !SUFFIX_GROUP_MARKER;
4433
4434            let p1u = p1 as usize;
4435            let c1 = t[p1u - 1];
4436            let f1 = usize::from(t[p1u - 2] > c1);
4437            let v1 = buckets_index2(c1 as usize, f1);
4438            let c1u = c1 as usize;
4439            buckets[induction_offset + c1u] -= 1;
4440            let slot1 = buckets[induction_offset + c1u] as usize;
4441            sa[slot1] = (p1 - 1)
4442                | ((f1 as SaSint) << (SAINT_BIT - 1))
4443                | (((buckets[v1] != d) as SaSint) << (SUFFIX_GROUP_BIT - 1));
4444            buckets[v1] = d;
4445        }
4446
4447        i -= 2;
4448    }
4449
4450    j -= 2 * prefetch_distance + 1;
4451    while i >= j {
4452        let iu = i as usize;
4453        let mut p = sa[iu];
4454        if p > 0 {
4455            sa[iu] = 0;
4456            d += p >> (SUFFIX_GROUP_BIT - 1);
4457            p &= !SUFFIX_GROUP_MARKER;
4458
4459            let pu = p as usize;
4460            let c = t[pu - 1];
4461            let f = usize::from(t[pu - 2] > c);
4462            let v = buckets_index2(c as usize, f);
4463            let cu = c as usize;
4464            buckets[induction_offset + cu] -= 1;
4465            let slot = buckets[induction_offset + cu] as usize;
4466            sa[slot] = (p - 1)
4467                | ((f as SaSint) << (SAINT_BIT - 1))
4468                | (((buckets[v] != d) as SaSint) << (SUFFIX_GROUP_BIT - 1));
4469            buckets[v] = d;
4470        }
4471        i -= 1;
4472    }
4473
4474    d
4475}
4476
4477/// Internal helper: partial sorting scan right to left 32s 1k.
4478#[doc(hidden)]
4479pub fn partial_sorting_scan_right_to_left_32s_1k(
4480    t: &[SaSint],
4481    sa: &mut [SaSint],
4482    induction_bucket: &mut [SaSint],
4483    omp_block_start: FastSint,
4484    omp_block_size: FastSint,
4485) {
4486    if omp_block_size <= 0 {
4487        return;
4488    }
4489
4490    let prefetch_distance = 64usize;
4491    let start = usize::try_from(omp_block_start).expect("omp_block_start must be non-negative");
4492    let size = usize::try_from(omp_block_size).expect("omp_block_size must be non-negative");
4493    let mut i = (start + size - 1) as isize;
4494    let mut j = (start + 2 * prefetch_distance + 1) as isize;
4495
4496    while i >= j {
4497        let p0 = sa[i as usize];
4498        if p0 > 0 {
4499            sa[i as usize] = 0;
4500            let p0_usize = usize::try_from(p0).expect("suffix index must be non-negative");
4501            let bucket_index0 =
4502                usize::try_from(t[p0_usize - 1]).expect("bucket symbol must be non-negative");
4503            induction_bucket[bucket_index0] -= 1;
4504            let slot0 = usize::try_from(induction_bucket[bucket_index0])
4505                .expect("bucket slot must be non-negative");
4506            sa[slot0] = (p0 - 1)
4507                | ((usize::from(t[p0_usize - 2] > t[p0_usize - 1]) as SaSint) << (SAINT_BIT - 1));
4508        }
4509        let p1 = sa[(i - 1) as usize];
4510        if p1 > 0 {
4511            sa[(i - 1) as usize] = 0;
4512            let p1_usize = usize::try_from(p1).expect("suffix index must be non-negative");
4513            let bucket_index1 =
4514                usize::try_from(t[p1_usize - 1]).expect("bucket symbol must be non-negative");
4515            induction_bucket[bucket_index1] -= 1;
4516            let slot1 = usize::try_from(induction_bucket[bucket_index1])
4517                .expect("bucket slot must be non-negative");
4518            sa[slot1] = (p1 - 1)
4519                | ((usize::from(t[p1_usize - 2] > t[p1_usize - 1]) as SaSint) << (SAINT_BIT - 1));
4520        }
4521
4522        i -= 2;
4523    }
4524
4525    j -= (2 * prefetch_distance + 1) as isize;
4526    while i >= j {
4527        let p = sa[i as usize];
4528        if p > 0 {
4529            sa[i as usize] = 0;
4530            let p_usize = usize::try_from(p).expect("suffix index must be non-negative");
4531            let bucket_index =
4532                usize::try_from(t[p_usize - 1]).expect("bucket symbol must be non-negative");
4533            induction_bucket[bucket_index] -= 1;
4534            let slot = usize::try_from(induction_bucket[bucket_index])
4535                .expect("bucket slot must be non-negative");
4536            sa[slot] = (p - 1)
4537                | ((usize::from(t[p_usize - 2] > t[p_usize - 1]) as SaSint) << (SAINT_BIT - 1));
4538        }
4539        if i == 0 {
4540            break;
4541        }
4542        i -= 1;
4543    }
4544}
4545
4546/// Internal helper: partial sorting scan right to left 32s 6k block gather.
4547#[doc(hidden)]
4548pub fn partial_sorting_scan_right_to_left_32s_6k_block_gather(
4549    t: &[SaSint],
4550    sa: &[SaSint],
4551    cache: &mut [ThreadCache],
4552    omp_block_start: FastSint,
4553    omp_block_size: FastSint,
4554) {
4555    if omp_block_size <= 0 {
4556        return;
4557    }
4558
4559    let start = usize::try_from(omp_block_start).expect("omp_block_start must be non-negative");
4560    let size = usize::try_from(omp_block_size).expect("omp_block_size must be non-negative");
4561    for offset in 0..size {
4562        let i = start + offset;
4563        let mut p = sa[i];
4564        let mut symbol = 0usize;
4565        p &= SAINT_MAX;
4566        if p != 0 {
4567            let p_usize = usize::try_from(p).expect("suffix index must be non-negative");
4568            symbol = buckets_index4(
4569                usize::try_from(t[p_usize - 1]).expect("bucket symbol must be non-negative"),
4570                usize::from(t[p_usize - 2] > t[p_usize - 1]),
4571            );
4572        }
4573        cache[offset].index = sa[i];
4574        cache[offset].symbol = symbol as SaSint;
4575    }
4576}
4577
4578/// Internal helper: partial sorting scan right to left 32s 4k block gather.
4579#[doc(hidden)]
4580pub fn partial_sorting_scan_right_to_left_32s_4k_block_gather(
4581    t: &[SaSint],
4582    sa: &mut [SaSint],
4583    cache: &mut [ThreadCache],
4584    omp_block_start: FastSint,
4585    omp_block_size: FastSint,
4586) {
4587    if omp_block_size <= 0 {
4588        return;
4589    }
4590
4591    let start = usize::try_from(omp_block_start).expect("omp_block_start must be non-negative");
4592    let size = usize::try_from(omp_block_size).expect("omp_block_size must be non-negative");
4593    for offset in 0..size {
4594        let i = start + offset;
4595        let mut symbol = SAINT_MIN;
4596        let mut p = sa[i];
4597        if p > 0 {
4598            sa[i] = 0;
4599            cache[offset].index = p;
4600            p &= !SUFFIX_GROUP_MARKER;
4601            let p_usize = usize::try_from(p).expect("suffix index must be non-negative");
4602            symbol = buckets_index2(
4603                usize::try_from(t[p_usize - 1]).expect("bucket symbol must be non-negative"),
4604                usize::from(t[p_usize - 2] > t[p_usize - 1]),
4605            ) as SaSint;
4606        }
4607        cache[offset].symbol = symbol;
4608    }
4609}
4610
4611/// Internal helper: partial sorting scan right to left 32s 1k block gather.
4612#[doc(hidden)]
4613pub fn partial_sorting_scan_right_to_left_32s_1k_block_gather(
4614    t: &[SaSint],
4615    sa: &mut [SaSint],
4616    cache: &mut [ThreadCache],
4617    omp_block_start: FastSint,
4618    omp_block_size: FastSint,
4619) {
4620    if omp_block_size <= 0 {
4621        return;
4622    }
4623    let start = usize::try_from(omp_block_start).expect("omp_block_start must be non-negative");
4624    let size = usize::try_from(omp_block_size).expect("omp_block_size must be non-negative");
4625    for offset in 0..size {
4626        let i = start + offset;
4627        let mut symbol = SAINT_MIN;
4628        let p = sa[i];
4629        if p > 0 {
4630            sa[i] = 0;
4631            cache[offset].index = (p - 1)
4632                | ((usize::from(t[p as usize - 2] > t[p as usize - 1]) as SaSint)
4633                    << (SAINT_BIT - 1));
4634            symbol = t[p as usize - 1];
4635        }
4636        cache[offset].symbol = symbol;
4637    }
4638}
4639
4640/// Internal helper: partial sorting scan right to left 32s 6k block sort.
4641#[doc(hidden)]
4642pub fn partial_sorting_scan_right_to_left_32s_6k_block_sort(
4643    t: &[SaSint],
4644    buckets: &mut [SaSint],
4645    mut d: SaSint,
4646    cache: &mut [ThreadCache],
4647    omp_block_start: FastSint,
4648    omp_block_size: FastSint,
4649) -> SaSint {
4650    if omp_block_size <= 0 {
4651        return d;
4652    }
4653
4654    let size = usize::try_from(omp_block_size).expect("omp_block_size must be non-negative");
4655    let mut i = size;
4656    while i > 0 {
4657        i -= 1;
4658
4659        let v = usize::try_from(cache[i].symbol).expect("cache symbol must be non-negative");
4660        let p = cache[i].index;
4661        d += SaSint::from(p < 0);
4662        buckets[v] -= 1;
4663        let target = buckets[v];
4664        cache[i].symbol = target;
4665        cache[i].index = (p - 1) | (((buckets[2 + v] != d) as SaSint) << (SAINT_BIT - 1));
4666        buckets[2 + v] = d;
4667
4668        if target >= omp_block_start as SaSint
4669            && target < (omp_block_start + omp_block_size) as SaSint
4670        {
4671            let s = usize::try_from(target - omp_block_start as SaSint)
4672                .expect("cache slot must be non-negative");
4673            let q = cache[i].index & SAINT_MAX;
4674            let q_usize = usize::try_from(q).expect("suffix index must be non-negative");
4675            cache[s].index = cache[i].index;
4676            cache[s].symbol = buckets_index4(
4677                usize::try_from(t[q_usize - 1]).expect("bucket symbol must be non-negative"),
4678                usize::from(t[q_usize - 2] > t[q_usize - 1]),
4679            ) as SaSint;
4680        }
4681    }
4682
4683    d
4684}
4685
4686/// Internal helper: partial sorting scan right to left 32s 4k block sort.
4687#[doc(hidden)]
4688pub fn partial_sorting_scan_right_to_left_32s_4k_block_sort(
4689    t: &[SaSint],
4690    k: SaSint,
4691    buckets: &mut [SaSint],
4692    mut d: SaSint,
4693    cache: &mut [ThreadCache],
4694    omp_block_start: FastSint,
4695    omp_block_size: FastSint,
4696) -> SaSint {
4697    if omp_block_size <= 0 {
4698        return d;
4699    }
4700
4701    let k_usize = usize::try_from(k).expect("k must be non-negative");
4702    let (distinct_names, tail) = buckets.split_at_mut(2 * k_usize);
4703    let induction_bucket = &mut tail[k_usize..2 * k_usize];
4704
4705    let size = usize::try_from(omp_block_size).expect("omp_block_size must be non-negative");
4706    let mut i = size;
4707    while i > 0 {
4708        i -= 1;
4709
4710        let v = cache[i].symbol;
4711        if v >= 0 {
4712            let p = cache[i].index;
4713            d += p >> (SUFFIX_GROUP_BIT - 1);
4714            let bucket_index = usize::try_from(v >> 1).expect("bucket symbol must be non-negative");
4715            induction_bucket[bucket_index] -= 1;
4716            let target = induction_bucket[bucket_index];
4717            cache[i].symbol = target;
4718            cache[i].index = (p - 1)
4719                | ((v & 1) << (SAINT_BIT - 1))
4720                | (((distinct_names
4721                    [usize::try_from(v).expect("bucket symbol must be non-negative")]
4722                    != d) as SaSint)
4723                    << (SUFFIX_GROUP_BIT - 1));
4724            distinct_names[usize::try_from(v).expect("bucket symbol must be non-negative")] = d;
4725
4726            if target >= omp_block_start as SaSint
4727                && target < (omp_block_start + omp_block_size) as SaSint
4728            {
4729                let ni = usize::try_from(target - omp_block_start as SaSint)
4730                    .expect("cache slot must be non-negative");
4731                let mut np = cache[i].index;
4732                if np > 0 {
4733                    cache[i].index = 0;
4734                    cache[ni].index = np;
4735                    np &= !SUFFIX_GROUP_MARKER;
4736                    let np_usize = usize::try_from(np).expect("suffix index must be non-negative");
4737                    cache[ni].symbol = buckets_index2(
4738                        usize::try_from(t[np_usize - 1])
4739                            .expect("bucket symbol must be non-negative"),
4740                        usize::from(t[np_usize - 2] > t[np_usize - 1]),
4741                    ) as SaSint;
4742                }
4743            }
4744        }
4745    }
4746
4747    d
4748}
4749
4750/// Internal helper: partial sorting scan right to left 32s 1k block sort.
4751#[doc(hidden)]
4752pub fn partial_sorting_scan_right_to_left_32s_1k_block_sort(
4753    t: &[SaSint],
4754    induction_bucket: &mut [SaSint],
4755    cache: &mut [ThreadCache],
4756    omp_block_start: FastSint,
4757    omp_block_size: FastSint,
4758) {
4759    if omp_block_size <= 0 {
4760        return;
4761    }
4762    let size = usize::try_from(omp_block_size).expect("omp_block_size must be non-negative");
4763    let mut offset = size;
4764
4765    while offset > 0 {
4766        offset -= 1;
4767        let v = cache[offset].symbol;
4768        if v >= 0 {
4769            let bucket_index = v as usize;
4770            induction_bucket[bucket_index] -= 1;
4771            let target = induction_bucket[bucket_index];
4772            cache[offset].symbol = target;
4773            let block_end = omp_block_start as SaSint + omp_block_size as SaSint;
4774            if target >= omp_block_start as SaSint && target < block_end {
4775                let ni = usize::try_from(target - omp_block_start as SaSint)
4776                    .expect("cache slot must be non-negative");
4777                let np = cache[offset].index;
4778                if np > 0 {
4779                    cache[offset].index = 0;
4780                    cache[ni].index = (np - 1)
4781                        | ((usize::from(t[np as usize - 2] > t[np as usize - 1]) as SaSint)
4782                            << (SAINT_BIT - 1));
4783                    cache[ni].symbol = t[np as usize - 1];
4784                }
4785            }
4786        }
4787    }
4788}
4789
4790/// Internal helper: partial sorting scan right to left 32s 6k block (OpenMP variant).
4791#[doc(hidden)]
4792pub fn partial_sorting_scan_right_to_left_32s_6k_block_omp(
4793    t: &[SaSint],
4794    sa: &mut [SaSint],
4795    buckets: &mut [SaSint],
4796    mut d: SaSint,
4797    cache: &mut [ThreadCache],
4798    block_start: FastSint,
4799    block_size: FastSint,
4800    threads: SaSint,
4801) -> SaSint {
4802    if block_size <= 0 {
4803        return d;
4804    }
4805    if threads == 1 || block_size < 16_384 {
4806        return partial_sorting_scan_right_to_left_32s_6k(
4807            t,
4808            sa,
4809            buckets,
4810            d,
4811            block_start,
4812            block_size,
4813        );
4814    }
4815
4816    let threads_usize = usize::try_from(threads)
4817        .expect("threads must be non-negative")
4818        .max(1);
4819    let block_size_usize = usize::try_from(block_size).expect("block_size must be non-negative");
4820    let omp_num_threads = threads_usize.min(block_size_usize.max(1));
4821    let omp_block_stride = (block_size_usize / omp_num_threads) & !15usize;
4822
4823    for omp_thread_num in 0..omp_num_threads {
4824        let omp_block_size = if omp_thread_num + 1 < omp_num_threads {
4825            omp_block_stride
4826        } else {
4827            block_size_usize - omp_thread_num * omp_block_stride
4828        };
4829        let omp_block_start = usize::try_from(block_start)
4830            .expect("block_start must be non-negative")
4831            + omp_thread_num * omp_block_stride;
4832        if omp_block_size > 0 {
4833            partial_sorting_scan_right_to_left_32s_6k_block_gather(
4834                t,
4835                sa,
4836                &mut cache[omp_thread_num * omp_block_stride
4837                    ..omp_thread_num * omp_block_stride + omp_block_size],
4838                omp_block_start as FastSint,
4839                omp_block_size as FastSint,
4840            );
4841        }
4842    }
4843
4844    d = partial_sorting_scan_right_to_left_32s_6k_block_sort(
4845        t,
4846        buckets,
4847        d,
4848        &mut cache[..block_size_usize],
4849        block_start,
4850        block_size,
4851    );
4852
4853    for omp_thread_num in 0..omp_num_threads {
4854        let omp_block_size = if omp_thread_num + 1 < omp_num_threads {
4855            omp_block_stride
4856        } else {
4857            block_size_usize - omp_thread_num * omp_block_stride
4858        };
4859        let cache_start = omp_thread_num * omp_block_stride;
4860        if omp_block_size > 0 {
4861            place_cached_suffixes(sa, &cache[cache_start..], 0, omp_block_size as FastSint);
4862        }
4863    }
4864
4865    d
4866}
4867
4868/// Internal helper: partial sorting scan right to left 32s 4k block (OpenMP variant).
4869#[doc(hidden)]
4870pub fn partial_sorting_scan_right_to_left_32s_4k_block_omp(
4871    t: &[SaSint],
4872    sa: &mut [SaSint],
4873    k: SaSint,
4874    buckets: &mut [SaSint],
4875    mut d: SaSint,
4876    cache: &mut [ThreadCache],
4877    block_start: FastSint,
4878    block_size: FastSint,
4879    threads: SaSint,
4880) -> SaSint {
4881    if block_size <= 0 {
4882        return d;
4883    }
4884    if threads == 1 || block_size < 16_384 {
4885        return partial_sorting_scan_right_to_left_32s_4k(
4886            t,
4887            sa,
4888            k,
4889            buckets,
4890            d,
4891            block_start,
4892            block_size,
4893        );
4894    }
4895
4896    let threads_usize = usize::try_from(threads)
4897        .expect("threads must be non-negative")
4898        .max(1);
4899    let block_size_usize = usize::try_from(block_size).expect("block_size must be non-negative");
4900    let omp_num_threads = threads_usize.min(block_size_usize.max(1));
4901    let omp_block_stride = (block_size_usize / omp_num_threads) & !15usize;
4902
4903    for omp_thread_num in 0..omp_num_threads {
4904        let omp_block_size = if omp_thread_num + 1 < omp_num_threads {
4905            omp_block_stride
4906        } else {
4907            block_size_usize - omp_thread_num * omp_block_stride
4908        };
4909        let omp_block_start = usize::try_from(block_start)
4910            .expect("block_start must be non-negative")
4911            + omp_thread_num * omp_block_stride;
4912        if omp_block_size > 0 {
4913            partial_sorting_scan_right_to_left_32s_4k_block_gather(
4914                t,
4915                sa,
4916                &mut cache[omp_thread_num * omp_block_stride
4917                    ..omp_thread_num * omp_block_stride + omp_block_size],
4918                omp_block_start as FastSint,
4919                omp_block_size as FastSint,
4920            );
4921        }
4922    }
4923
4924    d = partial_sorting_scan_right_to_left_32s_4k_block_sort(
4925        t,
4926        k,
4927        buckets,
4928        d,
4929        &mut cache[..block_size_usize],
4930        block_start,
4931        block_size,
4932    );
4933
4934    for omp_thread_num in 0..omp_num_threads {
4935        let omp_block_start = omp_thread_num * omp_block_stride;
4936        let omp_block_size = if omp_thread_num + 1 < omp_num_threads {
4937            omp_block_stride
4938        } else {
4939            block_size_usize - omp_block_start
4940        };
4941        if omp_block_size > 0 {
4942            compact_and_place_cached_suffixes(
4943                sa,
4944                &mut cache[omp_block_start..],
4945                0,
4946                omp_block_size as FastSint,
4947            );
4948        }
4949    }
4950
4951    d
4952}
4953
4954/// Internal helper: partial sorting scan right to left 32s 1k block (OpenMP variant).
4955#[doc(hidden)]
4956pub fn partial_sorting_scan_right_to_left_32s_1k_block_omp(
4957    t: &[SaSint],
4958    sa: &mut [SaSint],
4959    buckets: &mut [SaSint],
4960    cache: &mut [ThreadCache],
4961    block_start: FastSint,
4962    block_size: FastSint,
4963    threads: SaSint,
4964) {
4965    if block_size <= 0 {
4966        return;
4967    }
4968    if threads == 1 || block_size < 16_384 {
4969        partial_sorting_scan_right_to_left_32s_1k(t, sa, buckets, block_start, block_size);
4970        return;
4971    }
4972
4973    let threads_usize = usize::try_from(threads)
4974        .expect("threads must be non-negative")
4975        .max(1);
4976    let block_size_usize = usize::try_from(block_size).expect("block_size must be non-negative");
4977    let block_start_usize = usize::try_from(block_start).expect("block_start must be non-negative");
4978    let omp_num_threads = threads_usize.min(block_size_usize.max(1));
4979    let omp_block_stride = (block_size_usize / omp_num_threads) & !15usize;
4980
4981    for omp_thread_num in 0..omp_num_threads {
4982        let omp_block_size = if omp_thread_num + 1 < omp_num_threads {
4983            omp_block_stride
4984        } else {
4985            block_size_usize - omp_thread_num * omp_block_stride
4986        };
4987        let omp_block_start = block_start_usize + omp_thread_num * omp_block_stride;
4988        if omp_block_size > 0 {
4989            partial_sorting_scan_right_to_left_32s_1k_block_gather(
4990                t,
4991                sa,
4992                &mut cache[omp_thread_num * omp_block_stride
4993                    ..omp_thread_num * omp_block_stride + omp_block_size],
4994                omp_block_start as FastSint,
4995                omp_block_size as FastSint,
4996            );
4997        }
4998    }
4999
5000    let cache = &mut cache[..block_size_usize];
5001    partial_sorting_scan_right_to_left_32s_1k_block_sort(
5002        t,
5003        buckets,
5004        cache,
5005        block_start,
5006        block_size,
5007    );
5008    for omp_thread_num in 0..omp_num_threads {
5009        let omp_block_start = omp_thread_num * omp_block_stride;
5010        let omp_block_size = if omp_thread_num + 1 < omp_num_threads {
5011            omp_block_stride
5012        } else {
5013            block_size_usize - omp_block_start
5014        };
5015        if omp_block_size > 0 {
5016            compact_and_place_cached_suffixes(
5017                sa,
5018                &mut cache[omp_block_start..],
5019                0,
5020                omp_block_size as FastSint,
5021            );
5022        }
5023    }
5024}
5025
5026/// Internal helper: partial sorting scan left to right 32s 6k block gather.
5027#[doc(hidden)]
5028pub fn partial_sorting_scan_left_to_right_32s_6k_block_gather(
5029    t: &[SaSint],
5030    sa: &mut [SaSint],
5031    cache: &mut [ThreadCache],
5032    omp_block_start: FastSint,
5033    omp_block_size: FastSint,
5034) {
5035    if omp_block_size <= 0 {
5036        return;
5037    }
5038
5039    let start = usize::try_from(omp_block_start).expect("omp_block_start must be non-negative");
5040    let size = usize::try_from(omp_block_size).expect("omp_block_size must be non-negative");
5041    for offset in 0..size {
5042        let i = start + offset;
5043        let p = sa[i];
5044        cache[offset].index = p;
5045        let q = p & SAINT_MAX;
5046        cache[offset].symbol = if q != 0 {
5047            buckets_index4(
5048                usize::try_from(t[q as usize - 1]).expect("bucket symbol must be non-negative"),
5049                usize::from(t[q as usize - 2] >= t[q as usize - 1]),
5050            ) as SaSint
5051        } else {
5052            0
5053        };
5054    }
5055}
5056
5057/// Internal helper: partial sorting scan left to right 32s 4k block gather.
5058#[doc(hidden)]
5059pub fn partial_sorting_scan_left_to_right_32s_4k_block_gather(
5060    t: &[SaSint],
5061    sa: &mut [SaSint],
5062    cache: &mut [ThreadCache],
5063    omp_block_start: FastSint,
5064    omp_block_size: FastSint,
5065) {
5066    if omp_block_size <= 0 {
5067        return;
5068    }
5069
5070    let start = usize::try_from(omp_block_start).expect("omp_block_start must be non-negative");
5071    let size = usize::try_from(omp_block_size).expect("omp_block_size must be non-negative");
5072    for offset in 0..size {
5073        let i = start + offset;
5074        let mut symbol = SAINT_MIN;
5075        let mut p = sa[i];
5076        if p > 0 {
5077            cache[offset].index = p;
5078            p &= !SUFFIX_GROUP_MARKER;
5079            let p_usize = usize::try_from(p).expect("suffix index must be non-negative");
5080            symbol = buckets_index2(
5081                usize::try_from(t[p_usize - 1]).expect("bucket symbol must be non-negative"),
5082                usize::from(t[p_usize - 2] < t[p_usize - 1]),
5083            ) as SaSint;
5084            p = 0;
5085        }
5086        cache[offset].symbol = symbol;
5087        sa[i] = p & SAINT_MAX;
5088    }
5089}
5090
5091/// Internal helper: partial sorting scan left to right 32s 1k block gather.
5092#[doc(hidden)]
5093pub fn partial_sorting_scan_left_to_right_32s_1k_block_gather(
5094    t: &[SaSint],
5095    sa: &mut [SaSint],
5096    cache: &mut [ThreadCache],
5097    omp_block_start: FastSint,
5098    omp_block_size: FastSint,
5099) {
5100    if omp_block_size <= 0 {
5101        return;
5102    }
5103    let start = usize::try_from(omp_block_start).expect("omp_block_start must be non-negative");
5104    let size = usize::try_from(omp_block_size).expect("omp_block_size must be non-negative");
5105    for offset in 0..size {
5106        let i = start + offset;
5107        let mut symbol = SAINT_MIN;
5108        let mut p = sa[i];
5109        if p > 0 {
5110            cache[offset].index = (p - 1)
5111                | ((usize::from(t[p as usize - 2] < t[p as usize - 1]) as SaSint)
5112                    << (SAINT_BIT - 1));
5113            symbol = t[p as usize - 1];
5114            p = 0;
5115        }
5116        cache[offset].symbol = symbol;
5117        sa[i] = p & SAINT_MAX;
5118    }
5119}
5120
5121/// Internal helper: partial sorting scan left to right 32s 6k block sort.
5122#[doc(hidden)]
5123pub fn partial_sorting_scan_left_to_right_32s_6k_block_sort(
5124    t: &[SaSint],
5125    buckets: &mut [SaSint],
5126    mut d: SaSint,
5127    cache: &mut [ThreadCache],
5128    omp_block_start: FastSint,
5129    omp_block_size: FastSint,
5130) -> SaSint {
5131    if omp_block_size <= 0 {
5132        return d;
5133    }
5134
5135    let start = usize::try_from(omp_block_start).expect("omp_block_start must be non-negative");
5136    let block_end =
5137        start + usize::try_from(omp_block_size).expect("omp_block_size must be non-negative");
5138
5139    let mut i = start;
5140    let mut j = block_end.saturating_sub(65);
5141    while i < j {
5142        let cache_i0 = i - start;
5143        let cache_i1 = cache_i0 + 1;
5144
5145        let v0 =
5146            usize::try_from(cache[cache_i0].symbol).expect("cache symbol must be non-negative");
5147        let p0 = cache[cache_i0].index;
5148        d += SaSint::from(p0 < 0);
5149        cache[cache_i0].symbol = buckets[v0];
5150        buckets[v0] += 1;
5151        cache[cache_i0].index =
5152            (p0 - 1) | ((SaSint::from(buckets[2 + v0] != d)) << (SAINT_BIT - 1));
5153        buckets[2 + v0] = d;
5154        if cache[cache_i0].symbol >= omp_block_start as SaSint
5155            && cache[cache_i0].symbol < block_end as SaSint
5156        {
5157            let s = usize::try_from(cache[cache_i0].symbol - omp_block_start as SaSint)
5158                .expect("cache slot must be non-negative");
5159            let q = cache[cache_i0].index & SAINT_MAX;
5160            cache[s].index = cache[cache_i0].index;
5161            let q_usize = usize::try_from(q).expect("suffix index must be non-negative");
5162            cache[s].symbol = buckets_index4(
5163                usize::try_from(t[q_usize - 1]).expect("bucket symbol must be non-negative"),
5164                usize::from(t[q_usize - 2] >= t[q_usize - 1]),
5165            ) as SaSint;
5166        }
5167
5168        let v1 =
5169            usize::try_from(cache[cache_i1].symbol).expect("cache symbol must be non-negative");
5170        let p1 = cache[cache_i1].index;
5171        d += SaSint::from(p1 < 0);
5172        cache[cache_i1].symbol = buckets[v1];
5173        buckets[v1] += 1;
5174        cache[cache_i1].index =
5175            (p1 - 1) | ((SaSint::from(buckets[2 + v1] != d)) << (SAINT_BIT - 1));
5176        buckets[2 + v1] = d;
5177        if cache[cache_i1].symbol >= omp_block_start as SaSint
5178            && cache[cache_i1].symbol < block_end as SaSint
5179        {
5180            let s = usize::try_from(cache[cache_i1].symbol - omp_block_start as SaSint)
5181                .expect("cache slot must be non-negative");
5182            let q = cache[cache_i1].index & SAINT_MAX;
5183            cache[s].index = cache[cache_i1].index;
5184            let q_usize = usize::try_from(q).expect("suffix index must be non-negative");
5185            cache[s].symbol = buckets_index4(
5186                usize::try_from(t[q_usize - 1]).expect("bucket symbol must be non-negative"),
5187                usize::from(t[q_usize - 2] >= t[q_usize - 1]),
5188            ) as SaSint;
5189        }
5190
5191        i += 2;
5192    }
5193
5194    j += 65;
5195    while i < j {
5196        let cache_i = i - start;
5197        let v = usize::try_from(cache[cache_i].symbol).expect("cache symbol must be non-negative");
5198        let p = cache[cache_i].index;
5199        d += SaSint::from(p < 0);
5200        cache[cache_i].symbol = buckets[v];
5201        buckets[v] += 1;
5202        cache[cache_i].index = (p - 1) | ((SaSint::from(buckets[2 + v] != d)) << (SAINT_BIT - 1));
5203        buckets[2 + v] = d;
5204        if cache[cache_i].symbol >= omp_block_start as SaSint
5205            && cache[cache_i].symbol < block_end as SaSint
5206        {
5207            let s = usize::try_from(cache[cache_i].symbol - omp_block_start as SaSint)
5208                .expect("cache slot must be non-negative");
5209            let q = cache[cache_i].index & SAINT_MAX;
5210            cache[s].index = cache[cache_i].index;
5211            let q_usize = usize::try_from(q).expect("suffix index must be non-negative");
5212            cache[s].symbol = buckets_index4(
5213                usize::try_from(t[q_usize - 1]).expect("bucket symbol must be non-negative"),
5214                usize::from(t[q_usize - 2] >= t[q_usize - 1]),
5215            ) as SaSint;
5216        }
5217        i += 1;
5218    }
5219
5220    d
5221}
5222
5223/// Internal helper: partial sorting scan left to right 32s 4k block sort.
5224#[doc(hidden)]
5225pub fn partial_sorting_scan_left_to_right_32s_4k_block_sort(
5226    t: &[SaSint],
5227    k: SaSint,
5228    buckets: &mut [SaSint],
5229    mut d: SaSint,
5230    cache: &mut [ThreadCache],
5231    omp_block_start: FastSint,
5232    omp_block_size: FastSint,
5233) -> SaSint {
5234    if omp_block_size <= 0 {
5235        return d;
5236    }
5237
5238    let k_usize = usize::try_from(k).expect("k must be non-negative");
5239    let (distinct_names, tail) = buckets.split_at_mut(2 * k_usize);
5240    let induction_bucket = &mut tail[..k_usize];
5241
5242    let start = usize::try_from(omp_block_start).expect("omp_block_start must be non-negative");
5243    let size = usize::try_from(omp_block_size).expect("omp_block_size must be non-negative");
5244    let block_end = start + size;
5245
5246    for offset in 0..size {
5247        let v = cache[offset].symbol;
5248        if v >= 0 {
5249            let p = cache[offset].index;
5250            d += p >> (SUFFIX_GROUP_BIT - 1);
5251
5252            let bucket_index = usize::try_from(v >> 1).expect("bucket index must be non-negative");
5253            let v_usize = usize::try_from(v).expect("cache symbol must be non-negative");
5254            let target = induction_bucket[bucket_index];
5255            induction_bucket[bucket_index] += 1;
5256
5257            cache[offset].symbol = target;
5258            cache[offset].index = (p - 1)
5259                | ((v & 1) << (SAINT_BIT - 1))
5260                | (((distinct_names[v_usize] != d) as SaSint) << (SUFFIX_GROUP_BIT - 1));
5261            distinct_names[v_usize] = d;
5262
5263            if target >= omp_block_start as SaSint && target < block_end as SaSint {
5264                let ni = usize::try_from(target - omp_block_start as SaSint)
5265                    .expect("cache slot must be non-negative");
5266                let mut np = cache[offset].index;
5267                if np > 0 {
5268                    cache[ni].index = np;
5269                    np &= !SUFFIX_GROUP_MARKER;
5270                    let np_usize = usize::try_from(np).expect("suffix index must be non-negative");
5271                    cache[ni].symbol = buckets_index2(
5272                        usize::try_from(t[np_usize - 1])
5273                            .expect("bucket symbol must be non-negative"),
5274                        usize::from(t[np_usize - 2] < t[np_usize - 1]),
5275                    ) as SaSint;
5276                    np = 0;
5277                }
5278                cache[offset].index = np & SAINT_MAX;
5279            }
5280        }
5281    }
5282
5283    d
5284}
5285
5286/// Internal helper: partial sorting scan left to right 32s 1k block sort.
5287#[doc(hidden)]
5288pub fn partial_sorting_scan_left_to_right_32s_1k_block_sort(
5289    t: &[SaSint],
5290    induction_bucket: &mut [SaSint],
5291    cache: &mut [ThreadCache],
5292    omp_block_start: FastSint,
5293    omp_block_size: FastSint,
5294) {
5295    if omp_block_size <= 0 {
5296        return;
5297    }
5298    let start = usize::try_from(omp_block_start).expect("omp_block_start must be non-negative");
5299    let size = usize::try_from(omp_block_size).expect("omp_block_size must be non-negative");
5300    let block_end = start + size;
5301
5302    for offset in 0..size {
5303        let v = cache[offset].symbol;
5304        if v >= 0 {
5305            let v_usize = v as usize;
5306            let target = induction_bucket[v_usize];
5307            cache[offset].symbol = target;
5308            induction_bucket[v_usize] += 1;
5309            if target >= omp_block_start as SaSint && target < block_end as SaSint {
5310                let ni = usize::try_from(target - omp_block_start as SaSint)
5311                    .expect("cache slot must be non-negative");
5312                let mut np = cache[offset].index;
5313                if np > 0 {
5314                    cache[ni].index = (np - 1)
5315                        | ((usize::from(t[np as usize - 2] < t[np as usize - 1]) as SaSint)
5316                            << (SAINT_BIT - 1));
5317                    cache[ni].symbol = t[np as usize - 1];
5318                    np = 0;
5319                }
5320                cache[offset].index = np & SAINT_MAX;
5321            }
5322        }
5323    }
5324}
5325
5326/// Internal helper: partial sorting scan left to right 32s 6k block (OpenMP variant).
5327#[doc(hidden)]
5328pub fn partial_sorting_scan_left_to_right_32s_6k_block_omp(
5329    t: &[SaSint],
5330    sa: &mut [SaSint],
5331    buckets: &mut [SaSint],
5332    d: SaSint,
5333    cache: &mut [ThreadCache],
5334    block_start: FastSint,
5335    block_size: FastSint,
5336    threads: SaSint,
5337) -> SaSint {
5338    if block_size <= 0 {
5339        return d;
5340    }
5341    if threads == 1 || block_size < 16_384 {
5342        return partial_sorting_scan_left_to_right_32s_6k(
5343            t,
5344            sa,
5345            buckets,
5346            d,
5347            block_start,
5348            block_size,
5349        );
5350    }
5351
5352    let threads_usize = usize::try_from(threads)
5353        .expect("threads must be non-negative")
5354        .max(1);
5355    let block_size_usize = usize::try_from(block_size).expect("block_size must be non-negative");
5356    let block_start_usize = usize::try_from(block_start).expect("block_start must be non-negative");
5357    let omp_num_threads = threads_usize.min(block_size_usize.max(1));
5358    let omp_block_stride = (block_size_usize / omp_num_threads) & !15usize;
5359
5360    for omp_thread_num in 0..omp_num_threads {
5361        let omp_block_size = if omp_thread_num + 1 < omp_num_threads {
5362            omp_block_stride
5363        } else {
5364            block_size_usize - omp_thread_num * omp_block_stride
5365        };
5366        let omp_block_start = block_start_usize + omp_thread_num * omp_block_stride;
5367        if omp_block_size > 0 {
5368            partial_sorting_scan_left_to_right_32s_6k_block_gather(
5369                t,
5370                sa,
5371                &mut cache[omp_thread_num * omp_block_stride
5372                    ..omp_thread_num * omp_block_stride + omp_block_size],
5373                omp_block_start as FastSint,
5374                omp_block_size as FastSint,
5375            );
5376        }
5377    }
5378
5379    let d = partial_sorting_scan_left_to_right_32s_6k_block_sort(
5380        t,
5381        buckets,
5382        d,
5383        &mut cache[..block_size_usize],
5384        block_start,
5385        block_size,
5386    );
5387
5388    for omp_thread_num in 0..omp_num_threads {
5389        let omp_block_size = if omp_thread_num + 1 < omp_num_threads {
5390            omp_block_stride
5391        } else {
5392            block_size_usize - omp_thread_num * omp_block_stride
5393        };
5394        if omp_block_size > 0 {
5395            place_cached_suffixes(
5396                sa,
5397                &cache[omp_thread_num * omp_block_stride..],
5398                0,
5399                omp_block_size as FastSint,
5400            );
5401        }
5402    }
5403    d
5404}
5405
5406/// Internal helper: partial sorting scan left to right 32s 4k block (OpenMP variant).
5407#[doc(hidden)]
5408pub fn partial_sorting_scan_left_to_right_32s_4k_block_omp(
5409    t: &[SaSint],
5410    sa: &mut [SaSint],
5411    k: SaSint,
5412    buckets: &mut [SaSint],
5413    d: SaSint,
5414    cache: &mut [ThreadCache],
5415    block_start: FastSint,
5416    block_size: FastSint,
5417    threads: SaSint,
5418) -> SaSint {
5419    if block_size <= 0 {
5420        return d;
5421    }
5422    if threads == 1 || block_size < 16_384 {
5423        return partial_sorting_scan_left_to_right_32s_4k(
5424            t,
5425            sa,
5426            k,
5427            buckets,
5428            d,
5429            block_start,
5430            block_size,
5431        );
5432    }
5433
5434    let threads_usize = usize::try_from(threads)
5435        .expect("threads must be non-negative")
5436        .max(1);
5437    let block_size_usize = usize::try_from(block_size).expect("block_size must be non-negative");
5438    let block_start_usize = usize::try_from(block_start).expect("block_start must be non-negative");
5439    let omp_num_threads = threads_usize.min(block_size_usize.max(1));
5440    let omp_block_stride = (block_size_usize / omp_num_threads) & !15usize;
5441
5442    for omp_thread_num in 0..omp_num_threads {
5443        let omp_block_size = if omp_thread_num + 1 < omp_num_threads {
5444            omp_block_stride
5445        } else {
5446            block_size_usize - omp_thread_num * omp_block_stride
5447        };
5448        let omp_block_start = block_start_usize + omp_thread_num * omp_block_stride;
5449        if omp_block_size > 0 {
5450            partial_sorting_scan_left_to_right_32s_4k_block_gather(
5451                t,
5452                sa,
5453                &mut cache[omp_thread_num * omp_block_stride
5454                    ..omp_thread_num * omp_block_stride + omp_block_size],
5455                omp_block_start as FastSint,
5456                omp_block_size as FastSint,
5457            );
5458        }
5459    }
5460
5461    let cache = &mut cache[..block_size_usize];
5462    let d = partial_sorting_scan_left_to_right_32s_4k_block_sort(
5463        t,
5464        k,
5465        buckets,
5466        d,
5467        cache,
5468        block_start,
5469        block_size,
5470    );
5471
5472    for omp_thread_num in 0..omp_num_threads {
5473        let omp_block_start = omp_thread_num * omp_block_stride;
5474        let omp_block_size = if omp_thread_num + 1 < omp_num_threads {
5475            omp_block_stride
5476        } else {
5477            block_size_usize - omp_block_start
5478        };
5479        if omp_block_size > 0 {
5480            compact_and_place_cached_suffixes(
5481                sa,
5482                &mut cache[omp_block_start..],
5483                0,
5484                omp_block_size as FastSint,
5485            );
5486        }
5487    }
5488
5489    d
5490}
5491
5492/// Internal helper: partial sorting scan left to right 32s 1k block (OpenMP variant).
5493#[doc(hidden)]
5494pub fn partial_sorting_scan_left_to_right_32s_1k_block_omp(
5495    t: &[SaSint],
5496    sa: &mut [SaSint],
5497    buckets: &mut [SaSint],
5498    cache: &mut [ThreadCache],
5499    block_start: FastSint,
5500    block_size: FastSint,
5501    threads: SaSint,
5502) {
5503    if block_size <= 0 {
5504        return;
5505    }
5506    if threads == 1 || block_size < 16_384 {
5507        partial_sorting_scan_left_to_right_32s_1k(t, sa, buckets, block_start, block_size);
5508        return;
5509    }
5510
5511    let threads_usize = usize::try_from(threads)
5512        .expect("threads must be non-negative")
5513        .max(1);
5514    let block_size_usize = usize::try_from(block_size).expect("block_size must be non-negative");
5515    let block_start_usize = usize::try_from(block_start).expect("block_start must be non-negative");
5516    let omp_num_threads = threads_usize.min(block_size_usize.max(1));
5517    let omp_block_stride = (block_size_usize / omp_num_threads) & !15usize;
5518
5519    for omp_thread_num in 0..omp_num_threads {
5520        let omp_block_size = if omp_thread_num + 1 < omp_num_threads {
5521            omp_block_stride
5522        } else {
5523            block_size_usize - omp_thread_num * omp_block_stride
5524        };
5525        let omp_block_start = block_start_usize + omp_thread_num * omp_block_stride;
5526        if omp_block_size > 0 {
5527            partial_sorting_scan_left_to_right_32s_1k_block_gather(
5528                t,
5529                sa,
5530                &mut cache[omp_thread_num * omp_block_stride
5531                    ..omp_thread_num * omp_block_stride + omp_block_size],
5532                omp_block_start as FastSint,
5533                omp_block_size as FastSint,
5534            );
5535        }
5536    }
5537
5538    let cache = &mut cache[..block_size_usize];
5539    partial_sorting_scan_left_to_right_32s_1k_block_sort(
5540        t,
5541        buckets,
5542        cache,
5543        block_start,
5544        block_size,
5545    );
5546    for omp_thread_num in 0..omp_num_threads {
5547        let omp_block_start = omp_thread_num * omp_block_stride;
5548        let omp_block_size = if omp_thread_num + 1 < omp_num_threads {
5549            omp_block_stride
5550        } else {
5551            block_size_usize - omp_block_start
5552        };
5553        if omp_block_size > 0 {
5554            compact_and_place_cached_suffixes(
5555                sa,
5556                &mut cache[omp_block_start..],
5557                0,
5558                omp_block_size as FastSint,
5559            );
5560        }
5561    }
5562}
5563
5564/// Internal helper: partial sorting scan right to left 32s 6k (OpenMP variant).
5565#[doc(hidden)]
5566pub fn partial_sorting_scan_right_to_left_32s_6k_omp(
5567    t: &[SaSint],
5568    sa: &mut [SaSint],
5569    n: SaSint,
5570    buckets: &mut [SaSint],
5571    first_lms_suffix: SaSint,
5572    left_suffixes_count: SaSint,
5573    mut d: SaSint,
5574    threads: SaSint,
5575    thread_state: &mut [ThreadState],
5576) -> SaSint {
5577    let scan_start = left_suffixes_count as FastSint + 1;
5578    let scan_end = n as FastSint - first_lms_suffix as FastSint;
5579    if threads == 1 || (scan_end - scan_start) < 65_536 {
5580        return partial_sorting_scan_right_to_left_32s_6k(
5581            t,
5582            sa,
5583            buckets,
5584            d,
5585            scan_start,
5586            scan_end - scan_start,
5587        );
5588    }
5589    if thread_state.is_empty() {
5590        return partial_sorting_scan_right_to_left_32s_6k(
5591            t,
5592            sa,
5593            buckets,
5594            d,
5595            scan_start,
5596            scan_end - scan_start,
5597        );
5598    }
5599
5600    let threads_usize = usize::try_from(threads)
5601        .expect("threads must be non-negative")
5602        .max(1);
5603    let mut cache = vec![ThreadCache::default(); threads_usize * LIBSAIS_PER_THREAD_CACHE_SIZE];
5604    let mut block_start = scan_end - 1;
5605    let block_span = FastSint::try_from(threads_usize * LIBSAIS_PER_THREAD_CACHE_SIZE)
5606        .expect("block span must fit FastSint");
5607    while block_start >= scan_start {
5608        let mut block_end = block_start - block_span;
5609        if block_end < scan_start {
5610            block_end = scan_start - 1;
5611        }
5612
5613        d = partial_sorting_scan_right_to_left_32s_6k_block_omp(
5614            t,
5615            sa,
5616            buckets,
5617            d,
5618            &mut cache,
5619            block_end + 1,
5620            block_start - block_end,
5621            threads,
5622        );
5623
5624        if block_end < scan_start {
5625            break;
5626        }
5627        block_start = block_end;
5628    }
5629
5630    d
5631}
5632
5633/// Internal helper: partial sorting scan right to left 32s 4k (OpenMP variant).
5634#[doc(hidden)]
5635pub fn partial_sorting_scan_right_to_left_32s_4k_omp(
5636    t: &[SaSint],
5637    sa: &mut [SaSint],
5638    n: SaSint,
5639    k: SaSint,
5640    buckets: &mut [SaSint],
5641    mut d: SaSint,
5642    threads: SaSint,
5643    thread_state: &mut [ThreadState],
5644) -> SaSint {
5645    if threads == 1 || n < 65_536 {
5646        return partial_sorting_scan_right_to_left_32s_4k(t, sa, k, buckets, d, 0, n as FastSint);
5647    }
5648    if thread_state.is_empty() {
5649        return partial_sorting_scan_right_to_left_32s_4k(t, sa, k, buckets, d, 0, n as FastSint);
5650    }
5651    let threads_usize = usize::try_from(threads)
5652        .expect("threads must be non-negative")
5653        .max(1);
5654    let mut cache = vec![ThreadCache::default(); threads_usize * LIBSAIS_PER_THREAD_CACHE_SIZE];
5655    let mut block_start = FastSint::try_from(n).expect("n must fit FastSint") - 1;
5656    let block_span = FastSint::try_from(threads_usize * LIBSAIS_PER_THREAD_CACHE_SIZE)
5657        .expect("block span must fit FastSint");
5658    while block_start >= 0 {
5659        let mut block_end = block_start - block_span;
5660        if block_end < 0 {
5661            block_end = -1;
5662        }
5663
5664        d = partial_sorting_scan_right_to_left_32s_4k_block_omp(
5665            t,
5666            sa,
5667            k,
5668            buckets,
5669            d,
5670            &mut cache,
5671            block_end + 1,
5672            block_start - block_end,
5673            threads,
5674        );
5675
5676        if block_end < 0 {
5677            break;
5678        }
5679        block_start = block_end;
5680    }
5681
5682    d
5683}
5684
5685/// Internal helper: partial sorting scan right to left 32s 1k (OpenMP variant).
5686#[doc(hidden)]
5687pub fn partial_sorting_scan_right_to_left_32s_1k_omp(
5688    t: &[SaSint],
5689    sa: &mut [SaSint],
5690    n: SaSint,
5691    buckets: &mut [SaSint],
5692    threads: SaSint,
5693    thread_state: &mut [ThreadState],
5694) {
5695    if threads == 1 || n < 65_536 {
5696        partial_sorting_scan_right_to_left_32s_1k(t, sa, buckets, 0, n as FastSint);
5697        return;
5698    }
5699    if thread_state.is_empty() {
5700        partial_sorting_scan_right_to_left_32s_1k(t, sa, buckets, 0, n as FastSint);
5701        return;
5702    }
5703
5704    let threads_usize = usize::try_from(threads)
5705        .expect("threads must be non-negative")
5706        .max(1);
5707    let mut cache = vec![ThreadCache::default(); threads_usize * LIBSAIS_PER_THREAD_CACHE_SIZE];
5708    let mut block_start = FastSint::try_from(n).expect("n must fit FastSint") - 1;
5709    let block_span = FastSint::try_from(threads_usize * LIBSAIS_PER_THREAD_CACHE_SIZE)
5710        .expect("block span must fit FastSint");
5711    while block_start >= 0 {
5712        let mut block_end = block_start - block_span;
5713        if block_end < 0 {
5714            block_end = -1;
5715        }
5716
5717        partial_sorting_scan_right_to_left_32s_1k_block_omp(
5718            t,
5719            sa,
5720            buckets,
5721            &mut cache,
5722            block_end + 1,
5723            block_start - block_end,
5724            threads,
5725        );
5726
5727        if block_end < 0 {
5728            break;
5729        }
5730        block_start = block_end;
5731    }
5732}
5733
5734/// Internal helper: partial sorting gather lms suffixes 32s 4k.
5735#[doc(hidden)]
5736pub fn partial_sorting_gather_lms_suffixes_32s_4k(
5737    sa: &mut [SaSint],
5738    omp_block_start: FastSint,
5739    omp_block_size: FastSint,
5740) -> FastSint {
5741    if omp_block_size <= 0 {
5742        return omp_block_start;
5743    }
5744
5745    let start = usize::try_from(omp_block_start).expect("omp_block_start must be non-negative");
5746    let size = usize::try_from(omp_block_size).expect("omp_block_size must be non-negative");
5747    let mut l = start;
5748
5749    for i in start..start + size {
5750        let s = sa[i] as SaUint;
5751        sa[l] = ((s.wrapping_sub(SUFFIX_GROUP_MARKER as SaUint)) & !(SUFFIX_GROUP_MARKER as SaUint))
5752            as SaSint;
5753        l += usize::from((s as SaSint) < 0);
5754    }
5755
5756    l as FastSint
5757}
5758
5759/// Internal helper: partial sorting gather lms suffixes 32s 1k.
5760#[doc(hidden)]
5761pub fn partial_sorting_gather_lms_suffixes_32s_1k(
5762    sa: &mut [SaSint],
5763    omp_block_start: FastSint,
5764    omp_block_size: FastSint,
5765) -> FastSint {
5766    if omp_block_size <= 0 {
5767        return omp_block_start;
5768    }
5769
5770    let start = usize::try_from(omp_block_start).expect("omp_block_start must be non-negative");
5771    let size = usize::try_from(omp_block_size).expect("omp_block_size must be non-negative");
5772    let mut l = start;
5773
5774    for i in start..start + size {
5775        let s = sa[i];
5776        sa[l] = s & SAINT_MAX;
5777        l += usize::from(s < 0);
5778    }
5779
5780    l as FastSint
5781}
5782
5783/// Internal helper: partial sorting gather lms suffixes 32s 4k (OpenMP variant).
5784#[doc(hidden)]
5785pub fn partial_sorting_gather_lms_suffixes_32s_4k_omp(
5786    sa: &mut [SaSint],
5787    n: SaSint,
5788    threads: SaSint,
5789    thread_state: &mut [ThreadState],
5790) {
5791    let n_usize = usize::try_from(n).expect("n must be non-negative");
5792    let omp_num_threads = if threads > 1 && n >= 65_536 {
5793        usize::try_from(threads)
5794            .expect("threads must be non-negative")
5795            .min(thread_state.len())
5796            .max(1)
5797    } else {
5798        1
5799    };
5800
5801    if omp_num_threads == 1 {
5802        let _ = partial_sorting_gather_lms_suffixes_32s_4k(sa, 0, n as FastSint);
5803        return;
5804    }
5805
5806    let omp_block_stride = (n_usize / omp_num_threads) & !15usize;
5807    for (thread_num, state) in thread_state.iter_mut().take(omp_num_threads).enumerate() {
5808        let block_start = thread_num * omp_block_stride;
5809        let block_size = if thread_num + 1 < omp_num_threads {
5810            omp_block_stride
5811        } else {
5812            n_usize - block_start
5813        };
5814        state.position = block_start as FastSint;
5815        state.count = partial_sorting_gather_lms_suffixes_32s_4k(
5816            sa,
5817            block_start as FastSint,
5818            block_size as FastSint,
5819        ) - block_start as FastSint;
5820    }
5821
5822    let mut position = 0usize;
5823    for (thread_num, state) in thread_state.iter().take(omp_num_threads).enumerate() {
5824        let count = usize::try_from(state.count).expect("count must be non-negative");
5825        let src = usize::try_from(state.position).expect("position must be non-negative");
5826        if thread_num > 0 && count > 0 {
5827            sa.copy_within(src..src + count, position);
5828        }
5829        position += count;
5830    }
5831}
5832
5833/// Internal helper: partial sorting gather lms suffixes 32s 1k (OpenMP variant).
5834#[doc(hidden)]
5835pub fn partial_sorting_gather_lms_suffixes_32s_1k_omp(
5836    sa: &mut [SaSint],
5837    n: SaSint,
5838    threads: SaSint,
5839    thread_state: &mut [ThreadState],
5840) {
5841    let n_usize = usize::try_from(n).expect("n must be non-negative");
5842    let omp_num_threads = if threads > 1 && n >= 65_536 {
5843        usize::try_from(threads)
5844            .expect("threads must be non-negative")
5845            .min(thread_state.len())
5846            .max(1)
5847    } else {
5848        1
5849    };
5850
5851    if omp_num_threads == 1 {
5852        let _ = partial_sorting_gather_lms_suffixes_32s_1k(sa, 0, n as FastSint);
5853        return;
5854    }
5855
5856    let omp_block_stride = (n_usize / omp_num_threads) & !15usize;
5857    for (thread_num, state) in thread_state.iter_mut().take(omp_num_threads).enumerate() {
5858        let block_start = thread_num * omp_block_stride;
5859        let block_size = if thread_num + 1 < omp_num_threads {
5860            omp_block_stride
5861        } else {
5862            n_usize - block_start
5863        };
5864        state.position = block_start as FastSint;
5865        state.count = partial_sorting_gather_lms_suffixes_32s_1k(
5866            sa,
5867            block_start as FastSint,
5868            block_size as FastSint,
5869        ) - block_start as FastSint;
5870    }
5871
5872    let mut position = 0usize;
5873    for (thread_num, state) in thread_state.iter().take(omp_num_threads).enumerate() {
5874        let count = usize::try_from(state.count).expect("count must be non-negative");
5875        let src = usize::try_from(state.position).expect("position must be non-negative");
5876        if thread_num > 0 && count > 0 {
5877            sa.copy_within(src..src + count, position);
5878        }
5879        position += count;
5880    }
5881}
5882
5883/// Internal helper: induce partial order 8u (OpenMP variant).
5884#[doc(hidden)]
5885pub fn induce_partial_order_8u_omp(
5886    t: &[u8],
5887    sa: &mut [SaSint],
5888    n: SaSint,
5889    k: SaSint,
5890    flags: SaSint,
5891    buckets: &mut [SaSint],
5892    first_lms_suffix: SaSint,
5893    left_suffixes_count: SaSint,
5894    threads: SaSint,
5895    thread_state: &mut [ThreadState],
5896) {
5897    buckets[2 * ALPHABET_SIZE..4 * ALPHABET_SIZE].fill(0);
5898
5899    if (flags & LIBSAIS_FLAGS_GSA) != 0 {
5900        let left = 4 * ALPHABET_SIZE + buckets_index2(0, 1);
5901        let right = 4 * ALPHABET_SIZE + buckets_index2(1, 1);
5902        buckets[left] = buckets[right] - 1;
5903        flip_suffix_markers_omp(sa, buckets[left], threads);
5904    }
5905
5906    let d = partial_sorting_scan_left_to_right_8u_omp(
5907        t,
5908        sa,
5909        n,
5910        k,
5911        buckets,
5912        left_suffixes_count,
5913        0,
5914        threads,
5915        thread_state,
5916    );
5917    partial_sorting_shift_markers_8u_omp(sa, n, buckets, threads);
5918
5919    if (flags & LIBSAIS_FLAGS_GSA) != 0 {
5920        partial_gsa_scan_right_to_left_8u_omp(
5921            t,
5922            sa,
5923            n,
5924            k,
5925            buckets,
5926            first_lms_suffix,
5927            left_suffixes_count,
5928            d,
5929            threads,
5930            thread_state,
5931        );
5932
5933        if t[usize::try_from(first_lms_suffix).expect("first_lms_suffix must be non-negative")] == 0
5934        {
5935            let count = usize::try_from(buckets[buckets_index2(1, 1)] - 1)
5936                .expect("count must be non-negative");
5937            sa.copy_within(0..count, 1);
5938            sa[0] = first_lms_suffix | SAINT_MIN;
5939        }
5940
5941        buckets[buckets_index2(0, 1)] = 0;
5942    } else {
5943        partial_sorting_scan_right_to_left_8u_omp(
5944            t,
5945            sa,
5946            n,
5947            k,
5948            buckets,
5949            first_lms_suffix,
5950            left_suffixes_count,
5951            d,
5952            threads,
5953            thread_state,
5954        );
5955    }
5956}
5957
5958/// Internal helper: induce partial order 32s 6k (OpenMP variant).
5959#[doc(hidden)]
5960pub fn induce_partial_order_32s_6k_omp(
5961    t: &[SaSint],
5962    sa: &mut [SaSint],
5963    n: SaSint,
5964    k: SaSint,
5965    buckets: &mut [SaSint],
5966    first_lms_suffix: SaSint,
5967    left_suffixes_count: SaSint,
5968    threads: SaSint,
5969    thread_state: &mut [ThreadState],
5970) {
5971    let d = partial_sorting_scan_left_to_right_32s_6k_omp(
5972        t,
5973        sa,
5974        n,
5975        buckets,
5976        left_suffixes_count,
5977        0,
5978        threads,
5979        thread_state,
5980    );
5981    partial_sorting_shift_markers_32s_6k_omp(sa, k, buckets, threads);
5982    partial_sorting_shift_buckets_32s_6k(k, buckets);
5983    let _ = partial_sorting_scan_right_to_left_32s_6k_omp(
5984        t,
5985        sa,
5986        n,
5987        buckets,
5988        first_lms_suffix,
5989        left_suffixes_count,
5990        d,
5991        threads,
5992        thread_state,
5993    );
5994}
5995
5996/// Internal helper: induce partial order 32s 4k (OpenMP variant).
5997#[doc(hidden)]
5998pub fn induce_partial_order_32s_4k_omp(
5999    t: &[SaSint],
6000    sa: &mut [SaSint],
6001    n: SaSint,
6002    k: SaSint,
6003    buckets: &mut [SaSint],
6004    threads: SaSint,
6005    thread_state: &mut [ThreadState],
6006) {
6007    let zero_len = 2 * usize::try_from(k).expect("k must be non-negative");
6008    buckets[..zero_len].fill(0);
6009
6010    let d = partial_sorting_scan_left_to_right_32s_4k_omp(
6011        t,
6012        sa,
6013        n,
6014        k,
6015        buckets,
6016        0,
6017        threads,
6018        thread_state,
6019    );
6020    partial_sorting_shift_markers_32s_4k(sa, n);
6021    let _ = partial_sorting_scan_right_to_left_32s_4k_omp(
6022        t,
6023        sa,
6024        n,
6025        k,
6026        buckets,
6027        d,
6028        threads,
6029        thread_state,
6030    );
6031    partial_sorting_gather_lms_suffixes_32s_4k_omp(sa, n, threads, thread_state);
6032}
6033
6034/// Internal helper: induce partial order 32s 2k (OpenMP variant).
6035#[doc(hidden)]
6036pub fn induce_partial_order_32s_2k_omp(
6037    t: &[SaSint],
6038    sa: &mut [SaSint],
6039    n: SaSint,
6040    k: SaSint,
6041    buckets: &mut [SaSint],
6042    threads: SaSint,
6043    thread_state: &mut [ThreadState],
6044) {
6045    let k_usize = usize::try_from(k).expect("k must be non-negative");
6046    let (left, right) = buckets.split_at_mut(k_usize);
6047    partial_sorting_scan_left_to_right_32s_1k_omp(t, sa, n, right, threads, thread_state);
6048    partial_sorting_scan_right_to_left_32s_1k_omp(t, sa, n, left, threads, thread_state);
6049    partial_sorting_gather_lms_suffixes_32s_1k_omp(sa, n, threads, thread_state);
6050}
6051
6052/// Internal helper: induce partial order 32s 1k (OpenMP variant).
6053#[doc(hidden)]
6054pub fn induce_partial_order_32s_1k_omp(
6055    t: &[SaSint],
6056    sa: &mut [SaSint],
6057    n: SaSint,
6058    k: SaSint,
6059    buckets: &mut [SaSint],
6060    threads: SaSint,
6061    thread_state: &mut [ThreadState],
6062) {
6063    count_suffixes_32s(t, n, k, buckets);
6064    initialize_buckets_start_32s_1k(k, buckets);
6065    partial_sorting_scan_left_to_right_32s_1k_omp(t, sa, n, buckets, threads, thread_state);
6066
6067    count_suffixes_32s(t, n, k, buckets);
6068    initialize_buckets_end_32s_1k(k, buckets);
6069    partial_sorting_scan_right_to_left_32s_1k_omp(t, sa, n, buckets, threads, thread_state);
6070
6071    partial_sorting_gather_lms_suffixes_32s_1k_omp(sa, n, threads, thread_state);
6072}
6073
6074/// Internal helper: renumber lms suffixes 8u.
6075#[doc(hidden)]
6076pub fn renumber_lms_suffixes_8u(
6077    sa: &mut [SaSint],
6078    m: SaSint,
6079    mut name: SaSint,
6080    omp_block_start: FastSint,
6081    omp_block_size: FastSint,
6082) -> SaSint {
6083    if omp_block_size <= 0 {
6084        return name;
6085    }
6086
6087    let m_usize = usize::try_from(m).expect("m must be non-negative");
6088    let (sa_head, sam) = sa.split_at_mut(m_usize);
6089    let mut i = omp_block_start;
6090    let mut j = omp_block_start + omp_block_size - 64 - 3;
6091
6092    while i < j {
6093        let i0 = i as usize;
6094        let p0 = sa_head[i0];
6095        let d0 = ((p0 & SAINT_MAX) >> 1) as usize;
6096        sam[d0] = name | SAINT_MIN;
6097        name += SaSint::from(p0 < 0);
6098
6099        let p1 = sa_head[i0 + 1];
6100        let d1 = ((p1 & SAINT_MAX) >> 1) as usize;
6101        sam[d1] = name | SAINT_MIN;
6102        name += SaSint::from(p1 < 0);
6103
6104        let p2 = sa_head[i0 + 2];
6105        let d2 = ((p2 & SAINT_MAX) >> 1) as usize;
6106        sam[d2] = name | SAINT_MIN;
6107        name += SaSint::from(p2 < 0);
6108
6109        let p3 = sa_head[i0 + 3];
6110        let d3 = ((p3 & SAINT_MAX) >> 1) as usize;
6111        sam[d3] = name | SAINT_MIN;
6112        name += SaSint::from(p3 < 0);
6113
6114        i += 4;
6115    }
6116
6117    j += 64 + 3;
6118    while i < j {
6119        let p = sa_head[i as usize];
6120        let d = ((p & SAINT_MAX) >> 1) as usize;
6121        sam[d] = name | SAINT_MIN;
6122        name += SaSint::from(p < 0);
6123        i += 1;
6124    }
6125
6126    name
6127}
6128
6129/// Internal helper: gather marked lms suffixes.
6130#[doc(hidden)]
6131pub fn gather_marked_lms_suffixes(
6132    sa: &mut [SaSint],
6133    m: SaSint,
6134    l: FastSint,
6135    omp_block_start: FastSint,
6136    omp_block_size: FastSint,
6137) -> FastSint {
6138    if omp_block_size <= 0 {
6139        return l;
6140    }
6141
6142    let mut l = l - 1;
6143    let mut i = m as FastSint + omp_block_start + omp_block_size - 1;
6144    let mut j = m as FastSint + omp_block_start + 3;
6145
6146    while i >= j {
6147        let i0 = i as usize;
6148        let s0 = sa[i0];
6149        sa[l as usize] = s0 & SAINT_MAX;
6150        l -= FastSint::from(s0 < 0);
6151
6152        let s1 = sa[i0 - 1];
6153        sa[l as usize] = s1 & SAINT_MAX;
6154        l -= FastSint::from(s1 < 0);
6155
6156        let s2 = sa[i0 - 2];
6157        sa[l as usize] = s2 & SAINT_MAX;
6158        l -= FastSint::from(s2 < 0);
6159
6160        let s3 = sa[i0 - 3];
6161        sa[l as usize] = s3 & SAINT_MAX;
6162        l -= FastSint::from(s3 < 0);
6163
6164        i -= 4;
6165    }
6166
6167    j -= 3;
6168    while i >= j {
6169        let s = sa[i as usize];
6170        sa[l as usize] = s & SAINT_MAX;
6171        l -= FastSint::from(s < 0);
6172        i -= 1;
6173    }
6174
6175    l + 1
6176}
6177
6178/// Internal helper: renumber lms suffixes 8u (OpenMP variant).
6179#[doc(hidden)]
6180pub fn renumber_lms_suffixes_8u_omp(
6181    sa: &mut [SaSint],
6182    m: SaSint,
6183    threads: SaSint,
6184    thread_state: &mut [ThreadState],
6185) -> SaSint {
6186    let mut name = 0;
6187    let omp_num_threads = if threads > 1 && m >= 65_536 {
6188        usize::try_from(threads)
6189            .expect("threads must be non-negative")
6190            .min(thread_state.len())
6191            .max(1)
6192    } else {
6193        1
6194    };
6195    let omp_block_stride = (m as FastSint / omp_num_threads as FastSint) & !15;
6196
6197    if omp_num_threads == 1 {
6198        name = renumber_lms_suffixes_8u(sa, m, 0, 0, m as FastSint);
6199    } else {
6200        for omp_thread_num in 0..omp_num_threads {
6201            let omp_block_start = omp_thread_num as FastSint * omp_block_stride;
6202            let omp_block_size = if omp_thread_num + 1 < omp_num_threads {
6203                omp_block_stride
6204            } else {
6205                m as FastSint - omp_block_start
6206            };
6207            thread_state[omp_thread_num].count =
6208                count_negative_marked_suffixes(sa, omp_block_start, omp_block_size) as FastSint;
6209        }
6210
6211        for omp_thread_num in 0..omp_num_threads {
6212            let omp_block_start = omp_thread_num as FastSint * omp_block_stride;
6213            let omp_block_size = if omp_thread_num + 1 < omp_num_threads {
6214                omp_block_stride
6215            } else {
6216                m as FastSint - omp_block_start
6217            };
6218
6219            let mut count: FastSint = 0;
6220            for t in 0..omp_thread_num {
6221                count += thread_state[t].count;
6222            }
6223
6224            if omp_thread_num + 1 == omp_num_threads {
6225                name = (count + thread_state[omp_thread_num].count) as SaSint;
6226            }
6227
6228            let _ =
6229                renumber_lms_suffixes_8u(sa, m, count as SaSint, omp_block_start, omp_block_size);
6230        }
6231    }
6232
6233    name
6234}
6235
6236/// Internal helper: gather marked lms suffixes (OpenMP variant).
6237#[doc(hidden)]
6238pub fn gather_marked_lms_suffixes_omp(
6239    sa: &mut [SaSint],
6240    n: SaSint,
6241    m: SaSint,
6242    fs: SaSint,
6243    threads: SaSint,
6244    thread_state: &mut [ThreadState],
6245) {
6246    let n_fast = n as FastSint;
6247    let m_fast = m as FastSint;
6248    let omp_num_threads = if threads > 1 && n >= 131_072 {
6249        usize::try_from(threads)
6250            .expect("threads must be non-negative")
6251            .min(thread_state.len())
6252            .max(1)
6253    } else {
6254        1
6255    };
6256    let omp_block_stride = ((n_fast >> 1) / omp_num_threads as FastSint) & !15;
6257
6258    if omp_num_threads == 1 {
6259        let _ = gather_marked_lms_suffixes(sa, m, n_fast + fs as FastSint, 0, n_fast >> 1);
6260    } else {
6261        for omp_thread_num in 0..omp_num_threads {
6262            let omp_block_start = omp_thread_num as FastSint * omp_block_stride;
6263            let omp_block_size = if omp_thread_num + 1 < omp_num_threads {
6264                omp_block_stride
6265            } else {
6266                (n_fast >> 1) - omp_block_start
6267            };
6268
6269            if omp_thread_num < omp_num_threads - 1 {
6270                thread_state[omp_thread_num].position = gather_marked_lms_suffixes(
6271                    sa,
6272                    m,
6273                    m_fast + omp_block_start + omp_block_size,
6274                    omp_block_start,
6275                    omp_block_size,
6276                );
6277                thread_state[omp_thread_num].count = m_fast + omp_block_start + omp_block_size
6278                    - thread_state[omp_thread_num].position;
6279            } else {
6280                thread_state[omp_thread_num].position = gather_marked_lms_suffixes(
6281                    sa,
6282                    m,
6283                    n_fast + fs as FastSint,
6284                    omp_block_start,
6285                    omp_block_size,
6286                );
6287                thread_state[omp_thread_num].count =
6288                    n_fast + fs as FastSint - thread_state[omp_thread_num].position;
6289            }
6290        }
6291
6292        let mut position = n_fast + fs as FastSint;
6293        for t in (0..omp_num_threads).rev() {
6294            position -= thread_state[t].count;
6295            if t + 1 != omp_num_threads && thread_state[t].count > 0 {
6296                let src = usize::try_from(thread_state[t].position)
6297                    .expect("position must be non-negative");
6298                let len =
6299                    usize::try_from(thread_state[t].count).expect("count must be non-negative");
6300                let dst = usize::try_from(position).expect("position must be non-negative");
6301                sa.copy_within(src..src + len, dst);
6302            }
6303        }
6304    }
6305}
6306
6307/// Internal helper: renumber and gather lms suffixes (OpenMP variant).
6308#[doc(hidden)]
6309pub fn renumber_and_gather_lms_suffixes_omp(
6310    sa: &mut [SaSint],
6311    n: SaSint,
6312    m: SaSint,
6313    fs: SaSint,
6314    threads: SaSint,
6315    thread_state: &mut [ThreadState],
6316) -> SaSint {
6317    let m_usize = usize::try_from(m).expect("m must be non-negative");
6318    let half_n = usize::try_from(n >> 1).expect("n must be non-negative");
6319    sa[m_usize..m_usize + half_n].fill(0);
6320
6321    let name = renumber_lms_suffixes_8u_omp(sa, m, threads, thread_state);
6322    if name < m {
6323        gather_marked_lms_suffixes_omp(sa, n, m, fs, threads, thread_state);
6324    } else {
6325        let mut i = 0;
6326        while i < m_usize {
6327            sa[i] &= SAINT_MAX;
6328            i += 1;
6329        }
6330    }
6331
6332    name
6333}
6334
6335/// Internal helper: renumber distinct lms suffixes 32s 4k.
6336#[doc(hidden)]
6337pub fn renumber_distinct_lms_suffixes_32s_4k(
6338    sa: &mut [SaSint],
6339    m: SaSint,
6340    mut name: SaSint,
6341    omp_block_start: FastSint,
6342    omp_block_size: FastSint,
6343) -> SaSint {
6344    if omp_block_size <= 0 {
6345        return name;
6346    }
6347
6348    let prefetch_distance = 64usize;
6349    let m_usize = usize::try_from(m).expect("m must be non-negative");
6350    let start = usize::try_from(omp_block_start).expect("omp_block_start must be non-negative");
6351    let size = usize::try_from(omp_block_size).expect("omp_block_size must be non-negative");
6352    let (sa_head, sam) = sa.split_at_mut(m_usize);
6353    let mut i = start;
6354    let mut j = start
6355        .saturating_add(size)
6356        .saturating_sub(prefetch_distance + 3);
6357    let mut p0;
6358    let mut p1;
6359    let mut p2;
6360    let mut p3 = 0;
6361
6362    while i < j {
6363        p0 = sa_head[i];
6364        sa_head[i] = p0 & SAINT_MAX;
6365        sam[(sa_head[i] >> 1) as usize] = name | (p0 & p3 & SAINT_MIN);
6366        name += SaSint::from(p0 < 0);
6367
6368        p1 = sa_head[i + 1];
6369        sa_head[i + 1] = p1 & SAINT_MAX;
6370        sam[(sa_head[i + 1] >> 1) as usize] = name | (p1 & p0 & SAINT_MIN);
6371        name += SaSint::from(p1 < 0);
6372
6373        p2 = sa_head[i + 2];
6374        sa_head[i + 2] = p2 & SAINT_MAX;
6375        sam[(sa_head[i + 2] >> 1) as usize] = name | (p2 & p1 & SAINT_MIN);
6376        name += SaSint::from(p2 < 0);
6377
6378        p3 = sa_head[i + 3];
6379        sa_head[i + 3] = p3 & SAINT_MAX;
6380        sam[(sa_head[i + 3] >> 1) as usize] = name | (p3 & p2 & SAINT_MIN);
6381        name += SaSint::from(p3 < 0);
6382
6383        i += 4;
6384    }
6385
6386    j = start + size;
6387    while i < j {
6388        p2 = p3;
6389        p3 = sa_head[i];
6390        sa_head[i] = p3 & SAINT_MAX;
6391        sam[(sa_head[i] >> 1) as usize] = name | (p3 & p2 & SAINT_MIN);
6392        name += SaSint::from(p3 < 0);
6393        i += 1;
6394    }
6395
6396    name
6397}
6398
6399/// Internal helper: mark distinct lms suffixes 32s.
6400#[doc(hidden)]
6401pub fn mark_distinct_lms_suffixes_32s(
6402    sa: &mut [SaSint],
6403    m: SaSint,
6404    omp_block_start: FastSint,
6405    omp_block_size: FastSint,
6406) {
6407    if omp_block_size <= 0 {
6408        return;
6409    }
6410
6411    let m_usize = usize::try_from(m).expect("m must be non-negative");
6412    let start = usize::try_from(omp_block_start).expect("omp_block_start must be non-negative");
6413    let size = usize::try_from(omp_block_size).expect("omp_block_size must be non-negative");
6414    let mut i = m_usize + start;
6415    let mut j = m_usize + start + size.saturating_sub(3);
6416    let mut p3 = 0;
6417
6418    while i < j {
6419        let mut p0 = sa[i];
6420        sa[i] = p0 & (p3 | SAINT_MAX);
6421        p0 = if p0 == 0 { p3 } else { p0 };
6422
6423        let mut p1 = sa[i + 1];
6424        sa[i + 1] = p1 & (p0 | SAINT_MAX);
6425        p1 = if p1 == 0 { p0 } else { p1 };
6426
6427        let mut p2 = sa[i + 2];
6428        sa[i + 2] = p2 & (p1 | SAINT_MAX);
6429        p2 = if p2 == 0 { p1 } else { p2 };
6430
6431        p3 = sa[i + 3];
6432        sa[i + 3] = p3 & (p2 | SAINT_MAX);
6433        p3 = if p3 == 0 { p2 } else { p3 };
6434
6435        i += 4;
6436    }
6437
6438    j = m_usize + start + size;
6439    while i < j {
6440        let p2 = p3;
6441        p3 = sa[i];
6442        sa[i] = p3 & (p2 | SAINT_MAX);
6443        p3 = if p3 == 0 { p2 } else { p3 };
6444        i += 1;
6445    }
6446}
6447
6448/// Internal helper: clamp lms suffixes length 32s.
6449#[doc(hidden)]
6450pub fn clamp_lms_suffixes_length_32s(
6451    sa: &mut [SaSint],
6452    m: SaSint,
6453    omp_block_start: FastSint,
6454    omp_block_size: FastSint,
6455) {
6456    if omp_block_size <= 0 {
6457        return;
6458    }
6459
6460    let m_usize = usize::try_from(m).expect("m must be non-negative");
6461    let start = usize::try_from(omp_block_start).expect("omp_block_start must be non-negative");
6462    let size = usize::try_from(omp_block_size).expect("omp_block_size must be non-negative");
6463    let mut i = m_usize + start;
6464    let mut j = m_usize + start + size.saturating_sub(3);
6465
6466    while i < j {
6467        let s0 = sa[i];
6468        sa[i] = if s0 < 0 { s0 } else { 0 } & SAINT_MAX;
6469
6470        let s1 = sa[i + 1];
6471        sa[i + 1] = if s1 < 0 { s1 } else { 0 } & SAINT_MAX;
6472
6473        let s2 = sa[i + 2];
6474        sa[i + 2] = if s2 < 0 { s2 } else { 0 } & SAINT_MAX;
6475
6476        let s3 = sa[i + 3];
6477        sa[i + 3] = if s3 < 0 { s3 } else { 0 } & SAINT_MAX;
6478
6479        i += 4;
6480    }
6481
6482    j = m_usize + start + size;
6483    while i < j {
6484        let s = sa[i];
6485        sa[i] = if s < 0 { s } else { 0 } & SAINT_MAX;
6486        i += 1;
6487    }
6488}
6489
6490/// Internal helper: renumber distinct lms suffixes 32s 4k (OpenMP variant).
6491#[doc(hidden)]
6492pub fn renumber_distinct_lms_suffixes_32s_4k_omp(
6493    sa: &mut [SaSint],
6494    m: SaSint,
6495    threads: SaSint,
6496    thread_state: &mut [ThreadState],
6497) -> SaSint {
6498    let mut name = 0;
6499    let m_usize = usize::try_from(m).expect("m must be non-negative");
6500    let omp_num_threads = if threads > 1 && m >= 65_536 {
6501        usize::try_from(threads)
6502            .expect("threads must be non-negative")
6503            .min(thread_state.len())
6504            .max(1)
6505    } else {
6506        1
6507    };
6508    let omp_block_stride = (m_usize / omp_num_threads) & !15usize;
6509
6510    if omp_num_threads == 1 {
6511        let omp_block_start = 0usize;
6512        let omp_block_size = m_usize - omp_block_start;
6513        name = renumber_distinct_lms_suffixes_32s_4k(
6514            sa,
6515            m,
6516            1,
6517            omp_block_start as FastSint,
6518            omp_block_size as FastSint,
6519        );
6520    } else {
6521        for omp_thread_num in 0..omp_num_threads {
6522            let omp_block_start = omp_thread_num * omp_block_stride;
6523            let omp_block_size = if omp_thread_num + 1 < omp_num_threads {
6524                omp_block_stride
6525            } else {
6526                m_usize - omp_block_start
6527            };
6528            thread_state[omp_thread_num].count = count_negative_marked_suffixes(
6529                sa,
6530                omp_block_start as FastSint,
6531                omp_block_size as FastSint,
6532            ) as FastSint;
6533        }
6534
6535        for omp_thread_num in 0..omp_num_threads {
6536            let omp_block_start = omp_thread_num * omp_block_stride;
6537            let omp_block_size = if omp_thread_num + 1 < omp_num_threads {
6538                omp_block_stride
6539            } else {
6540                m_usize - omp_block_start
6541            };
6542
6543            let mut count: FastSint = 1;
6544            for t in 0..omp_thread_num {
6545                count += thread_state[t].count;
6546            }
6547
6548            if omp_thread_num + 1 == omp_num_threads {
6549                name = (count + thread_state[omp_thread_num].count) as SaSint;
6550            }
6551
6552            let _ = renumber_distinct_lms_suffixes_32s_4k(
6553                sa,
6554                m,
6555                count as SaSint,
6556                omp_block_start as FastSint,
6557                omp_block_size as FastSint,
6558            );
6559        }
6560    }
6561
6562    name - 1
6563}
6564
6565/// Internal helper: mark distinct lms suffixes 32s (OpenMP variant).
6566#[doc(hidden)]
6567pub fn mark_distinct_lms_suffixes_32s_omp(
6568    sa: &mut [SaSint],
6569    n: SaSint,
6570    m: SaSint,
6571    threads: SaSint,
6572) {
6573    let half_n = usize::try_from(n >> 1).expect("n must be non-negative");
6574    let omp_num_threads = if threads > 1 && n >= 131_072 {
6575        usize::try_from(threads)
6576            .expect("threads must be non-negative")
6577            .max(1)
6578    } else {
6579        1
6580    };
6581    let omp_block_stride = (half_n / omp_num_threads) & !15usize;
6582
6583    for omp_thread_num in 0..omp_num_threads {
6584        let omp_block_start = omp_thread_num * omp_block_stride;
6585        let omp_block_size = if omp_thread_num + 1 < omp_num_threads {
6586            omp_block_stride
6587        } else {
6588            half_n - omp_block_start
6589        };
6590        mark_distinct_lms_suffixes_32s(
6591            sa,
6592            m,
6593            omp_block_start as FastSint,
6594            omp_block_size as FastSint,
6595        );
6596    }
6597}
6598
6599/// Internal helper: clamp lms suffixes length 32s (OpenMP variant).
6600#[doc(hidden)]
6601pub fn clamp_lms_suffixes_length_32s_omp(sa: &mut [SaSint], n: SaSint, m: SaSint, threads: SaSint) {
6602    let half_n = usize::try_from(n >> 1).expect("n must be non-negative");
6603    let omp_num_threads = if threads > 1 && n >= 131_072 {
6604        usize::try_from(threads)
6605            .expect("threads must be non-negative")
6606            .max(1)
6607    } else {
6608        1
6609    };
6610    let omp_block_stride = (half_n / omp_num_threads) & !15usize;
6611
6612    for omp_thread_num in 0..omp_num_threads {
6613        let omp_block_start = omp_thread_num * omp_block_stride;
6614        let omp_block_size = if omp_thread_num + 1 < omp_num_threads {
6615            omp_block_stride
6616        } else {
6617            half_n - omp_block_start
6618        };
6619        clamp_lms_suffixes_length_32s(
6620            sa,
6621            m,
6622            omp_block_start as FastSint,
6623            omp_block_size as FastSint,
6624        );
6625    }
6626}
6627
6628/// Internal helper: renumber and mark distinct lms suffixes 32s 4k (OpenMP variant).
6629#[doc(hidden)]
6630pub fn renumber_and_mark_distinct_lms_suffixes_32s_4k_omp(
6631    sa: &mut [SaSint],
6632    n: SaSint,
6633    m: SaSint,
6634    threads: SaSint,
6635    thread_state: &mut [ThreadState],
6636) -> SaSint {
6637    let m_usize = usize::try_from(m).expect("m must be non-negative");
6638    let half_n = usize::try_from(n >> 1).expect("n must be non-negative");
6639    sa[m_usize..m_usize + half_n].fill(0);
6640
6641    let name = renumber_distinct_lms_suffixes_32s_4k_omp(sa, m, threads, thread_state);
6642    if name < m {
6643        mark_distinct_lms_suffixes_32s_omp(sa, n, m, threads);
6644    }
6645
6646    name
6647}
6648
6649/// Internal helper: renumber and mark distinct lms suffixes 32s 1k (OpenMP variant).
6650#[doc(hidden)]
6651pub fn renumber_and_mark_distinct_lms_suffixes_32s_1k_omp(
6652    t: &[SaSint],
6653    sa: &mut [SaSint],
6654    n: SaSint,
6655    m: SaSint,
6656    threads: SaSint,
6657) -> SaSint {
6658    let m_usize = usize::try_from(m).expect("m must be non-negative");
6659    let n_usize = usize::try_from(n).expect("n must be non-negative");
6660
6661    let _ = gather_lms_suffixes_32s(t, sa, n);
6662
6663    let zero_len = n_usize
6664        .checked_sub(m_usize)
6665        .and_then(|v| v.checked_sub(m_usize))
6666        .expect("n must be at least 2*m");
6667    sa[m_usize..m_usize + zero_len].fill(0);
6668
6669    {
6670        let prefetch_distance: FastSint = 64;
6671        let mut i = n as FastSint - m as FastSint;
6672        let mut j = n as FastSint - 1 - prefetch_distance - 3;
6673
6674        while i < j {
6675            let iu = i as usize;
6676            let s0 = (sa[iu] as SaUint >> 1) as usize;
6677            let s1 = (sa[iu + 1] as SaUint >> 1) as usize;
6678            let s2 = (sa[iu + 2] as SaUint >> 1) as usize;
6679            let s3 = (sa[iu + 3] as SaUint >> 1) as usize;
6680
6681            sa[m_usize + s0] = sa[iu + 1] - sa[iu] + 1 + SAINT_MIN;
6682            sa[m_usize + s1] = sa[iu + 2] - sa[iu + 1] + 1 + SAINT_MIN;
6683            sa[m_usize + s2] = sa[iu + 3] - sa[iu + 2] + 1 + SAINT_MIN;
6684            sa[m_usize + s3] = sa[iu + 4] - sa[iu + 3] + 1 + SAINT_MIN;
6685            i += 4;
6686        }
6687
6688        j += prefetch_distance + 3;
6689        while i < j {
6690            let iu = i as usize;
6691            let s = (sa[iu] as SaUint >> 1) as usize;
6692            sa[m_usize + s] = sa[iu + 1] - sa[iu] + 1 + SAINT_MIN;
6693            i += 1;
6694        }
6695
6696        let tail = (sa[n_usize - 1] as SaUint >> 1) as usize;
6697        sa[m_usize + tail] = 1 + SAINT_MIN;
6698    }
6699
6700    clamp_lms_suffixes_length_32s_omp(sa, n, m, threads);
6701
6702    let mut name = 1;
6703    if m_usize > 0 {
6704        let (sa_head, sam) = sa.split_at_mut(m_usize);
6705        let mut i = 1usize;
6706        let prefetch_distance = 64usize;
6707        let mut j = m_usize.saturating_sub(prefetch_distance + 1);
6708        let mut p = usize::try_from(sa_head[0]).expect("suffix index must be non-negative");
6709        let mut plen = sam[p >> 1];
6710        let mut pdiff = SAINT_MIN;
6711
6712        while i < j {
6713            let q = usize::try_from(sa_head[i]).expect("suffix index must be non-negative");
6714            let qlen = sam[q >> 1];
6715            let mut qdiff = SAINT_MIN;
6716            if plen == qlen {
6717                let mut l = 0usize;
6718                while l < qlen as usize {
6719                    if t[p + l] != t[q + l] {
6720                        break;
6721                    }
6722                    l += 1;
6723                }
6724                qdiff = ((l as SaSint) - qlen) & SAINT_MIN;
6725            }
6726            sam[p >> 1] = name | (pdiff & qdiff);
6727            name += SaSint::from(qdiff < 0);
6728
6729            p = usize::try_from(sa_head[i + 1]).expect("suffix index must be non-negative");
6730            plen = sam[p >> 1];
6731            pdiff = SAINT_MIN;
6732            if qlen == plen {
6733                let mut l = 0usize;
6734                while l < plen as usize {
6735                    if t[q + l] != t[p + l] {
6736                        break;
6737                    }
6738                    l += 1;
6739                }
6740                pdiff = ((l as SaSint) - plen) & SAINT_MIN;
6741            }
6742            sam[q >> 1] = name | (qdiff & pdiff);
6743            name += SaSint::from(pdiff < 0);
6744            i += 2;
6745        }
6746
6747        j = m_usize;
6748        while i < j {
6749            let q = usize::try_from(sa_head[i]).expect("suffix index must be non-negative");
6750            let qlen = sam[q >> 1];
6751            let mut qdiff = SAINT_MIN;
6752            if plen == qlen {
6753                let mut l = 0usize;
6754                while l < plen as usize {
6755                    if t[p + l] != t[q + l] {
6756                        break;
6757                    }
6758                    l += 1;
6759                }
6760                qdiff = ((l as SaSint) - plen) & SAINT_MIN;
6761            }
6762            sam[p >> 1] = name | (pdiff & qdiff);
6763            name += SaSint::from(qdiff < 0);
6764
6765            p = q;
6766            plen = qlen;
6767            pdiff = qdiff;
6768            i += 1;
6769        }
6770
6771        sam[p >> 1] = name | pdiff;
6772        name += 1;
6773    }
6774
6775    if name <= m {
6776        mark_distinct_lms_suffixes_32s_omp(sa, n, m, threads);
6777    }
6778
6779    name - 1
6780}
6781
6782/// Internal helper: reconstruct lms suffixes.
6783#[doc(hidden)]
6784pub fn reconstruct_lms_suffixes(
6785    sa: &mut [SaSint],
6786    n: SaSint,
6787    m: SaSint,
6788    omp_block_start: FastSint,
6789    omp_block_size: FastSint,
6790) {
6791    if omp_block_size <= 0 {
6792        return;
6793    }
6794
6795    let prefetch_distance: FastSint = 64;
6796    let base = (n - m) as usize;
6797    let mut i = omp_block_start;
6798    let mut j = omp_block_start + omp_block_size - prefetch_distance - 3;
6799
6800    while i < j {
6801        let iu = i as usize;
6802        let s0 = sa[iu] as usize;
6803        let s1 = sa[iu + 1] as usize;
6804        let s2 = sa[iu + 2] as usize;
6805        let s3 = sa[iu + 3] as usize;
6806        sa[iu] = sa[base + s0];
6807        sa[iu + 1] = sa[base + s1];
6808        sa[iu + 2] = sa[base + s2];
6809        sa[iu + 3] = sa[base + s3];
6810        i += 4;
6811    }
6812
6813    j += prefetch_distance + 3;
6814    while i < j {
6815        let iu = i as usize;
6816        let s = sa[iu] as usize;
6817        sa[iu] = sa[base + s];
6818        i += 1;
6819    }
6820}
6821
6822/// Internal helper: reconstruct lms suffixes (OpenMP variant).
6823#[doc(hidden)]
6824pub fn reconstruct_lms_suffixes_omp(sa: &mut [SaSint], n: SaSint, m: SaSint, threads: SaSint) {
6825    let m_usize = usize::try_from(m).expect("m must be non-negative");
6826    let omp_num_threads = if threads > 1 && m >= 65_536 {
6827        usize::try_from(threads)
6828            .expect("threads must be non-negative")
6829            .max(1)
6830    } else {
6831        1
6832    };
6833    let omp_block_stride = (m_usize / omp_num_threads) & !15usize;
6834
6835    for omp_thread_num in 0..omp_num_threads {
6836        let omp_block_start = omp_thread_num * omp_block_stride;
6837        let omp_block_size = if omp_thread_num + 1 < omp_num_threads {
6838            omp_block_stride
6839        } else {
6840            m_usize - omp_block_start
6841        };
6842        reconstruct_lms_suffixes(
6843            sa,
6844            n,
6845            m,
6846            omp_block_start as FastSint,
6847            omp_block_size as FastSint,
6848        );
6849    }
6850}
6851
6852/// Internal helper: place lms suffixes interval 8u.
6853#[doc(hidden)]
6854pub fn place_lms_suffixes_interval_8u(
6855    sa: &mut [SaSint],
6856    n: SaSint,
6857    mut m: SaSint,
6858    flags: SaSint,
6859    buckets: &mut [SaSint],
6860) {
6861    let bucket_end_base = 7 * ALPHABET_SIZE;
6862    if (flags & LIBSAIS_FLAGS_GSA) != 0 {
6863        buckets[bucket_end_base] -= 1;
6864    }
6865
6866    let mut j = usize::try_from(n).expect("n must be non-negative");
6867    for c in (0..ALPHABET_SIZE - 1).rev() {
6868        let l = usize::try_from(
6869            buckets[buckets_index2(c, 1) + buckets_index2(1, 0)] - buckets[buckets_index2(c, 1)],
6870        )
6871        .expect("interval length must be non-negative");
6872        if l > 0 {
6873            let i = usize::try_from(buckets[bucket_end_base + c])
6874                .expect("bucket end must be non-negative");
6875            if j > i {
6876                sa[i..j].fill(0);
6877            }
6878
6879            let new_j = i - l;
6880            let src_end = usize::try_from(m).expect("m must be non-negative");
6881            let src_start = src_end - l;
6882            sa.copy_within(src_start..src_end, new_j);
6883            m -= l as SaSint;
6884            j = new_j;
6885        }
6886    }
6887
6888    sa[..j].fill(0);
6889
6890    if (flags & LIBSAIS_FLAGS_GSA) != 0 {
6891        buckets[bucket_end_base] += 1;
6892    }
6893}
6894
6895/// Internal helper: place lms suffixes interval 32s 4k.
6896#[doc(hidden)]
6897pub fn place_lms_suffixes_interval_32s_4k(
6898    sa: &mut [SaSint],
6899    n: SaSint,
6900    k: SaSint,
6901    mut m: SaSint,
6902    buckets: &[SaSint],
6903) {
6904    let k_usize = usize::try_from(k).expect("k must be non-negative");
6905    let bucket_end = &buckets[3 * k_usize..4 * k_usize];
6906
6907    let mut j = usize::try_from(n).expect("n must be non-negative");
6908    for c in (0..k_usize - 1).rev() {
6909        let l = usize::try_from(
6910            buckets[buckets_index2(c, 1) + buckets_index2(1, 0)] - buckets[buckets_index2(c, 1)],
6911        )
6912        .expect("interval length must be non-negative");
6913        if l > 0 {
6914            let i = usize::try_from(bucket_end[c]).expect("bucket end must be non-negative");
6915            if j > i {
6916                sa[i..j].fill(0);
6917            }
6918
6919            let new_j = i - l;
6920            let src_end = usize::try_from(m).expect("m must be non-negative");
6921            let src_start = src_end - l;
6922            sa.copy_within(src_start..src_end, new_j);
6923            m -= l as SaSint;
6924            j = new_j;
6925        }
6926    }
6927
6928    sa[..j].fill(0);
6929}
6930
6931/// Internal helper: place lms suffixes interval 32s 2k.
6932#[doc(hidden)]
6933pub fn place_lms_suffixes_interval_32s_2k(
6934    sa: &mut [SaSint],
6935    n: SaSint,
6936    k: SaSint,
6937    mut m: SaSint,
6938    buckets: &[SaSint],
6939) {
6940    let k_usize = usize::try_from(k).expect("k must be non-negative");
6941    let mut j = usize::try_from(n).expect("n must be non-negative");
6942
6943    if k_usize > 1 {
6944        let mut c = buckets_index2(k_usize - 2, 0) as isize;
6945        while c >= buckets_index2(0, 0) as isize {
6946            let c_usize = c as usize;
6947            let l = usize::try_from(
6948                buckets[c_usize + buckets_index2(1, 1)] - buckets[c_usize + buckets_index2(0, 1)],
6949            )
6950            .expect("interval length must be non-negative");
6951            if l > 0 {
6952                let i =
6953                    usize::try_from(buckets[c_usize]).expect("bucket start must be non-negative");
6954                if j > i {
6955                    sa[i..j].fill(0);
6956                }
6957
6958                let new_j = i - l;
6959                let src_end = usize::try_from(m).expect("m must be non-negative");
6960                let src_start = src_end - l;
6961                sa.copy_within(src_start..src_end, new_j);
6962                m -= l as SaSint;
6963                j = new_j;
6964            }
6965            c -= buckets_index2(1, 0) as isize;
6966        }
6967    }
6968
6969    sa[..j].fill(0);
6970}
6971
6972/// Internal helper: place lms suffixes interval 32s 1k.
6973#[doc(hidden)]
6974pub fn place_lms_suffixes_interval_32s_1k(
6975    t: &[SaSint],
6976    sa: &mut [SaSint],
6977    k: SaSint,
6978    m: SaSint,
6979    buckets: &[SaSint],
6980) {
6981    let mut c = k - 1;
6982    let c_usize = usize::try_from(c).expect("k must be positive");
6983    let mut l = usize::try_from(buckets[c_usize]).expect("bucket end must be non-negative");
6984
6985    let m_usize = usize::try_from(m).expect("m must be non-negative");
6986    for i in (0..m_usize).rev() {
6987        let p = usize::try_from(sa[i]).expect("suffix index must be non-negative");
6988        let tp = t[p];
6989        if tp != c {
6990            c = tp;
6991            let bucket = usize::try_from(c).expect("bucket index must be non-negative");
6992            let bucket_pos =
6993                usize::try_from(buckets[bucket]).expect("bucket end must be non-negative");
6994            if l > bucket_pos {
6995                sa[bucket_pos..l].fill(0);
6996            }
6997            l = bucket_pos;
6998        }
6999        l -= 1;
7000        sa[l] = p as SaSint;
7001    }
7002
7003    sa[..l].fill(0);
7004}
7005
7006/// Internal helper: place lms suffixes histogram 32s 6k.
7007#[doc(hidden)]
7008pub fn place_lms_suffixes_histogram_32s_6k(
7009    sa: &mut [SaSint],
7010    n: SaSint,
7011    k: SaSint,
7012    mut m: SaSint,
7013    buckets: &[SaSint],
7014) {
7015    let k_usize = usize::try_from(k).expect("k must be non-negative");
7016    let bucket_end = &buckets[5 * k_usize..6 * k_usize];
7017
7018    let mut j = usize::try_from(n).expect("n must be non-negative");
7019    for c in (0..k_usize - 1).rev() {
7020        let l = usize::try_from(buckets[buckets_index4(c, 1)])
7021            .expect("histogram length must be non-negative");
7022        if l > 0 {
7023            let i = usize::try_from(bucket_end[c]).expect("bucket end must be non-negative");
7024            if j > i {
7025                sa[i..j].fill(0);
7026            }
7027
7028            let new_j = i - l;
7029            let src_end = usize::try_from(m).expect("m must be non-negative");
7030            let src_start = src_end - l;
7031            sa.copy_within(src_start..src_end, new_j);
7032            m -= l as SaSint;
7033            j = new_j;
7034        }
7035    }
7036
7037    sa[..j].fill(0);
7038}
7039
7040/// Internal helper: place lms suffixes histogram 32s 4k.
7041#[doc(hidden)]
7042pub fn place_lms_suffixes_histogram_32s_4k(
7043    sa: &mut [SaSint],
7044    n: SaSint,
7045    k: SaSint,
7046    mut m: SaSint,
7047    buckets: &[SaSint],
7048) {
7049    let k_usize = usize::try_from(k).expect("k must be non-negative");
7050    let bucket_end = &buckets[3 * k_usize..4 * k_usize];
7051
7052    let mut j = usize::try_from(n).expect("n must be non-negative");
7053    for c in (0..k_usize - 1).rev() {
7054        let l = usize::try_from(buckets[buckets_index2(c, 1)])
7055            .expect("histogram length must be non-negative");
7056        if l > 0 {
7057            let i = usize::try_from(bucket_end[c]).expect("bucket end must be non-negative");
7058            if j > i {
7059                sa[i..j].fill(0);
7060            }
7061
7062            let new_j = i - l;
7063            let src_end = usize::try_from(m).expect("m must be non-negative");
7064            let src_start = src_end - l;
7065            sa.copy_within(src_start..src_end, new_j);
7066            m -= l as SaSint;
7067            j = new_j;
7068        }
7069    }
7070
7071    sa[..j].fill(0);
7072}
7073
7074/// Internal helper: place lms suffixes histogram 32s 2k.
7075#[doc(hidden)]
7076pub fn place_lms_suffixes_histogram_32s_2k(
7077    sa: &mut [SaSint],
7078    n: SaSint,
7079    k: SaSint,
7080    mut m: SaSint,
7081    buckets: &[SaSint],
7082) {
7083    let k_usize = usize::try_from(k).expect("k must be non-negative");
7084    let mut j = usize::try_from(n).expect("n must be non-negative");
7085
7086    if k_usize > 1 {
7087        let mut c = buckets_index2(k_usize - 2, 0) as isize;
7088        while c >= buckets_index2(0, 0) as isize {
7089            let c_usize = c as usize;
7090            let l = usize::try_from(buckets[c_usize + buckets_index2(0, 1)])
7091                .expect("histogram length must be non-negative");
7092            if l > 0 {
7093                let i =
7094                    usize::try_from(buckets[c_usize]).expect("bucket start must be non-negative");
7095                if j > i {
7096                    sa[i..j].fill(0);
7097                }
7098
7099                let new_j = i - l;
7100                let src_end = usize::try_from(m).expect("m must be non-negative");
7101                let src_start = src_end - l;
7102                sa.copy_within(src_start..src_end, new_j);
7103                m -= l as SaSint;
7104                j = new_j;
7105            }
7106            c -= buckets_index2(1, 0) as isize;
7107        }
7108    }
7109
7110    sa[..j].fill(0);
7111}
7112
7113/// Internal helper: final bwt scan left to right 8u.
7114#[doc(hidden)]
7115pub fn final_bwt_scan_left_to_right_8u(
7116    t: &[u8],
7117    sa: &mut [SaSint],
7118    induction_bucket: &mut [SaSint],
7119    omp_block_start: FastSint,
7120    omp_block_size: FastSint,
7121) {
7122    if omp_block_size <= 0 {
7123        return;
7124    }
7125
7126    let start = usize::try_from(omp_block_start).expect("omp_block_start must be non-negative");
7127    let size = usize::try_from(omp_block_size).expect("omp_block_size must be non-negative");
7128    for i in start..start + size {
7129        let mut p = sa[i];
7130        sa[i] = p & SAINT_MAX;
7131        if p > 0 {
7132            p -= 1;
7133            let p_usize = usize::try_from(p).expect("suffix index must be non-negative");
7134            sa[i] = t[p_usize] as SaSint | SAINT_MIN;
7135            let bucket = t[p_usize] as usize;
7136            let slot = usize::try_from(induction_bucket[bucket])
7137                .expect("bucket slot must be non-negative");
7138            sa[slot] = p
7139                | ((usize::from(t[p_usize - usize::from(p > 0)] < t[p_usize]) as SaSint)
7140                    << (SAINT_BIT - 1));
7141            induction_bucket[bucket] += 1;
7142        }
7143    }
7144}
7145
7146/// Internal helper: final bwt aux scan left to right 8u.
7147#[doc(hidden)]
7148pub fn final_bwt_aux_scan_left_to_right_8u(
7149    t: &[u8],
7150    sa: &mut [SaSint],
7151    rm: SaSint,
7152    i_out: &mut [SaSint],
7153    induction_bucket: &mut [SaSint],
7154    omp_block_start: FastSint,
7155    omp_block_size: FastSint,
7156) {
7157    if omp_block_size <= 0 {
7158        return;
7159    }
7160
7161    let start = usize::try_from(omp_block_start).expect("omp_block_start must be non-negative");
7162    let size = usize::try_from(omp_block_size).expect("omp_block_size must be non-negative");
7163    for i in start..start + size {
7164        let mut p = sa[i];
7165        sa[i] = p & SAINT_MAX;
7166        if p > 0 {
7167            p -= 1;
7168            let p_usize = usize::try_from(p).expect("suffix index must be non-negative");
7169            sa[i] = t[p_usize] as SaSint | SAINT_MIN;
7170            let bucket = t[p_usize] as usize;
7171            let slot = usize::try_from(induction_bucket[bucket])
7172                .expect("bucket slot must be non-negative");
7173            sa[slot] = p
7174                | ((usize::from(t[p_usize - usize::from(p > 0)] < t[p_usize]) as SaSint)
7175                    << (SAINT_BIT - 1));
7176            induction_bucket[bucket] += 1;
7177            if (p & rm) == 0 {
7178                let out_idx =
7179                    usize::try_from(p / (rm + 1)).expect("sample index must be non-negative");
7180                i_out[out_idx] = induction_bucket[bucket];
7181            }
7182        }
7183    }
7184}
7185
7186/// Internal helper: final sorting scan left to right 8u.
7187#[doc(hidden)]
7188pub fn final_sorting_scan_left_to_right_8u(
7189    t: &[u8],
7190    sa: &mut [SaSint],
7191    induction_bucket: &mut [SaSint],
7192    omp_block_start: FastSint,
7193    omp_block_size: FastSint,
7194) {
7195    if omp_block_size <= 0 {
7196        return;
7197    }
7198
7199    let prefetch_distance = 64usize;
7200    let start = usize::try_from(omp_block_start).expect("omp_block_start must be non-negative");
7201    let size = usize::try_from(omp_block_size).expect("omp_block_size must be non-negative");
7202
7203    let mut i = start;
7204    let mut j = if size > prefetch_distance + 1 {
7205        start + size - (prefetch_distance + 1)
7206    } else {
7207        start
7208    };
7209    while i < j {
7210        let mut p0 = sa[i];
7211        sa[i] = p0 ^ SAINT_MIN;
7212        if p0 > 0 {
7213            p0 -= 1;
7214            let p0_usize = p0 as usize;
7215            let bucket0 = t[p0_usize] as usize;
7216            let slot0 = induction_bucket[bucket0] as usize;
7217            sa[slot0] = p0
7218                | ((usize::from(t[p0_usize - usize::from(p0 > 0)] < t[p0_usize]) as SaSint)
7219                    << (SAINT_BIT - 1));
7220            induction_bucket[bucket0] += 1;
7221        }
7222
7223        let mut p1 = sa[i + 1];
7224        sa[i + 1] = p1 ^ SAINT_MIN;
7225        if p1 > 0 {
7226            p1 -= 1;
7227            let p1_usize = p1 as usize;
7228            let bucket1 = t[p1_usize] as usize;
7229            let slot1 = induction_bucket[bucket1] as usize;
7230            sa[slot1] = p1
7231                | ((usize::from(t[p1_usize - usize::from(p1 > 0)] < t[p1_usize]) as SaSint)
7232                    << (SAINT_BIT - 1));
7233            induction_bucket[bucket1] += 1;
7234        }
7235
7236        i += 2;
7237    }
7238
7239    j = start + size;
7240    while i < j {
7241        let mut p = sa[i];
7242        sa[i] = p ^ SAINT_MIN;
7243        if p > 0 {
7244            p -= 1;
7245            let p_usize = p as usize;
7246            let bucket = t[p_usize] as usize;
7247            let slot = induction_bucket[bucket] as usize;
7248            sa[slot] = p
7249                | ((usize::from(t[p_usize - usize::from(p > 0)] < t[p_usize]) as SaSint)
7250                    << (SAINT_BIT - 1));
7251            induction_bucket[bucket] += 1;
7252        }
7253        i += 1;
7254    }
7255}
7256
7257/// Internal helper: final sorting scan left to right 32s.
7258#[doc(hidden)]
7259pub fn final_sorting_scan_left_to_right_32s(
7260    t: &[SaSint],
7261    sa: &mut [SaSint],
7262    induction_bucket: &mut [SaSint],
7263    omp_block_start: FastSint,
7264    omp_block_size: FastSint,
7265) {
7266    if omp_block_size <= 0 {
7267        return;
7268    }
7269
7270    let prefetch_distance: FastSint = 64;
7271    let mut i = omp_block_start;
7272    let mut j = omp_block_start + omp_block_size - 2 * prefetch_distance - 1;
7273
7274    while i < j {
7275        let i0 = i as usize;
7276        let mut p0 = sa[i0];
7277        sa[i0] = p0 ^ SAINT_MIN;
7278        if p0 > 0 {
7279            p0 -= 1;
7280            let p0u = p0 as usize;
7281            let bucket0 = t[p0u] as usize;
7282            let slot0 = induction_bucket[bucket0] as usize;
7283            sa[slot0] = p0
7284                | ((usize::from(t[p0u - usize::from(p0 > 0)] < t[p0u]) as SaSint)
7285                    << (SAINT_BIT - 1));
7286            induction_bucket[bucket0] += 1;
7287        }
7288
7289        let i1 = (i + 1) as usize;
7290        let mut p1 = sa[i1];
7291        sa[i1] = p1 ^ SAINT_MIN;
7292        if p1 > 0 {
7293            p1 -= 1;
7294            let p1u = p1 as usize;
7295            let bucket1 = t[p1u] as usize;
7296            let slot1 = induction_bucket[bucket1] as usize;
7297            sa[slot1] = p1
7298                | ((usize::from(t[p1u - usize::from(p1 > 0)] < t[p1u]) as SaSint)
7299                    << (SAINT_BIT - 1));
7300            induction_bucket[bucket1] += 1;
7301        }
7302        i += 2;
7303    }
7304
7305    j += 2 * prefetch_distance + 1;
7306    while i < j {
7307        let iu = i as usize;
7308        let mut p = sa[iu];
7309        sa[iu] = p ^ SAINT_MIN;
7310        if p > 0 {
7311            p -= 1;
7312            let pu = p as usize;
7313            let bucket = t[pu] as usize;
7314            let slot = induction_bucket[bucket] as usize;
7315            sa[slot] = p
7316                | ((usize::from(t[pu - usize::from(p > 0)] < t[pu]) as SaSint) << (SAINT_BIT - 1));
7317            induction_bucket[bucket] += 1;
7318        }
7319        i += 1;
7320    }
7321}
7322
7323/// Internal helper: final bwt scan left to right 8u block prepare.
7324#[doc(hidden)]
7325pub fn final_bwt_scan_left_to_right_8u_block_prepare(
7326    t: &[u8],
7327    sa: &mut [SaSint],
7328    k: SaSint,
7329    buckets: &mut [SaSint],
7330    cache: &mut [ThreadCache],
7331    omp_block_start: FastSint,
7332    omp_block_size: FastSint,
7333) -> FastSint {
7334    if omp_block_size <= 0 {
7335        return 0;
7336    }
7337
7338    let k_usize = usize::try_from(k).expect("k must be non-negative");
7339    buckets[..k_usize].fill(0);
7340
7341    let start = usize::try_from(omp_block_start).expect("omp_block_start must be non-negative");
7342    let size = usize::try_from(omp_block_size).expect("omp_block_size must be non-negative");
7343    let mut count = 0usize;
7344    for i in start..start + size {
7345        let mut p = sa[i];
7346        sa[i] = p & SAINT_MAX;
7347        if p > 0 {
7348            p -= 1;
7349            let p_usize = usize::try_from(p).expect("suffix index must be non-negative");
7350            let symbol = t[p_usize] as usize;
7351            sa[i] = t[p_usize] as SaSint | SAINT_MIN;
7352            buckets[symbol] += 1;
7353            cache[count].symbol = symbol as SaSint;
7354            cache[count].index = p
7355                | ((usize::from(t[p_usize - usize::from(p > 0)] < t[p_usize]) as SaSint)
7356                    << (SAINT_BIT - 1));
7357            count += 1;
7358        }
7359    }
7360
7361    count as FastSint
7362}
7363
7364/// Internal helper: final sorting scan left to right 8u block prepare.
7365#[doc(hidden)]
7366pub fn final_sorting_scan_left_to_right_8u_block_prepare(
7367    t: &[u8],
7368    sa: &mut [SaSint],
7369    k: SaSint,
7370    buckets: &mut [SaSint],
7371    cache: &mut [ThreadCache],
7372    omp_block_start: FastSint,
7373    omp_block_size: FastSint,
7374) -> FastSint {
7375    if omp_block_size <= 0 {
7376        return 0;
7377    }
7378
7379    let k_usize = usize::try_from(k).expect("k must be non-negative");
7380    buckets[..k_usize].fill(0);
7381
7382    let start = usize::try_from(omp_block_start).expect("omp_block_start must be non-negative");
7383    let size = usize::try_from(omp_block_size).expect("omp_block_size must be non-negative");
7384    let mut count = 0usize;
7385    for i in start..start + size {
7386        let mut p = sa[i];
7387        sa[i] = p ^ SAINT_MIN;
7388        if p > 0 {
7389            p -= 1;
7390            let p_usize = usize::try_from(p).expect("suffix index must be non-negative");
7391            let symbol = t[p_usize] as usize;
7392            buckets[symbol] += 1;
7393            cache[count].symbol = symbol as SaSint;
7394            cache[count].index = p
7395                | ((usize::from(t[p_usize - usize::from(p > 0)] < t[p_usize]) as SaSint)
7396                    << (SAINT_BIT - 1));
7397            count += 1;
7398        }
7399    }
7400
7401    count as FastSint
7402}
7403
7404/// Internal helper: final order scan left to right 8u block place.
7405#[doc(hidden)]
7406pub fn final_order_scan_left_to_right_8u_block_place(
7407    sa: &mut [SaSint],
7408    buckets: &mut [SaSint],
7409    cache: &[ThreadCache],
7410    count: FastSint,
7411) {
7412    if count <= 0 {
7413        return;
7414    }
7415
7416    let count_usize = usize::try_from(count).expect("count must be non-negative");
7417    for entry in &cache[..count_usize] {
7418        let symbol = usize::try_from(entry.symbol).expect("cache symbol must be non-negative");
7419        let slot = usize::try_from(buckets[symbol]).expect("bucket slot must be non-negative");
7420        sa[slot] = entry.index;
7421        buckets[symbol] += 1;
7422    }
7423}
7424
7425/// Internal helper: final bwt aux scan left to right 8u block place.
7426#[doc(hidden)]
7427pub fn final_bwt_aux_scan_left_to_right_8u_block_place(
7428    sa: &mut [SaSint],
7429    rm: SaSint,
7430    i_out: &mut [SaSint],
7431    buckets: &mut [SaSint],
7432    cache: &[ThreadCache],
7433    count: FastSint,
7434) {
7435    if count <= 0 {
7436        return;
7437    }
7438
7439    let count_usize = usize::try_from(count).expect("count must be non-negative");
7440    for entry in &cache[..count_usize] {
7441        let symbol = usize::try_from(entry.symbol).expect("cache symbol must be non-negative");
7442        let slot = usize::try_from(buckets[symbol]).expect("bucket slot must be non-negative");
7443        sa[slot] = entry.index;
7444        buckets[symbol] += 1;
7445        if (entry.index & rm) == 0 {
7446            let sample_index = usize::try_from((entry.index & SAINT_MAX) / (rm + 1))
7447                .expect("sample index must be non-negative");
7448            i_out[sample_index] = buckets[symbol];
7449        }
7450    }
7451}
7452
7453/// Internal helper: final sorting scan left to right 32s block gather.
7454#[doc(hidden)]
7455pub fn final_sorting_scan_left_to_right_32s_block_gather(
7456    t: &[SaSint],
7457    sa: &mut [SaSint],
7458    cache: &mut [ThreadCache],
7459    omp_block_start: FastSint,
7460    omp_block_size: FastSint,
7461) {
7462    if omp_block_size <= 0 {
7463        return;
7464    }
7465    let start = usize::try_from(omp_block_start).expect("omp_block_start must be non-negative");
7466    let size = usize::try_from(omp_block_size).expect("omp_block_size must be non-negative");
7467    for offset in 0..size {
7468        let i = start + offset;
7469        let mut symbol = SAINT_MIN;
7470        let mut p = sa[i];
7471        sa[i] = p ^ SAINT_MIN;
7472        if p > 0 {
7473            p -= 1;
7474            let p_usize = p as usize;
7475            cache[offset].index = p
7476                | ((usize::from(t[p_usize - usize::from(p > 0)] < t[p_usize]) as SaSint)
7477                    << (SAINT_BIT - 1));
7478            symbol = t[p_usize];
7479        }
7480        cache[offset].symbol = symbol;
7481    }
7482}
7483
7484/// Internal helper: final sorting scan left to right 32s block sort.
7485#[doc(hidden)]
7486pub fn final_sorting_scan_left_to_right_32s_block_sort(
7487    t: &[SaSint],
7488    induction_bucket: &mut [SaSint],
7489    cache: &mut [ThreadCache],
7490    omp_block_start: FastSint,
7491    omp_block_size: FastSint,
7492) {
7493    if omp_block_size <= 0 {
7494        return;
7495    }
7496    let prefetch_distance = 64usize;
7497    let start = omp_block_start as usize;
7498    let block_end = start + omp_block_size as usize;
7499    let mut i = start;
7500    let mut j = start + (omp_block_size as usize).saturating_sub(prefetch_distance + 1);
7501
7502    while i < j {
7503        let ci = i - start;
7504        let v0 = cache[ci].symbol;
7505        if v0 >= 0 {
7506            let bucket_index0 = v0 as usize;
7507            cache[ci].symbol = induction_bucket[bucket_index0];
7508            induction_bucket[bucket_index0] += 1;
7509            if cache[ci].symbol < block_end as SaSint {
7510                let ni = cache[ci].symbol as usize;
7511                let cni = ni - start;
7512                let mut np = cache[ci].index;
7513                cache[ci].index = np ^ SAINT_MIN;
7514                if np > 0 {
7515                    np -= 1;
7516                    let np_usize = np as usize;
7517                    cache[cni].index = np
7518                        | ((usize::from(t[np_usize - usize::from(np > 0)] < t[np_usize])
7519                            as SaSint)
7520                            << (SAINT_BIT - 1));
7521                    cache[cni].symbol = t[np_usize];
7522                }
7523            }
7524        }
7525
7526        let i1 = i + 1;
7527        let ci1 = i1 - start;
7528        let v1 = cache[ci1].symbol;
7529        if v1 >= 0 {
7530            let bucket_index1 = v1 as usize;
7531            cache[ci1].symbol = induction_bucket[bucket_index1];
7532            induction_bucket[bucket_index1] += 1;
7533            if cache[ci1].symbol < block_end as SaSint {
7534                let ni = cache[ci1].symbol as usize;
7535                let cni = ni - start;
7536                let mut np = cache[ci1].index;
7537                cache[ci1].index = np ^ SAINT_MIN;
7538                if np > 0 {
7539                    np -= 1;
7540                    let np_usize = np as usize;
7541                    cache[cni].index = np
7542                        | ((usize::from(t[np_usize - usize::from(np > 0)] < t[np_usize])
7543                            as SaSint)
7544                            << (SAINT_BIT - 1));
7545                    cache[cni].symbol = t[np_usize];
7546                }
7547            }
7548        }
7549
7550        i += 2;
7551    }
7552
7553    j = block_end;
7554    while i < j {
7555        let ci = i - start;
7556        let v = cache[ci].symbol;
7557        if v >= 0 {
7558            let bucket_index = v as usize;
7559            cache[ci].symbol = induction_bucket[bucket_index];
7560            induction_bucket[bucket_index] += 1;
7561            if cache[ci].symbol < block_end as SaSint {
7562                let ni = cache[ci].symbol as usize;
7563                let cni = ni - start;
7564                let mut np = cache[ci].index;
7565                cache[ci].index = np ^ SAINT_MIN;
7566                if np > 0 {
7567                    np -= 1;
7568                    let np_usize = np as usize;
7569                    cache[cni].index = np
7570                        | ((usize::from(t[np_usize - usize::from(np > 0)] < t[np_usize])
7571                            as SaSint)
7572                            << (SAINT_BIT - 1));
7573                    cache[cni].symbol = t[np_usize];
7574                }
7575            }
7576        }
7577        i += 1;
7578    }
7579}
7580
7581/// Internal helper: final bwt scan left to right 8u block (OpenMP variant).
7582#[doc(hidden)]
7583pub fn final_bwt_scan_left_to_right_8u_block_omp(
7584    t: &[u8],
7585    sa: &mut [SaSint],
7586    k: SaSint,
7587    induction_bucket: &mut [SaSint],
7588    block_start: FastSint,
7589    block_size: FastSint,
7590    threads: SaSint,
7591    thread_state: &mut [ThreadState],
7592) {
7593    if block_size <= 0 {
7594        return;
7595    }
7596
7597    let k_usize = usize::try_from(k).expect("k must be non-negative");
7598    let block_size_usize = usize::try_from(block_size).expect("block_size must be non-negative");
7599    let omp_num_threads = if threads > 1 && block_size_usize >= 64 * k_usize.max(256) {
7600        usize::try_from(threads)
7601            .expect("threads must be non-negative")
7602            .min(thread_state.len())
7603            .max(1)
7604    } else {
7605        1
7606    };
7607
7608    if omp_num_threads == 1 {
7609        final_bwt_scan_left_to_right_8u(t, sa, induction_bucket, block_start, block_size);
7610        return;
7611    }
7612
7613    let block_start_usize = usize::try_from(block_start).expect("block_start must be non-negative");
7614    let omp_block_stride = (block_size_usize / omp_num_threads) & !15usize;
7615    for (thread_num, state) in thread_state.iter_mut().take(omp_num_threads).enumerate() {
7616        let relative_start = thread_num * omp_block_stride;
7617        let size = if thread_num + 1 < omp_num_threads {
7618            omp_block_stride
7619        } else {
7620            block_size_usize - relative_start
7621        };
7622        state.count = final_bwt_scan_left_to_right_8u_block_prepare(
7623            t,
7624            sa,
7625            k,
7626            &mut state.buckets,
7627            &mut state.cache,
7628            (block_start_usize + relative_start) as FastSint,
7629            size as FastSint,
7630        );
7631    }
7632
7633    for state in thread_state.iter_mut().take(omp_num_threads) {
7634        for (c, bucket) in induction_bucket.iter_mut().take(k_usize).enumerate() {
7635            let a = *bucket;
7636            let b = state.buckets[c];
7637            *bucket = a + b;
7638            state.buckets[c] = a;
7639        }
7640    }
7641
7642    for state in thread_state.iter_mut().take(omp_num_threads) {
7643        final_order_scan_left_to_right_8u_block_place(
7644            sa,
7645            &mut state.buckets,
7646            &state.cache,
7647            state.count,
7648        );
7649    }
7650}
7651
7652/// Internal helper: final bwt aux scan left to right 8u block (OpenMP variant).
7653#[doc(hidden)]
7654pub fn final_bwt_aux_scan_left_to_right_8u_block_omp(
7655    t: &[u8],
7656    sa: &mut [SaSint],
7657    k: SaSint,
7658    rm: SaSint,
7659    i_out: &mut [SaSint],
7660    induction_bucket: &mut [SaSint],
7661    block_start: FastSint,
7662    block_size: FastSint,
7663    threads: SaSint,
7664    thread_state: &mut [ThreadState],
7665) {
7666    if block_size <= 0 {
7667        return;
7668    }
7669
7670    let k_usize = usize::try_from(k).expect("k must be non-negative");
7671    let block_size_usize = usize::try_from(block_size).expect("block_size must be non-negative");
7672    let omp_num_threads = if threads > 1 && block_size_usize >= 64 * k_usize.max(256) {
7673        usize::try_from(threads)
7674            .expect("threads must be non-negative")
7675            .min(thread_state.len())
7676            .max(1)
7677    } else {
7678        1
7679    };
7680
7681    if omp_num_threads == 1 {
7682        final_bwt_aux_scan_left_to_right_8u(
7683            t,
7684            sa,
7685            rm,
7686            i_out,
7687            induction_bucket,
7688            block_start,
7689            block_size,
7690        );
7691        return;
7692    }
7693
7694    let block_start_usize = usize::try_from(block_start).expect("block_start must be non-negative");
7695    let omp_block_stride = (block_size_usize / omp_num_threads) & !15usize;
7696    for (thread_num, state) in thread_state.iter_mut().take(omp_num_threads).enumerate() {
7697        let relative_start = thread_num * omp_block_stride;
7698        let size = if thread_num + 1 < omp_num_threads {
7699            omp_block_stride
7700        } else {
7701            block_size_usize - relative_start
7702        };
7703        state.count = final_bwt_scan_left_to_right_8u_block_prepare(
7704            t,
7705            sa,
7706            k,
7707            &mut state.buckets,
7708            &mut state.cache,
7709            (block_start_usize + relative_start) as FastSint,
7710            size as FastSint,
7711        );
7712    }
7713
7714    for state in thread_state.iter_mut().take(omp_num_threads) {
7715        for (c, bucket) in induction_bucket.iter_mut().take(k_usize).enumerate() {
7716            let a = *bucket;
7717            let b = state.buckets[c];
7718            *bucket = a + b;
7719            state.buckets[c] = a;
7720        }
7721    }
7722
7723    for state in thread_state.iter_mut().take(omp_num_threads) {
7724        final_bwt_aux_scan_left_to_right_8u_block_place(
7725            sa,
7726            rm,
7727            i_out,
7728            &mut state.buckets,
7729            &state.cache,
7730            state.count,
7731        );
7732    }
7733}
7734
7735/// Internal helper: final sorting scan left to right 8u block (OpenMP variant).
7736#[doc(hidden)]
7737pub fn final_sorting_scan_left_to_right_8u_block_omp(
7738    t: &[u8],
7739    sa: &mut [SaSint],
7740    k: SaSint,
7741    induction_bucket: &mut [SaSint],
7742    block_start: FastSint,
7743    block_size: FastSint,
7744    threads: SaSint,
7745    thread_state: &mut [ThreadState],
7746) {
7747    if block_size <= 0 {
7748        return;
7749    }
7750
7751    let k_usize = usize::try_from(k).expect("k must be non-negative");
7752    let block_size_usize = usize::try_from(block_size).expect("block_size must be non-negative");
7753    let omp_num_threads = if threads > 1 && block_size_usize >= 64 * k_usize.max(256) {
7754        usize::try_from(threads)
7755            .expect("threads must be non-negative")
7756            .min(thread_state.len())
7757            .max(1)
7758    } else {
7759        1
7760    };
7761
7762    if omp_num_threads == 1 {
7763        final_sorting_scan_left_to_right_8u(t, sa, induction_bucket, block_start, block_size);
7764        return;
7765    }
7766
7767    let block_start_usize = usize::try_from(block_start).expect("block_start must be non-negative");
7768    let omp_block_stride = (block_size_usize / omp_num_threads) & !15usize;
7769    for (thread_num, state) in thread_state.iter_mut().take(omp_num_threads).enumerate() {
7770        let relative_start = thread_num * omp_block_stride;
7771        let size = if thread_num + 1 < omp_num_threads {
7772            omp_block_stride
7773        } else {
7774            block_size_usize - relative_start
7775        };
7776        state.count = final_sorting_scan_left_to_right_8u_block_prepare(
7777            t,
7778            sa,
7779            k,
7780            &mut state.buckets,
7781            &mut state.cache,
7782            (block_start_usize + relative_start) as FastSint,
7783            size as FastSint,
7784        );
7785    }
7786
7787    for state in thread_state.iter_mut().take(omp_num_threads) {
7788        for (c, bucket) in induction_bucket.iter_mut().take(k_usize).enumerate() {
7789            let a = *bucket;
7790            let b = state.buckets[c];
7791            *bucket = a + b;
7792            state.buckets[c] = a;
7793        }
7794    }
7795
7796    for state in thread_state.iter_mut().take(omp_num_threads) {
7797        final_order_scan_left_to_right_8u_block_place(
7798            sa,
7799            &mut state.buckets,
7800            &state.cache,
7801            state.count,
7802        );
7803    }
7804}
7805
7806/// Internal helper: final sorting scan left to right 32s block (OpenMP variant).
7807#[doc(hidden)]
7808pub fn final_sorting_scan_left_to_right_32s_block_omp(
7809    t: &[SaSint],
7810    sa: &mut [SaSint],
7811    buckets: &mut [SaSint],
7812    cache: &mut [ThreadCache],
7813    block_start: FastSint,
7814    block_size: FastSint,
7815    threads: SaSint,
7816) {
7817    if threads <= 1 || block_size < 16_384 {
7818        final_sorting_scan_left_to_right_32s(t, sa, buckets, block_start, block_size);
7819        return;
7820    }
7821
7822    final_sorting_scan_left_to_right_32s_block_gather(t, sa, cache, block_start, block_size);
7823    final_sorting_scan_left_to_right_32s_block_sort(t, buckets, cache, block_start, block_size);
7824    let block_size_usize = usize::try_from(block_size).expect("block_size must be non-negative");
7825    let threads_usize = usize::try_from(threads.max(1)).expect("threads must be positive");
7826    let omp_num_threads = threads_usize.min(block_size_usize);
7827    let omp_block_stride = (block_size_usize / omp_num_threads) & !15usize;
7828    for omp_thread_num in 0..omp_num_threads {
7829        let omp_block_start = omp_thread_num * omp_block_stride;
7830        let omp_block_size = if omp_thread_num + 1 < omp_num_threads {
7831            omp_block_stride
7832        } else {
7833            block_size_usize - omp_block_start
7834        };
7835        compact_and_place_cached_suffixes(
7836            sa,
7837            cache,
7838            omp_block_start as FastSint,
7839            omp_block_size as FastSint,
7840        );
7841    }
7842}
7843
7844/// Internal helper: final bwt scan left to right 8u (OpenMP variant).
7845#[doc(hidden)]
7846pub fn final_bwt_scan_left_to_right_8u_omp(
7847    t: &[u8],
7848    sa: &mut [SaSint],
7849    n: FastSint,
7850    k: SaSint,
7851    induction_bucket: &mut [SaSint],
7852    threads: SaSint,
7853    thread_state: &mut [ThreadState],
7854) {
7855    let n_usize = usize::try_from(n).expect("n must be non-negative");
7856    let last = n_usize - 1;
7857    let bucket = t[last] as usize;
7858    let slot = usize::try_from(induction_bucket[bucket]).expect("bucket slot must be non-negative");
7859    sa[slot] =
7860        (n as SaSint - 1) | ((usize::from(t[last - 1] < t[last]) as SaSint) << (SAINT_BIT - 1));
7861    induction_bucket[bucket] += 1;
7862
7863    if threads == 1 || n < 65_536 {
7864        final_bwt_scan_left_to_right_8u(t, sa, induction_bucket, 0, n);
7865        return;
7866    }
7867
7868    let mut block_start = 0usize;
7869    while block_start < n_usize {
7870        if sa[block_start] == 0 {
7871            block_start += 1;
7872        } else {
7873            let threads_usize = usize::try_from(threads)
7874                .expect("threads must be non-negative")
7875                .min(thread_state.len())
7876                .max(1);
7877            let max_span = threads_usize * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * threads_usize);
7878            let block_max_end = (block_start + max_span).min(n_usize);
7879            let mut block_end = block_start + 1;
7880            while block_end < block_max_end && sa[block_end] != 0 {
7881                block_end += 1;
7882            }
7883            let size = block_end - block_start;
7884
7885            if size < 32 {
7886                final_bwt_scan_left_to_right_8u(
7887                    t,
7888                    sa,
7889                    induction_bucket,
7890                    block_start as FastSint,
7891                    size as FastSint,
7892                );
7893            } else {
7894                final_bwt_scan_left_to_right_8u_block_omp(
7895                    t,
7896                    sa,
7897                    k,
7898                    induction_bucket,
7899                    block_start as FastSint,
7900                    size as FastSint,
7901                    threads,
7902                    thread_state,
7903                );
7904            }
7905            block_start = block_end;
7906        }
7907    }
7908}
7909
7910/// Internal helper: final bwt aux scan left to right 8u (OpenMP variant).
7911#[doc(hidden)]
7912pub fn final_bwt_aux_scan_left_to_right_8u_omp(
7913    t: &[u8],
7914    sa: &mut [SaSint],
7915    n: FastSint,
7916    k: SaSint,
7917    rm: SaSint,
7918    i_out: &mut [SaSint],
7919    induction_bucket: &mut [SaSint],
7920    threads: SaSint,
7921    thread_state: &mut [ThreadState],
7922) {
7923    let n_usize = usize::try_from(n).expect("n must be non-negative");
7924    let last = n_usize - 1;
7925    let bucket = t[last] as usize;
7926    let slot = usize::try_from(induction_bucket[bucket]).expect("bucket slot must be non-negative");
7927    sa[slot] =
7928        (n as SaSint - 1) | ((usize::from(t[last - 1] < t[last]) as SaSint) << (SAINT_BIT - 1));
7929    induction_bucket[bucket] += 1;
7930    if (((n as SaSint) - 1) & rm) == 0 {
7931        i_out[last / usize::try_from(rm + 1).expect("rm must allow positive step")] =
7932            induction_bucket[bucket];
7933    }
7934
7935    if threads == 1 || n < 65_536 {
7936        final_bwt_aux_scan_left_to_right_8u(t, sa, rm, i_out, induction_bucket, 0, n);
7937        return;
7938    }
7939
7940    let mut block_start = 0usize;
7941    while block_start < n_usize {
7942        if sa[block_start] == 0 {
7943            block_start += 1;
7944        } else {
7945            let threads_usize = usize::try_from(threads)
7946                .expect("threads must be non-negative")
7947                .min(thread_state.len())
7948                .max(1);
7949            let max_span = threads_usize * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * threads_usize);
7950            let block_max_end = (block_start + max_span).min(n_usize);
7951            let mut block_end = block_start + 1;
7952            while block_end < block_max_end && sa[block_end] != 0 {
7953                block_end += 1;
7954            }
7955            let size = block_end - block_start;
7956
7957            if size < 32 {
7958                final_bwt_aux_scan_left_to_right_8u(
7959                    t,
7960                    sa,
7961                    rm,
7962                    i_out,
7963                    induction_bucket,
7964                    block_start as FastSint,
7965                    size as FastSint,
7966                );
7967            } else {
7968                final_bwt_aux_scan_left_to_right_8u_block_omp(
7969                    t,
7970                    sa,
7971                    k,
7972                    rm,
7973                    i_out,
7974                    induction_bucket,
7975                    block_start as FastSint,
7976                    size as FastSint,
7977                    threads,
7978                    thread_state,
7979                );
7980            }
7981            block_start = block_end;
7982        }
7983    }
7984}
7985
7986/// Internal helper: final sorting scan left to right 8u (OpenMP variant).
7987#[doc(hidden)]
7988pub fn final_sorting_scan_left_to_right_8u_omp(
7989    t: &[u8],
7990    sa: &mut [SaSint],
7991    n: FastSint,
7992    k: SaSint,
7993    induction_bucket: &mut [SaSint],
7994    threads: SaSint,
7995    thread_state: &mut [ThreadState],
7996) {
7997    let n_usize = usize::try_from(n).expect("n must be non-negative");
7998    let last = n_usize - 1;
7999    let bucket = t[last] as usize;
8000    let slot = usize::try_from(induction_bucket[bucket]).expect("bucket slot must be non-negative");
8001    sa[slot] =
8002        (n as SaSint - 1) | ((usize::from(t[last - 1] < t[last]) as SaSint) << (SAINT_BIT - 1));
8003    induction_bucket[bucket] += 1;
8004
8005    if threads == 1 || n < 65_536 {
8006        final_sorting_scan_left_to_right_8u(t, sa, induction_bucket, 0, n);
8007        return;
8008    }
8009
8010    let mut block_start = 0usize;
8011    while block_start < n_usize {
8012        if sa[block_start] == 0 {
8013            block_start += 1;
8014        } else {
8015            let threads_usize = usize::try_from(threads)
8016                .expect("threads must be non-negative")
8017                .min(thread_state.len())
8018                .max(1);
8019            let max_span = threads_usize * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * threads_usize);
8020            let block_max_end = (block_start + max_span).min(n_usize);
8021            let mut block_end = block_start + 1;
8022            while block_end < block_max_end && sa[block_end] != 0 {
8023                block_end += 1;
8024            }
8025            let size = block_end - block_start;
8026
8027            if size < 32 {
8028                final_sorting_scan_left_to_right_8u(
8029                    t,
8030                    sa,
8031                    induction_bucket,
8032                    block_start as FastSint,
8033                    size as FastSint,
8034                );
8035            } else {
8036                final_sorting_scan_left_to_right_8u_block_omp(
8037                    t,
8038                    sa,
8039                    k,
8040                    induction_bucket,
8041                    block_start as FastSint,
8042                    size as FastSint,
8043                    threads,
8044                    thread_state,
8045                );
8046            }
8047            block_start = block_end;
8048        }
8049    }
8050}
8051
8052/// Internal helper: final sorting scan left to right 32s (OpenMP variant).
8053#[doc(hidden)]
8054pub fn final_sorting_scan_left_to_right_32s_omp(
8055    t: &[SaSint],
8056    sa: &mut [SaSint],
8057    n: SaSint,
8058    induction_bucket: &mut [SaSint],
8059    threads: SaSint,
8060    thread_state: &mut [ThreadState],
8061) {
8062    let n_usize = usize::try_from(n).expect("n must be non-negative");
8063    let last = n_usize - 1;
8064    let bucket = usize::try_from(t[last]).expect("bucket symbol must be non-negative");
8065    let slot = usize::try_from(induction_bucket[bucket]).expect("bucket slot must be non-negative");
8066    sa[slot] = (n - 1) | ((usize::from(t[last - 1] < t[last]) as SaSint) << (SAINT_BIT - 1));
8067    induction_bucket[bucket] += 1;
8068
8069    if threads == 1 || n < 65_536 {
8070        final_sorting_scan_left_to_right_32s(t, sa, induction_bucket, 0, n as FastSint);
8071        return;
8072    }
8073
8074    if thread_state.is_empty() {
8075        final_sorting_scan_left_to_right_32s(t, sa, induction_bucket, 0, n as FastSint);
8076        return;
8077    }
8078
8079    let threads_usize = usize::try_from(threads)
8080        .expect("threads must be non-negative")
8081        .max(1);
8082    let mut cache = vec![ThreadCache::default(); threads_usize * LIBSAIS_PER_THREAD_CACHE_SIZE];
8083    let mut block_start = 0usize;
8084    while block_start < n_usize {
8085        let block_end = (block_start + threads_usize * LIBSAIS_PER_THREAD_CACHE_SIZE).min(n_usize);
8086        final_sorting_scan_left_to_right_32s_block_omp(
8087            t,
8088            sa,
8089            induction_bucket,
8090            &mut cache,
8091            block_start as FastSint,
8092            (block_end - block_start) as FastSint,
8093            threads,
8094        );
8095        block_start = block_end;
8096    }
8097}
8098
8099/// Internal helper: final bwt scan right to left 8u.
8100#[doc(hidden)]
8101pub fn final_bwt_scan_right_to_left_8u(
8102    t: &[u8],
8103    sa: &mut [SaSint],
8104    induction_bucket: &mut [SaSint],
8105    omp_block_start: FastSint,
8106    omp_block_size: FastSint,
8107) -> SaSint {
8108    if omp_block_size <= 0 {
8109        return -1;
8110    }
8111
8112    let mut index = -1;
8113
8114    let start =
8115        usize::try_from(omp_block_start).expect("omp_block_start must be non-negative") as FastSint;
8116    let mut i = omp_block_start + omp_block_size - 1;
8117    let mut j = start + 1;
8118    while i >= j {
8119        let i0 = usize::try_from(i).expect("loop index must be non-negative");
8120        let i1 = usize::try_from(i - 1).expect("loop index must be non-negative");
8121
8122        let mut p0 = sa[i0];
8123        if p0 == 0 {
8124            index = i0 as SaSint;
8125        }
8126        sa[i0] = p0 & SAINT_MAX;
8127        if p0 > 0 {
8128            p0 -= 1;
8129            let p0_usize = usize::try_from(p0).expect("suffix index must be non-negative");
8130            let c0 = t[p0_usize - usize::from(p0 > 0)] as SaSint;
8131            let c1 = t[p0_usize] as SaSint;
8132            sa[i0] = c1;
8133            induction_bucket[c1 as usize] -= 1;
8134            let slot = usize::try_from(induction_bucket[c1 as usize])
8135                .expect("bucket slot must be non-negative");
8136            let marked = c0 | SAINT_MIN;
8137            sa[slot] = if c0 <= c1 { p0 } else { marked };
8138        }
8139
8140        let mut p1 = sa[i1];
8141        if p1 == 0 {
8142            index = i1 as SaSint;
8143        }
8144        sa[i1] = p1 & SAINT_MAX;
8145        if p1 > 0 {
8146            p1 -= 1;
8147            let p1_usize = usize::try_from(p1).expect("suffix index must be non-negative");
8148            let c0 = t[p1_usize - usize::from(p1 > 0)] as SaSint;
8149            let c1 = t[p1_usize] as SaSint;
8150            sa[i1] = c1;
8151            induction_bucket[c1 as usize] -= 1;
8152            let slot = usize::try_from(induction_bucket[c1 as usize])
8153                .expect("bucket slot must be non-negative");
8154            let marked = c0 | SAINT_MIN;
8155            sa[slot] = if c0 <= c1 { p1 } else { marked };
8156        }
8157
8158        i -= 2;
8159    }
8160
8161    j -= 1;
8162    while i >= j {
8163        let idx = usize::try_from(i).expect("loop index must be non-negative");
8164        let mut p = sa[idx];
8165        if p == 0 {
8166            index = idx as SaSint;
8167        }
8168        sa[idx] = p & SAINT_MAX;
8169        if p > 0 {
8170            p -= 1;
8171            let p_usize = usize::try_from(p).expect("suffix index must be non-negative");
8172            let c0 = t[p_usize - usize::from(p > 0)] as SaSint;
8173            let c1 = t[p_usize] as SaSint;
8174            sa[idx] = c1;
8175            induction_bucket[c1 as usize] -= 1;
8176            let slot = usize::try_from(induction_bucket[c1 as usize])
8177                .expect("bucket slot must be non-negative");
8178            let marked = c0 | SAINT_MIN;
8179            sa[slot] = if c0 <= c1 { p } else { marked };
8180        }
8181
8182        i -= 1;
8183    }
8184
8185    index
8186}
8187
8188/// Internal helper: final bwt aux scan right to left 8u.
8189#[doc(hidden)]
8190pub fn final_bwt_aux_scan_right_to_left_8u(
8191    t: &[u8],
8192    sa: &mut [SaSint],
8193    rm: SaSint,
8194    i_out: &mut [SaSint],
8195    induction_bucket: &mut [SaSint],
8196    omp_block_start: FastSint,
8197    omp_block_size: FastSint,
8198) {
8199    if omp_block_size <= 0 {
8200        return;
8201    }
8202
8203    let start =
8204        usize::try_from(omp_block_start).expect("omp_block_start must be non-negative") as FastSint;
8205    let mut i = omp_block_start + omp_block_size - 1;
8206    let mut j = start + 1;
8207    while i >= j {
8208        let i0 = usize::try_from(i).expect("loop index must be non-negative");
8209        let i1 = usize::try_from(i - 1).expect("loop index must be non-negative");
8210
8211        let mut p0 = sa[i0];
8212        sa[i0] = p0 & SAINT_MAX;
8213        if p0 > 0 {
8214            p0 -= 1;
8215            let p0_usize = usize::try_from(p0).expect("suffix index must be non-negative");
8216            let c0 = t[p0_usize - usize::from(p0 > 0)] as SaSint;
8217            let c1 = t[p0_usize] as SaSint;
8218            sa[i0] = c1;
8219            induction_bucket[c1 as usize] -= 1;
8220            let slot = usize::try_from(induction_bucket[c1 as usize])
8221                .expect("bucket slot must be non-negative");
8222            let marked = c0 | SAINT_MIN;
8223            sa[slot] = if c0 <= c1 { p0 } else { marked };
8224            if (p0 & rm) == 0 {
8225                let out_idx =
8226                    usize::try_from(p0 / (rm + 1)).expect("sample index must be non-negative");
8227                i_out[out_idx] = induction_bucket[t[p0_usize] as usize] + 1;
8228            }
8229        }
8230
8231        let mut p1 = sa[i1];
8232        sa[i1] = p1 & SAINT_MAX;
8233        if p1 > 0 {
8234            p1 -= 1;
8235            let p1_usize = usize::try_from(p1).expect("suffix index must be non-negative");
8236            let c0 = t[p1_usize - usize::from(p1 > 0)] as SaSint;
8237            let c1 = t[p1_usize] as SaSint;
8238            sa[i1] = c1;
8239            induction_bucket[c1 as usize] -= 1;
8240            let slot = usize::try_from(induction_bucket[c1 as usize])
8241                .expect("bucket slot must be non-negative");
8242            let marked = c0 | SAINT_MIN;
8243            sa[slot] = if c0 <= c1 { p1 } else { marked };
8244            if (p1 & rm) == 0 {
8245                let out_idx =
8246                    usize::try_from(p1 / (rm + 1)).expect("sample index must be non-negative");
8247                i_out[out_idx] = induction_bucket[t[p1_usize] as usize] + 1;
8248            }
8249        }
8250
8251        i -= 2;
8252    }
8253
8254    j -= 1;
8255    while i >= j {
8256        let idx = usize::try_from(i).expect("loop index must be non-negative");
8257        let mut p = sa[idx];
8258        sa[idx] = p & SAINT_MAX;
8259        if p > 0 {
8260            p -= 1;
8261            let p_usize = usize::try_from(p).expect("suffix index must be non-negative");
8262            let c0 = t[p_usize - usize::from(p > 0)] as SaSint;
8263            let c1 = t[p_usize] as SaSint;
8264            sa[idx] = c1;
8265            induction_bucket[c1 as usize] -= 1;
8266            let slot = usize::try_from(induction_bucket[c1 as usize])
8267                .expect("bucket slot must be non-negative");
8268            let marked = c0 | SAINT_MIN;
8269            sa[slot] = if c0 <= c1 { p } else { marked };
8270            if (p & rm) == 0 {
8271                let out_idx =
8272                    usize::try_from(p / (rm + 1)).expect("sample index must be non-negative");
8273                i_out[out_idx] = induction_bucket[t[p_usize] as usize] + 1;
8274            }
8275        }
8276
8277        i -= 1;
8278    }
8279}
8280
8281/// Internal helper: final sorting scan right to left 8u.
8282#[doc(hidden)]
8283pub fn final_sorting_scan_right_to_left_8u(
8284    t: &[u8],
8285    sa: &mut [SaSint],
8286    induction_bucket: &mut [SaSint],
8287    omp_block_start: FastSint,
8288    omp_block_size: FastSint,
8289) {
8290    if omp_block_size <= 0 {
8291        return;
8292    }
8293
8294    let prefetch_distance = 64usize;
8295    let start = usize::try_from(omp_block_start).expect("omp_block_start must be non-negative");
8296    let size = usize::try_from(omp_block_size).expect("omp_block_size must be non-negative");
8297    let mut i = start + size - 1;
8298    let mut j = start + prefetch_distance + 1;
8299
8300    while i >= j {
8301        let mut p0 = sa[i];
8302        sa[i] = p0 & SAINT_MAX;
8303        if p0 > 0 {
8304            p0 -= 1;
8305            let p0_usize = p0 as usize;
8306            let bucket0 = t[p0_usize] as usize;
8307            induction_bucket[bucket0] -= 1;
8308            let slot0 = induction_bucket[bucket0] as usize;
8309            sa[slot0] = p0
8310                | ((usize::from(t[p0_usize - usize::from(p0 > 0)] > t[p0_usize]) as SaSint)
8311                    << (SAINT_BIT - 1));
8312        }
8313
8314        let mut p1 = sa[i - 1];
8315        sa[i - 1] = p1 & SAINT_MAX;
8316        if p1 > 0 {
8317            p1 -= 1;
8318            let p1_usize = p1 as usize;
8319            let bucket1 = t[p1_usize] as usize;
8320            induction_bucket[bucket1] -= 1;
8321            let slot1 = induction_bucket[bucket1] as usize;
8322            sa[slot1] = p1
8323                | ((usize::from(t[p1_usize - usize::from(p1 > 0)] > t[p1_usize]) as SaSint)
8324                    << (SAINT_BIT - 1));
8325        }
8326
8327        i -= 2;
8328    }
8329
8330    j -= prefetch_distance + 1;
8331    while i >= j {
8332        let mut p = sa[i];
8333        sa[i] = p & SAINT_MAX;
8334        if p > 0 {
8335            p -= 1;
8336            let p_usize = p as usize;
8337            let bucket = t[p_usize] as usize;
8338            induction_bucket[bucket] -= 1;
8339            let slot = induction_bucket[bucket] as usize;
8340            sa[slot] = p
8341                | ((usize::from(t[p_usize - usize::from(p > 0)] > t[p_usize]) as SaSint)
8342                    << (SAINT_BIT - 1));
8343        }
8344
8345        if i == 0 {
8346            break;
8347        }
8348        i -= 1;
8349    }
8350}
8351
8352/// Internal helper: final gsa scan right to left 8u.
8353#[doc(hidden)]
8354pub fn final_gsa_scan_right_to_left_8u(
8355    t: &[u8],
8356    sa: &mut [SaSint],
8357    induction_bucket: &mut [SaSint],
8358    omp_block_start: FastSint,
8359    omp_block_size: FastSint,
8360) {
8361    if omp_block_size <= 0 {
8362        return;
8363    }
8364
8365    let start = usize::try_from(omp_block_start).expect("omp_block_start must be non-negative");
8366    let size = usize::try_from(omp_block_size).expect("omp_block_size must be non-negative");
8367    let mut i = start + size;
8368    while i > start {
8369        i -= 1;
8370        let mut p = sa[i];
8371        sa[i] = p & SAINT_MAX;
8372        if p > 0 {
8373            let p_usize = usize::try_from(p).expect("suffix index must be non-negative");
8374            if t[p_usize - 1] > 0 {
8375                p -= 1;
8376                let bucket =
8377                    t[usize::try_from(p).expect("suffix index must be non-negative")] as usize;
8378                induction_bucket[bucket] -= 1;
8379                let slot = usize::try_from(induction_bucket[bucket])
8380                    .expect("bucket slot must be non-negative");
8381                let p_usize = usize::try_from(p).expect("suffix index must be non-negative");
8382                sa[slot] = p
8383                    | ((usize::from(t[p_usize - usize::from(p > 0)] > t[p_usize]) as SaSint)
8384                        << (SAINT_BIT - 1));
8385            }
8386        }
8387    }
8388}
8389
8390/// Internal helper: final sorting scan right to left 32s.
8391#[doc(hidden)]
8392pub fn final_sorting_scan_right_to_left_32s(
8393    t: &[SaSint],
8394    sa: &mut [SaSint],
8395    induction_bucket: &mut [SaSint],
8396    omp_block_start: FastSint,
8397    omp_block_size: FastSint,
8398) {
8399    if omp_block_size <= 0 {
8400        return;
8401    }
8402
8403    let prefetch_distance: FastSint = 64;
8404    let mut i = omp_block_start + omp_block_size - 1;
8405    let mut j = omp_block_start + 2 * prefetch_distance + 1;
8406
8407    while i >= j {
8408        let i0 = i as usize;
8409        let mut p0 = sa[i0];
8410        sa[i0] = p0 & SAINT_MAX;
8411        if p0 > 0 {
8412            p0 -= 1;
8413            let p0u = p0 as usize;
8414            let bucket0 = t[p0u] as usize;
8415            induction_bucket[bucket0] -= 1;
8416            let slot0 = induction_bucket[bucket0] as usize;
8417            sa[slot0] = p0
8418                | ((usize::from(t[p0u - usize::from(p0 > 0)] > t[p0u]) as SaSint)
8419                    << (SAINT_BIT - 1));
8420        }
8421
8422        let i1 = (i - 1) as usize;
8423        let mut p1 = sa[i1];
8424        sa[i1] = p1 & SAINT_MAX;
8425        if p1 > 0 {
8426            p1 -= 1;
8427            let p1u = p1 as usize;
8428            let bucket1 = t[p1u] as usize;
8429            induction_bucket[bucket1] -= 1;
8430            let slot1 = induction_bucket[bucket1] as usize;
8431            sa[slot1] = p1
8432                | ((usize::from(t[p1u - usize::from(p1 > 0)] > t[p1u]) as SaSint)
8433                    << (SAINT_BIT - 1));
8434        }
8435        i -= 2;
8436    }
8437
8438    j -= 2 * prefetch_distance + 1;
8439    while i >= j {
8440        let iu = i as usize;
8441        let mut p = sa[iu];
8442        sa[iu] = p & SAINT_MAX;
8443        if p > 0 {
8444            p -= 1;
8445            let pu = p as usize;
8446            let bucket = t[pu] as usize;
8447            induction_bucket[bucket] -= 1;
8448            let slot = induction_bucket[bucket] as usize;
8449            sa[slot] = p
8450                | ((usize::from(t[pu - usize::from(p > 0)] > t[pu]) as SaSint) << (SAINT_BIT - 1));
8451        }
8452        i -= 1;
8453    }
8454}
8455
8456/// Internal helper: final bwt scan right to left 8u block prepare.
8457#[doc(hidden)]
8458pub fn final_bwt_scan_right_to_left_8u_block_prepare(
8459    t: &[u8],
8460    sa: &mut [SaSint],
8461    k: SaSint,
8462    buckets: &mut [SaSint],
8463    cache: &mut [ThreadCache],
8464    omp_block_start: FastSint,
8465    omp_block_size: FastSint,
8466) -> FastSint {
8467    if omp_block_size <= 0 {
8468        return 0;
8469    }
8470    let k_usize = usize::try_from(k).expect("k must be non-negative");
8471    buckets[..k_usize].fill(0);
8472    let start = usize::try_from(omp_block_start).expect("omp_block_start must be non-negative");
8473    let size = usize::try_from(omp_block_size).expect("omp_block_size must be non-negative");
8474    let mut count = 0usize;
8475    let mut i = start + size;
8476    while i > start {
8477        i -= 1;
8478        let mut p = sa[i];
8479        sa[i] = p & SAINT_MAX;
8480        if p > 0 {
8481            p -= 1;
8482            let p_usize = usize::try_from(p).expect("suffix index must be non-negative");
8483            let c0 = t[p_usize - usize::from(p > 0)] as SaSint;
8484            let c1 = t[p_usize] as SaSint;
8485            sa[i] = c1;
8486            buckets[c1 as usize] += 1;
8487            cache[count].symbol = c1;
8488            cache[count].index = if c0 <= c1 { p } else { c0 | SAINT_MIN };
8489            count += 1;
8490        }
8491    }
8492    count as FastSint
8493}
8494
8495/// Internal helper: final bwt aux scan right to left 8u block prepare.
8496#[doc(hidden)]
8497pub fn final_bwt_aux_scan_right_to_left_8u_block_prepare(
8498    t: &[u8],
8499    sa: &mut [SaSint],
8500    k: SaSint,
8501    buckets: &mut [SaSint],
8502    cache: &mut [ThreadCache],
8503    omp_block_start: FastSint,
8504    omp_block_size: FastSint,
8505) -> FastSint {
8506    if omp_block_size <= 0 {
8507        return 0;
8508    }
8509    let k_usize = usize::try_from(k).expect("k must be non-negative");
8510    buckets[..k_usize].fill(0);
8511    let start = usize::try_from(omp_block_start).expect("omp_block_start must be non-negative");
8512    let size = usize::try_from(omp_block_size).expect("omp_block_size must be non-negative");
8513    let mut count = 0usize;
8514    let mut i = start + size;
8515    while i > start {
8516        i -= 1;
8517        let mut p = sa[i];
8518        sa[i] = p & SAINT_MAX;
8519        if p > 0 {
8520            p -= 1;
8521            let p_usize = usize::try_from(p).expect("suffix index must be non-negative");
8522            let c0 = t[p_usize - usize::from(p > 0)] as SaSint;
8523            let c1 = t[p_usize] as SaSint;
8524            sa[i] = c1;
8525            buckets[c1 as usize] += 1;
8526            cache[count].symbol = c1;
8527            cache[count].index = if c0 <= c1 { p } else { c0 | SAINT_MIN };
8528            cache[count + 1].index = p;
8529            count += 2;
8530        }
8531    }
8532    count as FastSint
8533}
8534
8535/// Internal helper: final sorting scan right to left 8u block prepare.
8536#[doc(hidden)]
8537pub fn final_sorting_scan_right_to_left_8u_block_prepare(
8538    t: &[u8],
8539    sa: &mut [SaSint],
8540    k: SaSint,
8541    buckets: &mut [SaSint],
8542    cache: &mut [ThreadCache],
8543    omp_block_start: FastSint,
8544    omp_block_size: FastSint,
8545) -> FastSint {
8546    if omp_block_size <= 0 {
8547        return 0;
8548    }
8549
8550    let k_usize = usize::try_from(k).expect("k must be non-negative");
8551    buckets[..k_usize].fill(0);
8552
8553    let start =
8554        usize::try_from(omp_block_start).expect("omp_block_start must be non-negative") as FastSint;
8555    let mut i = omp_block_start + omp_block_size - 1;
8556    let mut j = start + 1;
8557    let mut count = 0usize;
8558
8559    while i >= j {
8560        let i0 = usize::try_from(i).expect("loop index must be non-negative");
8561        let i1 = usize::try_from(i - 1).expect("loop index must be non-negative");
8562
8563        let mut p0 = sa[i0];
8564        sa[i0] = p0 & SAINT_MAX;
8565        if p0 > 0 {
8566            p0 -= 1;
8567            let p0_usize = usize::try_from(p0).expect("suffix index must be non-negative");
8568            let c0 = t[p0_usize] as SaSint;
8569            buckets[c0 as usize] += 1;
8570            cache[count].symbol = c0;
8571            cache[count].index = p0
8572                | ((usize::from(t[p0_usize - usize::from(p0 > 0)] > t[p0_usize]) as SaSint)
8573                    << (SAINT_BIT - 1));
8574            count += 1;
8575        }
8576
8577        let mut p1 = sa[i1];
8578        sa[i1] = p1 & SAINT_MAX;
8579        if p1 > 0 {
8580            p1 -= 1;
8581            let p1_usize = usize::try_from(p1).expect("suffix index must be non-negative");
8582            let c1 = t[p1_usize] as SaSint;
8583            buckets[c1 as usize] += 1;
8584            cache[count].symbol = c1;
8585            cache[count].index = p1
8586                | ((usize::from(t[p1_usize - usize::from(p1 > 0)] > t[p1_usize]) as SaSint)
8587                    << (SAINT_BIT - 1));
8588            count += 1;
8589        }
8590
8591        i -= 2;
8592    }
8593
8594    j -= 1;
8595    while i >= j {
8596        let idx = usize::try_from(i).expect("loop index must be non-negative");
8597        let mut p = sa[idx];
8598        sa[idx] = p & SAINT_MAX;
8599        if p > 0 {
8600            p -= 1;
8601            let p_usize = usize::try_from(p).expect("suffix index must be non-negative");
8602            let c = t[p_usize] as SaSint;
8603            buckets[c as usize] += 1;
8604            cache[count].symbol = c;
8605            cache[count].index = p
8606                | ((usize::from(t[p_usize - usize::from(p > 0)] > t[p_usize]) as SaSint)
8607                    << (SAINT_BIT - 1));
8608            count += 1;
8609        }
8610
8611        i -= 1;
8612    }
8613
8614    count as FastSint
8615}
8616
8617/// Internal helper: final order scan right to left 8u block place.
8618#[doc(hidden)]
8619pub fn final_order_scan_right_to_left_8u_block_place(
8620    sa: &mut [SaSint],
8621    buckets: &mut [SaSint],
8622    cache: &[ThreadCache],
8623    count: FastSint,
8624) {
8625    if count <= 0 {
8626        return;
8627    }
8628    let count_usize = usize::try_from(count).expect("count must be non-negative");
8629    for entry in &cache[..count_usize] {
8630        let symbol = usize::try_from(entry.symbol).expect("cache symbol must be non-negative");
8631        buckets[symbol] -= 1;
8632        let slot = usize::try_from(buckets[symbol]).expect("bucket slot must be non-negative");
8633        sa[slot] = entry.index;
8634    }
8635}
8636
8637/// Internal helper: final gsa scan right to left 8u block place.
8638#[doc(hidden)]
8639pub fn final_gsa_scan_right_to_left_8u_block_place(
8640    sa: &mut [SaSint],
8641    buckets: &mut [SaSint],
8642    cache: &[ThreadCache],
8643    count: FastSint,
8644) {
8645    if count <= 0 {
8646        return;
8647    }
8648    let count_usize = usize::try_from(count).expect("count must be non-negative");
8649    for entry in &cache[..count_usize] {
8650        if entry.symbol > 0 {
8651            let symbol = usize::try_from(entry.symbol).expect("cache symbol must be non-negative");
8652            buckets[symbol] -= 1;
8653            let slot = usize::try_from(buckets[symbol]).expect("bucket slot must be non-negative");
8654            sa[slot] = entry.index;
8655        }
8656    }
8657}
8658
8659/// Internal helper: final bwt aux scan right to left 8u block place.
8660#[doc(hidden)]
8661pub fn final_bwt_aux_scan_right_to_left_8u_block_place(
8662    sa: &mut [SaSint],
8663    rm: SaSint,
8664    i_out: &mut [SaSint],
8665    buckets: &mut [SaSint],
8666    cache: &[ThreadCache],
8667    count: FastSint,
8668) {
8669    if count <= 0 {
8670        return;
8671    }
8672    let count_usize = usize::try_from(count).expect("count must be non-negative");
8673    let mut i = 0usize;
8674    while i < count_usize {
8675        let symbol = usize::try_from(cache[i].symbol).expect("cache symbol must be non-negative");
8676        buckets[symbol] -= 1;
8677        let slot = usize::try_from(buckets[symbol]).expect("bucket slot must be non-negative");
8678        sa[slot] = cache[i].index;
8679        if (cache[i + 1].index & rm) == 0 {
8680            let sample_index = usize::try_from((cache[i + 1].index & SAINT_MAX) / (rm + 1))
8681                .expect("sample index must be non-negative");
8682            i_out[sample_index] = buckets[symbol] + 1;
8683        }
8684        i += 2;
8685    }
8686}
8687
8688/// Internal helper: final sorting scan right to left 32s block gather.
8689#[doc(hidden)]
8690pub fn final_sorting_scan_right_to_left_32s_block_gather(
8691    t: &[SaSint],
8692    sa: &mut [SaSint],
8693    cache: &mut [ThreadCache],
8694    omp_block_start: FastSint,
8695    omp_block_size: FastSint,
8696) {
8697    if omp_block_size <= 0 {
8698        return;
8699    }
8700    let start = usize::try_from(omp_block_start).expect("omp_block_start must be non-negative");
8701    let size = usize::try_from(omp_block_size).expect("omp_block_size must be non-negative");
8702    for offset in 0..size {
8703        let i = start + offset;
8704        let mut symbol = SAINT_MIN;
8705        let mut p = sa[i];
8706        sa[i] = p & SAINT_MAX;
8707        if p > 0 {
8708            p -= 1;
8709            let p_usize = p as usize;
8710            cache[offset].index = p
8711                | ((usize::from(t[p_usize - usize::from(p > 0)] > t[p_usize]) as SaSint)
8712                    << (SAINT_BIT - 1));
8713            symbol = t[p_usize];
8714        }
8715        cache[offset].symbol = symbol;
8716    }
8717}
8718
8719/// Internal helper: final sorting scan right to left 32s block sort.
8720#[doc(hidden)]
8721pub fn final_sorting_scan_right_to_left_32s_block_sort(
8722    t: &[SaSint],
8723    induction_bucket: &mut [SaSint],
8724    cache: &mut [ThreadCache],
8725    omp_block_start: FastSint,
8726    omp_block_size: FastSint,
8727) {
8728    if omp_block_size <= 0 {
8729        return;
8730    }
8731    let prefetch_distance = 64usize;
8732    let start = omp_block_start as usize;
8733    let mut i = start + omp_block_size as usize - 1;
8734    let mut j = start + prefetch_distance + 1;
8735
8736    while i >= j {
8737        let ci = i - start;
8738        let v0 = cache[ci].symbol;
8739        if v0 >= 0 {
8740            let bucket_index0 = v0 as usize;
8741            induction_bucket[bucket_index0] -= 1;
8742            cache[ci].symbol = induction_bucket[bucket_index0];
8743            if cache[ci].symbol >= omp_block_start as SaSint {
8744                let ni = cache[ci].symbol as usize;
8745                let cni = ni - start;
8746                let mut np = cache[ci].index;
8747                cache[ci].index = np & SAINT_MAX;
8748                if np > 0 {
8749                    np -= 1;
8750                    let np_usize = np as usize;
8751                    cache[cni].index = np
8752                        | ((usize::from(t[np_usize - usize::from(np > 0)] > t[np_usize])
8753                            as SaSint)
8754                            << (SAINT_BIT - 1));
8755                    cache[cni].symbol = t[np_usize];
8756                }
8757            }
8758        }
8759
8760        let i1 = i - 1;
8761        let ci1 = i1 - start;
8762        let v1 = cache[ci1].symbol;
8763        if v1 >= 0 {
8764            let bucket_index1 = v1 as usize;
8765            induction_bucket[bucket_index1] -= 1;
8766            cache[ci1].symbol = induction_bucket[bucket_index1];
8767            if cache[ci1].symbol >= omp_block_start as SaSint {
8768                let ni = cache[ci1].symbol as usize;
8769                let cni = ni - start;
8770                let mut np = cache[ci1].index;
8771                cache[ci1].index = np & SAINT_MAX;
8772                if np > 0 {
8773                    np -= 1;
8774                    let np_usize = np as usize;
8775                    cache[cni].index = np
8776                        | ((usize::from(t[np_usize - usize::from(np > 0)] > t[np_usize])
8777                            as SaSint)
8778                            << (SAINT_BIT - 1));
8779                    cache[cni].symbol = t[np_usize];
8780                }
8781            }
8782        }
8783
8784        i -= 2;
8785    }
8786
8787    j -= prefetch_distance + 1;
8788    while i >= j {
8789        let ci = i - start;
8790        let v = cache[ci].symbol;
8791        if v >= 0 {
8792            let bucket_index = v as usize;
8793            induction_bucket[bucket_index] -= 1;
8794            cache[ci].symbol = induction_bucket[bucket_index];
8795            if cache[ci].symbol >= omp_block_start as SaSint {
8796                let ni = cache[ci].symbol as usize;
8797                let cni = ni - start;
8798                let mut np = cache[ci].index;
8799                cache[ci].index = np & SAINT_MAX;
8800                if np > 0 {
8801                    np -= 1;
8802                    let np_usize = np as usize;
8803                    cache[cni].index = np
8804                        | ((usize::from(t[np_usize - usize::from(np > 0)] > t[np_usize])
8805                            as SaSint)
8806                            << (SAINT_BIT - 1));
8807                    cache[cni].symbol = t[np_usize];
8808                }
8809            }
8810        }
8811
8812        if i == 0 {
8813            break;
8814        }
8815        i -= 1;
8816    }
8817}
8818
8819/// Internal helper: final bwt scan right to left 8u block (OpenMP variant).
8820#[doc(hidden)]
8821pub fn final_bwt_scan_right_to_left_8u_block_omp(
8822    t: &[u8],
8823    sa: &mut [SaSint],
8824    k: SaSint,
8825    induction_bucket: &mut [SaSint],
8826    block_start: FastSint,
8827    block_size: FastSint,
8828    threads: SaSint,
8829    thread_state: &mut [ThreadState],
8830) {
8831    if block_size <= 0 {
8832        return;
8833    }
8834    let k_usize = usize::try_from(k).expect("k must be non-negative");
8835    let block_size_usize = usize::try_from(block_size).expect("block_size must be non-negative");
8836    let threads_usize = usize::try_from(threads.max(1)).expect("threads must be positive");
8837    let omp_num_threads = threads_usize.min(thread_state.len()).min(block_size_usize);
8838    if omp_num_threads <= 1 || block_size < 64 * k.max(256) as FastSint {
8839        let _ = final_bwt_scan_right_to_left_8u(t, sa, induction_bucket, block_start, block_size);
8840        return;
8841    }
8842
8843    let omp_block_stride = (block_size_usize / omp_num_threads) & !15usize;
8844    for (omp_thread_num, state) in thread_state.iter_mut().take(omp_num_threads).enumerate() {
8845        let omp_block_start = omp_thread_num * omp_block_stride;
8846        let omp_block_size = if omp_thread_num + 1 < omp_num_threads {
8847            omp_block_stride
8848        } else {
8849            block_size_usize - omp_block_start
8850        };
8851        state.count = final_bwt_scan_right_to_left_8u_block_prepare(
8852            t,
8853            sa,
8854            k,
8855            &mut state.buckets,
8856            &mut state.cache,
8857            block_start + omp_block_start as FastSint,
8858            omp_block_size as FastSint,
8859        );
8860    }
8861    for state in thread_state.iter_mut().take(omp_num_threads).rev() {
8862        for c in 0..k_usize {
8863            let a = induction_bucket[c];
8864            let b = state.buckets[c];
8865            induction_bucket[c] = a - b;
8866            state.buckets[c] = a;
8867        }
8868    }
8869    for state in thread_state.iter_mut().take(omp_num_threads) {
8870        final_order_scan_right_to_left_8u_block_place(
8871            sa,
8872            &mut state.buckets,
8873            &state.cache,
8874            state.count,
8875        );
8876    }
8877}
8878
8879/// Internal helper: final bwt aux scan right to left 8u block (OpenMP variant).
8880#[doc(hidden)]
8881pub fn final_bwt_aux_scan_right_to_left_8u_block_omp(
8882    t: &[u8],
8883    sa: &mut [SaSint],
8884    k: SaSint,
8885    rm: SaSint,
8886    i_out: &mut [SaSint],
8887    induction_bucket: &mut [SaSint],
8888    block_start: FastSint,
8889    block_size: FastSint,
8890    threads: SaSint,
8891    thread_state: &mut [ThreadState],
8892) {
8893    if block_size <= 0 {
8894        return;
8895    }
8896    let k_usize = usize::try_from(k).expect("k must be non-negative");
8897    let block_size_usize = usize::try_from(block_size).expect("block_size must be non-negative");
8898    let threads_usize = usize::try_from(threads.max(1)).expect("threads must be positive");
8899    let omp_num_threads = threads_usize.min(thread_state.len()).min(block_size_usize);
8900    if omp_num_threads <= 1 || block_size < 64 * k.max(256) as FastSint {
8901        final_bwt_aux_scan_right_to_left_8u(
8902            t,
8903            sa,
8904            rm,
8905            i_out,
8906            induction_bucket,
8907            block_start,
8908            block_size,
8909        );
8910        return;
8911    }
8912
8913    let omp_block_stride = (block_size_usize / omp_num_threads) & !15usize;
8914    for (omp_thread_num, state) in thread_state.iter_mut().take(omp_num_threads).enumerate() {
8915        let omp_block_start = omp_thread_num * omp_block_stride;
8916        let omp_block_size = if omp_thread_num + 1 < omp_num_threads {
8917            omp_block_stride
8918        } else {
8919            block_size_usize - omp_block_start
8920        };
8921        state.count = final_bwt_aux_scan_right_to_left_8u_block_prepare(
8922            t,
8923            sa,
8924            k,
8925            &mut state.buckets,
8926            &mut state.cache,
8927            block_start + omp_block_start as FastSint,
8928            omp_block_size as FastSint,
8929        );
8930    }
8931    for state in thread_state.iter_mut().take(omp_num_threads).rev() {
8932        for c in 0..k_usize {
8933            let a = induction_bucket[c];
8934            let b = state.buckets[c];
8935            induction_bucket[c] = a - b;
8936            state.buckets[c] = a;
8937        }
8938    }
8939    for state in thread_state.iter_mut().take(omp_num_threads) {
8940        final_bwt_aux_scan_right_to_left_8u_block_place(
8941            sa,
8942            rm,
8943            i_out,
8944            &mut state.buckets,
8945            &state.cache,
8946            state.count,
8947        );
8948    }
8949}
8950
8951/// Internal helper: final sorting scan right to left 8u block (OpenMP variant).
8952#[doc(hidden)]
8953pub fn final_sorting_scan_right_to_left_8u_block_omp(
8954    t: &[u8],
8955    sa: &mut [SaSint],
8956    k: SaSint,
8957    induction_bucket: &mut [SaSint],
8958    block_start: FastSint,
8959    block_size: FastSint,
8960    threads: SaSint,
8961    thread_state: &mut [ThreadState],
8962) {
8963    if block_size <= 0 {
8964        return;
8965    }
8966    let k_usize = usize::try_from(k).expect("k must be non-negative");
8967    let block_size_usize = usize::try_from(block_size).expect("block_size must be non-negative");
8968    let threads_usize = usize::try_from(threads.max(1)).expect("threads must be positive");
8969    let omp_num_threads = threads_usize.min(thread_state.len()).min(block_size_usize);
8970    if omp_num_threads <= 1 || block_size < 64 * k.max(256) as FastSint {
8971        final_sorting_scan_right_to_left_8u(t, sa, induction_bucket, block_start, block_size);
8972        return;
8973    }
8974
8975    let omp_block_stride = (block_size_usize / omp_num_threads) & !15usize;
8976    for (omp_thread_num, state) in thread_state.iter_mut().take(omp_num_threads).enumerate() {
8977        let omp_block_start = omp_thread_num * omp_block_stride;
8978        let omp_block_size = if omp_thread_num + 1 < omp_num_threads {
8979            omp_block_stride
8980        } else {
8981            block_size_usize - omp_block_start
8982        };
8983        state.count = final_sorting_scan_right_to_left_8u_block_prepare(
8984            t,
8985            sa,
8986            k,
8987            &mut state.buckets,
8988            &mut state.cache,
8989            block_start + omp_block_start as FastSint,
8990            omp_block_size as FastSint,
8991        );
8992    }
8993    for state in thread_state.iter_mut().take(omp_num_threads).rev() {
8994        for c in 0..k_usize {
8995            let a = induction_bucket[c];
8996            let b = state.buckets[c];
8997            induction_bucket[c] = a - b;
8998            state.buckets[c] = a;
8999        }
9000    }
9001    for state in thread_state.iter_mut().take(omp_num_threads) {
9002        final_order_scan_right_to_left_8u_block_place(
9003            sa,
9004            &mut state.buckets,
9005            &state.cache,
9006            state.count,
9007        );
9008    }
9009}
9010
9011/// Internal helper: final gsa scan right to left 8u block (OpenMP variant).
9012#[doc(hidden)]
9013pub fn final_gsa_scan_right_to_left_8u_block_omp(
9014    t: &[u8],
9015    sa: &mut [SaSint],
9016    k: SaSint,
9017    induction_bucket: &mut [SaSint],
9018    block_start: FastSint,
9019    block_size: FastSint,
9020    threads: SaSint,
9021    thread_state: &mut [ThreadState],
9022) {
9023    if block_size <= 0 {
9024        return;
9025    }
9026    let k_usize = usize::try_from(k).expect("k must be non-negative");
9027    let block_size_usize = usize::try_from(block_size).expect("block_size must be non-negative");
9028    let threads_usize = usize::try_from(threads.max(1)).expect("threads must be positive");
9029    let omp_num_threads = threads_usize.min(thread_state.len()).min(block_size_usize);
9030    if omp_num_threads <= 1 || block_size < 64 * k.max(256) as FastSint {
9031        final_gsa_scan_right_to_left_8u(t, sa, induction_bucket, block_start, block_size);
9032        return;
9033    }
9034
9035    let omp_block_stride = (block_size_usize / omp_num_threads) & !15usize;
9036    for (omp_thread_num, state) in thread_state.iter_mut().take(omp_num_threads).enumerate() {
9037        let omp_block_start = omp_thread_num * omp_block_stride;
9038        let omp_block_size = if omp_thread_num + 1 < omp_num_threads {
9039            omp_block_stride
9040        } else {
9041            block_size_usize - omp_block_start
9042        };
9043        state.count = final_sorting_scan_right_to_left_8u_block_prepare(
9044            t,
9045            sa,
9046            k,
9047            &mut state.buckets,
9048            &mut state.cache,
9049            block_start + omp_block_start as FastSint,
9050            omp_block_size as FastSint,
9051        );
9052    }
9053    for state in thread_state.iter_mut().take(omp_num_threads).rev() {
9054        for c in 0..k_usize {
9055            let a = induction_bucket[c];
9056            let b = state.buckets[c];
9057            induction_bucket[c] = a - b;
9058            state.buckets[c] = a;
9059        }
9060    }
9061    for state in thread_state.iter_mut().take(omp_num_threads) {
9062        final_gsa_scan_right_to_left_8u_block_place(
9063            sa,
9064            &mut state.buckets,
9065            &state.cache,
9066            state.count,
9067        );
9068    }
9069}
9070
9071/// Internal helper: final sorting scan right to left 32s block (OpenMP variant).
9072#[doc(hidden)]
9073pub fn final_sorting_scan_right_to_left_32s_block_omp(
9074    t: &[SaSint],
9075    sa: &mut [SaSint],
9076    buckets: &mut [SaSint],
9077    cache: &mut [ThreadCache],
9078    block_start: FastSint,
9079    block_size: FastSint,
9080    threads: SaSint,
9081) {
9082    if threads <= 1 || block_size < 16_384 {
9083        final_sorting_scan_right_to_left_32s(t, sa, buckets, block_start, block_size);
9084        return;
9085    }
9086
9087    final_sorting_scan_right_to_left_32s_block_gather(t, sa, cache, block_start, block_size);
9088    final_sorting_scan_right_to_left_32s_block_sort(t, buckets, cache, block_start, block_size);
9089    let block_size_usize = usize::try_from(block_size).expect("block_size must be non-negative");
9090    let threads_usize = usize::try_from(threads.max(1)).expect("threads must be positive");
9091    let omp_num_threads = threads_usize.min(block_size_usize);
9092    let omp_block_stride = (block_size_usize / omp_num_threads) & !15usize;
9093    for omp_thread_num in 0..omp_num_threads {
9094        let omp_block_start = omp_thread_num * omp_block_stride;
9095        let omp_block_size = if omp_thread_num + 1 < omp_num_threads {
9096            omp_block_stride
9097        } else {
9098            block_size_usize - omp_block_start
9099        };
9100        compact_and_place_cached_suffixes(
9101            sa,
9102            cache,
9103            omp_block_start as FastSint,
9104            omp_block_size as FastSint,
9105        );
9106    }
9107}
9108
9109/// Internal helper: final bwt scan right to left 8u (OpenMP variant).
9110#[doc(hidden)]
9111pub fn final_bwt_scan_right_to_left_8u_omp(
9112    t: &[u8],
9113    sa: &mut [SaSint],
9114    n: SaSint,
9115    k: SaSint,
9116    induction_bucket: &mut [SaSint],
9117    threads: SaSint,
9118    thread_state: &mut [ThreadState],
9119) -> SaSint {
9120    if threads == 1 || n < 65_536 {
9121        return final_bwt_scan_right_to_left_8u(t, sa, induction_bucket, 0, n as FastSint);
9122    }
9123    let mut index = -1;
9124    let mut block_start = usize::try_from(n).expect("n must be non-negative");
9125    while block_start > 0 {
9126        block_start -= 1;
9127        if sa[block_start] == 0 {
9128            index = block_start as SaSint;
9129        } else {
9130            let threads_usize = usize::try_from(threads)
9131                .expect("threads must be non-negative")
9132                .min(thread_state.len())
9133                .max(1);
9134            let max_back =
9135                threads_usize * LIBSAIS_PER_THREAD_CACHE_SIZE.saturating_sub(16 * threads_usize);
9136            let block_max_end = block_start.saturating_sub(max_back);
9137            let mut block_end = block_start;
9138            while block_end > block_max_end && sa[block_end - 1] != 0 {
9139                block_end -= 1;
9140            }
9141            let size = block_start - block_end + 1;
9142            if size < 32 {
9143                let res = final_bwt_scan_right_to_left_8u(
9144                    t,
9145                    sa,
9146                    induction_bucket,
9147                    block_end as FastSint,
9148                    size as FastSint,
9149                );
9150                if res >= 0 {
9151                    index = res;
9152                }
9153            } else {
9154                final_bwt_scan_right_to_left_8u_block_omp(
9155                    t,
9156                    sa,
9157                    k,
9158                    induction_bucket,
9159                    block_end as FastSint,
9160                    size as FastSint,
9161                    threads,
9162                    thread_state,
9163                );
9164            }
9165            block_start = block_end;
9166        }
9167    }
9168    index
9169}
9170
9171/// Internal helper: final bwt aux scan right to left 8u (OpenMP variant).
9172#[doc(hidden)]
9173pub fn final_bwt_aux_scan_right_to_left_8u_omp(
9174    t: &[u8],
9175    sa: &mut [SaSint],
9176    n: SaSint,
9177    k: SaSint,
9178    rm: SaSint,
9179    i_out: &mut [SaSint],
9180    induction_bucket: &mut [SaSint],
9181    threads: SaSint,
9182    thread_state: &mut [ThreadState],
9183) {
9184    if threads == 1 || n < 65_536 {
9185        final_bwt_aux_scan_right_to_left_8u(t, sa, rm, i_out, induction_bucket, 0, n as FastSint);
9186        return;
9187    }
9188    let mut block_start = usize::try_from(n).expect("n must be non-negative");
9189    while block_start > 0 {
9190        block_start -= 1;
9191        if sa[block_start] != 0 {
9192            let threads_usize = usize::try_from(threads)
9193                .expect("threads must be non-negative")
9194                .min(thread_state.len())
9195                .max(1);
9196            let max_back = threads_usize
9197                * (LIBSAIS_PER_THREAD_CACHE_SIZE.saturating_sub(16 * threads_usize) / 2);
9198            let block_max_end = block_start.saturating_sub(max_back);
9199            let mut block_end = block_start;
9200            while block_end > block_max_end && sa[block_end - 1] != 0 {
9201                block_end -= 1;
9202            }
9203            let size = block_start - block_end + 1;
9204            if size < 32 {
9205                final_bwt_aux_scan_right_to_left_8u(
9206                    t,
9207                    sa,
9208                    rm,
9209                    i_out,
9210                    induction_bucket,
9211                    block_end as FastSint,
9212                    size as FastSint,
9213                );
9214            } else {
9215                final_bwt_aux_scan_right_to_left_8u_block_omp(
9216                    t,
9217                    sa,
9218                    k,
9219                    rm,
9220                    i_out,
9221                    induction_bucket,
9222                    block_end as FastSint,
9223                    size as FastSint,
9224                    threads,
9225                    thread_state,
9226                );
9227            }
9228            block_start = block_end;
9229        }
9230    }
9231}
9232
9233/// Internal helper: final sorting scan right to left 8u (OpenMP variant).
9234#[doc(hidden)]
9235pub fn final_sorting_scan_right_to_left_8u_omp(
9236    t: &[u8],
9237    sa: &mut [SaSint],
9238    omp_block_start: FastSint,
9239    omp_block_size: FastSint,
9240    k: SaSint,
9241    induction_bucket: &mut [SaSint],
9242    threads: SaSint,
9243    thread_state: &mut [ThreadState],
9244) {
9245    if threads == 1 || omp_block_size < 65_536 {
9246        final_sorting_scan_right_to_left_8u(
9247            t,
9248            sa,
9249            induction_bucket,
9250            omp_block_start,
9251            omp_block_size,
9252        );
9253        return;
9254    }
9255    let start = usize::try_from(omp_block_start).expect("omp_block_start must be non-negative");
9256    let size = usize::try_from(omp_block_size).expect("omp_block_size must be non-negative");
9257    let mut block_start = start + size;
9258    while block_start > start {
9259        block_start -= 1;
9260        if sa[block_start] != 0 {
9261            let threads_usize = usize::try_from(threads)
9262                .expect("threads must be non-negative")
9263                .min(thread_state.len())
9264                .max(1);
9265            let max_back =
9266                threads_usize * LIBSAIS_PER_THREAD_CACHE_SIZE.saturating_sub(16 * threads_usize);
9267            let block_max_end = block_start.saturating_sub(max_back).max(start);
9268            let mut block_end = block_start;
9269            while block_end > block_max_end && sa[block_end - 1] != 0 {
9270                block_end -= 1;
9271            }
9272            let span = block_start - block_end + 1;
9273            if span < 32 {
9274                final_sorting_scan_right_to_left_8u(
9275                    t,
9276                    sa,
9277                    induction_bucket,
9278                    block_end as FastSint,
9279                    span as FastSint,
9280                );
9281            } else {
9282                final_sorting_scan_right_to_left_8u_block_omp(
9283                    t,
9284                    sa,
9285                    k,
9286                    induction_bucket,
9287                    block_end as FastSint,
9288                    span as FastSint,
9289                    threads,
9290                    thread_state,
9291                );
9292            }
9293            block_start = block_end;
9294        }
9295    }
9296}
9297
9298/// Internal helper: final gsa scan right to left 8u (OpenMP variant).
9299#[doc(hidden)]
9300pub fn final_gsa_scan_right_to_left_8u_omp(
9301    t: &[u8],
9302    sa: &mut [SaSint],
9303    omp_block_start: FastSint,
9304    omp_block_size: FastSint,
9305    k: SaSint,
9306    induction_bucket: &mut [SaSint],
9307    threads: SaSint,
9308    thread_state: &mut [ThreadState],
9309) {
9310    if threads == 1 || omp_block_size < 65_536 {
9311        final_gsa_scan_right_to_left_8u(t, sa, induction_bucket, omp_block_start, omp_block_size);
9312        return;
9313    }
9314    let start = usize::try_from(omp_block_start).expect("omp_block_start must be non-negative");
9315    let size = usize::try_from(omp_block_size).expect("omp_block_size must be non-negative");
9316    let mut block_start = start + size;
9317    while block_start > start {
9318        block_start -= 1;
9319        if sa[block_start] != 0 {
9320            let threads_usize = usize::try_from(threads)
9321                .expect("threads must be non-negative")
9322                .min(thread_state.len())
9323                .max(1);
9324            let max_back =
9325                threads_usize * LIBSAIS_PER_THREAD_CACHE_SIZE.saturating_sub(16 * threads_usize);
9326            let block_max_end = block_start.saturating_sub(max_back).max(start);
9327            let mut block_end = block_start;
9328            while block_end > block_max_end && sa[block_end - 1] != 0 {
9329                block_end -= 1;
9330            }
9331            let span = block_start - block_end + 1;
9332            if span < 32 {
9333                final_gsa_scan_right_to_left_8u(
9334                    t,
9335                    sa,
9336                    induction_bucket,
9337                    block_end as FastSint,
9338                    span as FastSint,
9339                );
9340            } else {
9341                final_gsa_scan_right_to_left_8u_block_omp(
9342                    t,
9343                    sa,
9344                    k,
9345                    induction_bucket,
9346                    block_end as FastSint,
9347                    span as FastSint,
9348                    threads,
9349                    thread_state,
9350                );
9351            }
9352            block_start = block_end;
9353        }
9354    }
9355}
9356
9357/// Internal helper: final sorting scan right to left 32s (OpenMP variant).
9358#[doc(hidden)]
9359pub fn final_sorting_scan_right_to_left_32s_omp(
9360    t: &[SaSint],
9361    sa: &mut [SaSint],
9362    n: SaSint,
9363    induction_bucket: &mut [SaSint],
9364    threads: SaSint,
9365    thread_state: &mut [ThreadState],
9366) {
9367    if threads == 1 || n < 65_536 {
9368        final_sorting_scan_right_to_left_32s(t, sa, induction_bucket, 0, n as FastSint);
9369        return;
9370    }
9371    if thread_state.is_empty() {
9372        final_sorting_scan_right_to_left_32s(t, sa, induction_bucket, 0, n as FastSint);
9373        return;
9374    }
9375    let threads_usize = usize::try_from(threads)
9376        .expect("threads must be non-negative")
9377        .max(1);
9378    let mut cache = vec![ThreadCache::default(); threads_usize * LIBSAIS_PER_THREAD_CACHE_SIZE];
9379    let mut block_start = isize::try_from(n).expect("n must fit isize") - 1;
9380    while block_start >= 0 {
9381        let block_end = (block_start
9382            - isize::try_from(threads_usize * LIBSAIS_PER_THREAD_CACHE_SIZE)
9383                .expect("block span must fit isize"))
9384        .max(-1);
9385        final_sorting_scan_right_to_left_32s_block_omp(
9386            t,
9387            sa,
9388            induction_bucket,
9389            &mut cache,
9390            (block_end + 1) as FastSint,
9391            (block_start - block_end) as FastSint,
9392            threads,
9393        );
9394        block_start = block_end;
9395    }
9396}
9397
9398/// Internal helper: clear lms suffixes (OpenMP variant).
9399#[doc(hidden)]
9400pub fn clear_lms_suffixes_omp(
9401    sa: &mut [SaSint],
9402    n: SaSint,
9403    k: SaSint,
9404    bucket_start: &[SaSint],
9405    bucket_end: &[SaSint],
9406    threads: SaSint,
9407) {
9408    let k_usize = usize::try_from(k).expect("k must be non-negative");
9409    let thread_count = if threads > 1 && n >= 65536 {
9410        usize::try_from(threads).expect("threads must be positive")
9411    } else {
9412        1
9413    };
9414    for t in 0..thread_count {
9415        let mut c = t;
9416        while c < k_usize {
9417            if bucket_end[c] > bucket_start[c] {
9418                let start =
9419                    usize::try_from(bucket_start[c]).expect("bucket start must be non-negative");
9420                let end = usize::try_from(bucket_end[c]).expect("bucket end must be non-negative");
9421                sa[start..end].fill(0);
9422            }
9423            c += thread_count;
9424        }
9425    }
9426}
9427
9428/// Internal helper: induce final order 8u (OpenMP variant).
9429#[doc(hidden)]
9430pub fn induce_final_order_8u_omp(
9431    t: &[u8],
9432    sa: &mut [SaSint],
9433    n: SaSint,
9434    k: SaSint,
9435    flags: SaSint,
9436    r: SaSint,
9437    i_out: Option<&mut [SaSint]>,
9438    buckets: &mut [SaSint],
9439    threads: SaSint,
9440    thread_state: &mut [ThreadState],
9441) -> SaSint {
9442    if (flags & LIBSAIS_FLAGS_BWT) == 0 {
9443        if (flags & LIBSAIS_FLAGS_GSA) != 0 {
9444            buckets[6 * ALPHABET_SIZE] = buckets[7 * ALPHABET_SIZE] - 1;
9445        }
9446
9447        let (left_buckets, right_tail) = buckets.split_at_mut(7 * ALPHABET_SIZE);
9448        let bucket_start = &mut left_buckets[6 * ALPHABET_SIZE..7 * ALPHABET_SIZE];
9449        let bucket_end = &mut right_tail[..ALPHABET_SIZE];
9450
9451        final_sorting_scan_left_to_right_8u_omp(
9452            t,
9453            sa,
9454            n as FastSint,
9455            k,
9456            bucket_start,
9457            threads,
9458            thread_state,
9459        );
9460        if threads > 1 && n >= 65_536 {
9461            clear_lms_suffixes_omp(
9462                sa,
9463                n,
9464                ALPHABET_SIZE as SaSint,
9465                bucket_start,
9466                bucket_end,
9467                threads,
9468            );
9469        }
9470
9471        if (flags & LIBSAIS_FLAGS_GSA) != 0 {
9472            flip_suffix_markers_omp(sa, bucket_end[0], threads);
9473            final_gsa_scan_right_to_left_8u_omp(
9474                t,
9475                sa,
9476                bucket_end[0] as FastSint,
9477                n as FastSint - bucket_end[0] as FastSint,
9478                k,
9479                bucket_end,
9480                threads,
9481                thread_state,
9482            );
9483        } else {
9484            final_sorting_scan_right_to_left_8u_omp(
9485                t,
9486                sa,
9487                0,
9488                n as FastSint,
9489                k,
9490                bucket_end,
9491                threads,
9492                thread_state,
9493            );
9494        }
9495
9496        0
9497    } else if let Some(i_out) = i_out {
9498        let (left_buckets, right_tail) = buckets.split_at_mut(7 * ALPHABET_SIZE);
9499        let bucket_start = &mut left_buckets[6 * ALPHABET_SIZE..7 * ALPHABET_SIZE];
9500        let bucket_end = &mut right_tail[..ALPHABET_SIZE];
9501
9502        final_bwt_aux_scan_left_to_right_8u_omp(
9503            t,
9504            sa,
9505            n as FastSint,
9506            k,
9507            r - 1,
9508            i_out,
9509            bucket_start,
9510            threads,
9511            thread_state,
9512        );
9513        if threads > 1 && n >= 65_536 {
9514            clear_lms_suffixes_omp(
9515                sa,
9516                n,
9517                ALPHABET_SIZE as SaSint,
9518                bucket_start,
9519                bucket_end,
9520                threads,
9521            );
9522        }
9523        final_bwt_aux_scan_right_to_left_8u_omp(
9524            t,
9525            sa,
9526            n,
9527            k,
9528            r - 1,
9529            i_out,
9530            bucket_end,
9531            threads,
9532            thread_state,
9533        );
9534        0
9535    } else {
9536        let (left_buckets, right_tail) = buckets.split_at_mut(7 * ALPHABET_SIZE);
9537        let bucket_start = &mut left_buckets[6 * ALPHABET_SIZE..7 * ALPHABET_SIZE];
9538        let bucket_end = &mut right_tail[..ALPHABET_SIZE];
9539
9540        final_bwt_scan_left_to_right_8u_omp(
9541            t,
9542            sa,
9543            n as FastSint,
9544            k,
9545            bucket_start,
9546            threads,
9547            thread_state,
9548        );
9549        if threads > 1 && n >= 65_536 {
9550            clear_lms_suffixes_omp(
9551                sa,
9552                n,
9553                ALPHABET_SIZE as SaSint,
9554                bucket_start,
9555                bucket_end,
9556                threads,
9557            );
9558        }
9559        final_bwt_scan_right_to_left_8u_omp(t, sa, n, k, bucket_end, threads, thread_state)
9560    }
9561}
9562
9563/// Internal helper: induce final order 32s 6k.
9564#[doc(hidden)]
9565pub fn induce_final_order_32s_6k(
9566    t: &[SaSint],
9567    sa: &mut [SaSint],
9568    n: SaSint,
9569    k: SaSint,
9570    buckets: &mut [SaSint],
9571    threads: SaSint,
9572    thread_state: &mut [ThreadState],
9573) {
9574    let k_usize = usize::try_from(k).expect("k must be non-negative");
9575    let (_head, tail) = buckets.split_at_mut(4 * k_usize);
9576    let (left, right) = tail.split_at_mut(k_usize);
9577    final_sorting_scan_left_to_right_32s_omp(t, sa, n, left, threads, thread_state);
9578    final_sorting_scan_right_to_left_32s_omp(t, sa, n, right, threads, thread_state);
9579}
9580
9581/// Internal helper: induce final order 32s 4k.
9582#[doc(hidden)]
9583pub fn induce_final_order_32s_4k(
9584    t: &[SaSint],
9585    sa: &mut [SaSint],
9586    n: SaSint,
9587    k: SaSint,
9588    buckets: &mut [SaSint],
9589    threads: SaSint,
9590    thread_state: &mut [ThreadState],
9591) {
9592    let k_usize = usize::try_from(k).expect("k must be non-negative");
9593    let (_head, tail) = buckets.split_at_mut(2 * k_usize);
9594    let (left, right) = tail.split_at_mut(k_usize);
9595    final_sorting_scan_left_to_right_32s_omp(t, sa, n, left, threads, thread_state);
9596    final_sorting_scan_right_to_left_32s_omp(t, sa, n, right, threads, thread_state);
9597}
9598
9599/// Internal helper: induce final order 32s 2k.
9600#[doc(hidden)]
9601pub fn induce_final_order_32s_2k(
9602    t: &[SaSint],
9603    sa: &mut [SaSint],
9604    n: SaSint,
9605    k: SaSint,
9606    buckets: &mut [SaSint],
9607    threads: SaSint,
9608    thread_state: &mut [ThreadState],
9609) {
9610    let k_usize = usize::try_from(k).expect("k must be non-negative");
9611    let (right, left) = buckets.split_at_mut(k_usize);
9612    final_sorting_scan_left_to_right_32s_omp(t, sa, n, left, threads, thread_state);
9613    final_sorting_scan_right_to_left_32s_omp(t, sa, n, right, threads, thread_state);
9614}
9615
9616/// Internal helper: induce final order 32s 1k.
9617#[doc(hidden)]
9618pub fn induce_final_order_32s_1k(
9619    t: &[SaSint],
9620    sa: &mut [SaSint],
9621    n: SaSint,
9622    k: SaSint,
9623    buckets: &mut [SaSint],
9624    threads: SaSint,
9625    thread_state: &mut [ThreadState],
9626) {
9627    count_suffixes_32s(t, n, k, buckets);
9628    initialize_buckets_start_32s_1k(k, buckets);
9629    final_sorting_scan_left_to_right_32s_omp(t, sa, n, buckets, threads, thread_state);
9630
9631    count_suffixes_32s(t, n, k, buckets);
9632    initialize_buckets_end_32s_1k(k, buckets);
9633    final_sorting_scan_right_to_left_32s_omp(t, sa, n, buckets, threads, thread_state);
9634}
9635
9636/// Internal helper: renumber unique and nonunique lms suffixes 32s.
9637#[doc(hidden)]
9638pub fn renumber_unique_and_nonunique_lms_suffixes_32s(
9639    t: &mut [SaSint],
9640    sa: &mut [SaSint],
9641    m: SaSint,
9642    mut f: SaSint,
9643    omp_block_start: FastSint,
9644    omp_block_size: FastSint,
9645) -> SaSint {
9646    if omp_block_size <= 0 {
9647        return f;
9648    }
9649
9650    let prefetch_distance = 64 as SaSint;
9651    let m_usize = usize::try_from(m).expect("m must be non-negative");
9652    let (sa_head, sam) = sa.split_at_mut(m_usize);
9653    let mut i = omp_block_start as SaSint;
9654    let mut j = omp_block_start as SaSint + omp_block_size as SaSint - 2 * prefetch_distance - 3;
9655
9656    while i < j {
9657        let p0 = sa_head[i as usize] as SaUint;
9658        let p0_half = (p0 >> 1) as usize;
9659        let mut s0 = sam[p0_half];
9660        if s0 < 0 {
9661            t[p0 as usize] |= SAINT_MIN;
9662            f += 1;
9663            s0 = i + SAINT_MIN + f;
9664        }
9665        sam[p0_half] = s0 - f;
9666
9667        let p1 = sa_head[(i + 1) as usize] as SaUint;
9668        let p1_half = (p1 >> 1) as usize;
9669        let mut s1 = sam[p1_half];
9670        if s1 < 0 {
9671            t[p1 as usize] |= SAINT_MIN;
9672            f += 1;
9673            s1 = i + 1 + SAINT_MIN + f;
9674        }
9675        sam[p1_half] = s1 - f;
9676
9677        let p2 = sa_head[(i + 2) as usize] as SaUint;
9678        let p2_half = (p2 >> 1) as usize;
9679        let mut s2 = sam[p2_half];
9680        if s2 < 0 {
9681            t[p2 as usize] |= SAINT_MIN;
9682            f += 1;
9683            s2 = i + 2 + SAINT_MIN + f;
9684        }
9685        sam[p2_half] = s2 - f;
9686
9687        let p3 = sa_head[(i + 3) as usize] as SaUint;
9688        let p3_half = (p3 >> 1) as usize;
9689        let mut s3 = sam[p3_half];
9690        if s3 < 0 {
9691            t[p3 as usize] |= SAINT_MIN;
9692            f += 1;
9693            s3 = i + 3 + SAINT_MIN + f;
9694        }
9695        sam[p3_half] = s3 - f;
9696
9697        i += 4;
9698    }
9699
9700    j += 2 * prefetch_distance + 3;
9701    while i < j {
9702        let p = sa_head[i as usize] as SaUint;
9703        let p_half = (p >> 1) as usize;
9704        let mut s = sam[p_half];
9705        if s < 0 {
9706            t[p as usize] |= SAINT_MIN;
9707            f += 1;
9708            s = i + SAINT_MIN + f;
9709        }
9710        sam[p_half] = s - f;
9711        i += 1;
9712    }
9713
9714    f
9715}
9716
9717/// Internal helper: compact unique and nonunique lms suffixes 32s.
9718#[doc(hidden)]
9719pub fn compact_unique_and_nonunique_lms_suffixes_32s(
9720    sa: &mut [SaSint],
9721    m: SaSint,
9722    pl: &mut FastSint,
9723    pr: &mut FastSint,
9724    omp_block_start: FastSint,
9725    omp_block_size: FastSint,
9726) {
9727    if omp_block_size <= 0 {
9728        return;
9729    }
9730
9731    let m_usize = usize::try_from(m).expect("m must be non-negative");
9732    let start = usize::try_from(omp_block_start).expect("omp_block_start must be non-negative");
9733    let size = usize::try_from(omp_block_size).expect("omp_block_size must be non-negative");
9734
9735    let source: Vec<SaSint> = sa[m_usize + start..m_usize + start + size].to_vec();
9736    let mut l = usize::try_from(*pl - 1).expect("left position must be positive");
9737    let mut r = usize::try_from(*pr - 1).expect("right position must be positive");
9738
9739    for &p in source.iter().rev() {
9740        let pu = p as SaUint;
9741        sa[l] = (pu & SAINT_MAX as SaUint) as SaSint;
9742        l = l.saturating_sub(usize::from((pu as SaSint) < 0));
9743
9744        sa[r] = pu.wrapping_sub(1) as SaSint;
9745        r = r.saturating_sub(usize::from((pu as SaSint) > 0));
9746    }
9747
9748    *pl = l as FastSint + 1;
9749    *pr = r as FastSint + 1;
9750}
9751
9752/// Internal helper: count unique suffixes.
9753#[doc(hidden)]
9754pub fn count_unique_suffixes(
9755    sa: &[SaSint],
9756    m: SaSint,
9757    omp_block_start: FastSint,
9758    omp_block_size: FastSint,
9759) -> SaSint {
9760    if omp_block_size <= 0 {
9761        return 0;
9762    }
9763
9764    let m_usize = usize::try_from(m).expect("m must be non-negative");
9765    let sam = &sa[m_usize..];
9766    let mut i = usize::try_from(omp_block_start).expect("omp_block_start must be non-negative");
9767    let block_end =
9768        i + usize::try_from(omp_block_size).expect("omp_block_size must be non-negative");
9769    let j = block_end.saturating_sub(67);
9770    let mut f0 = 0;
9771    let mut f1 = 0;
9772    let mut f2 = 0;
9773    let mut f3 = 0;
9774
9775    while i < j {
9776        f0 += SaSint::from(
9777            sam[usize::try_from((sa[i] as SaUint) >> 1).expect("name slot must fit usize")] < 0,
9778        );
9779        f1 += SaSint::from(
9780            sam[usize::try_from((sa[i + 1] as SaUint) >> 1).expect("name slot must fit usize")] < 0,
9781        );
9782        f2 += SaSint::from(
9783            sam[usize::try_from((sa[i + 2] as SaUint) >> 1).expect("name slot must fit usize")] < 0,
9784        );
9785        f3 += SaSint::from(
9786            sam[usize::try_from((sa[i + 3] as SaUint) >> 1).expect("name slot must fit usize")] < 0,
9787        );
9788        i += 4;
9789    }
9790
9791    while i < block_end {
9792        f0 += SaSint::from(
9793            sam[usize::try_from((sa[i] as SaUint) >> 1).expect("name slot must fit usize")] < 0,
9794        );
9795        i += 1;
9796    }
9797
9798    f0 + f1 + f2 + f3
9799}
9800
9801/// Internal helper: renumber unique and nonunique lms suffixes 32s (OpenMP variant).
9802#[doc(hidden)]
9803pub fn renumber_unique_and_nonunique_lms_suffixes_32s_omp(
9804    t: &mut [SaSint],
9805    sa: &mut [SaSint],
9806    m: SaSint,
9807    threads: SaSint,
9808    thread_state: &mut [ThreadState],
9809) -> SaSint {
9810    let mut f = 0;
9811    if threads == 1 || m < 65_536 {
9812        f = renumber_unique_and_nonunique_lms_suffixes_32s(t, sa, m, 0, 0, m as FastSint);
9813    } else {
9814        let threads_usize = usize::try_from(threads)
9815            .expect("threads must be non-negative")
9816            .max(1);
9817        let m_usize = usize::try_from(m).expect("m must be non-negative");
9818        let omp_num_threads = threads_usize.min(m_usize.max(1));
9819        let omp_block_stride = (m_usize / omp_num_threads) & !15usize;
9820
9821        for omp_thread_num in 0..omp_num_threads {
9822            let omp_block_start = omp_thread_num * omp_block_stride;
9823            let omp_block_size = if omp_thread_num + 1 < omp_num_threads {
9824                omp_block_stride
9825            } else {
9826                m_usize - omp_block_start
9827            };
9828
9829            thread_state[omp_thread_num].count = count_unique_suffixes(
9830                sa,
9831                m,
9832                omp_block_start as FastSint,
9833                omp_block_size as FastSint,
9834            ) as FastSint;
9835        }
9836
9837        let mut count = 0 as FastSint;
9838        for omp_thread_num in 0..omp_num_threads {
9839            let omp_block_start = omp_thread_num * omp_block_stride;
9840            let omp_block_size = if omp_thread_num + 1 < omp_num_threads {
9841                omp_block_stride
9842            } else {
9843                m_usize - omp_block_start
9844            };
9845
9846            if omp_thread_num + 1 == omp_num_threads {
9847                f = (count + thread_state[omp_thread_num].count) as SaSint;
9848            }
9849
9850            renumber_unique_and_nonunique_lms_suffixes_32s(
9851                t,
9852                sa,
9853                m,
9854                count as SaSint,
9855                omp_block_start as FastSint,
9856                omp_block_size as FastSint,
9857            );
9858            count += thread_state[omp_thread_num].count;
9859        }
9860    }
9861
9862    f
9863}
9864
9865/// Internal helper: compact unique and nonunique lms suffixes 32s (OpenMP variant).
9866#[doc(hidden)]
9867pub fn compact_unique_and_nonunique_lms_suffixes_32s_omp(
9868    sa: &mut [SaSint],
9869    n: SaSint,
9870    m: SaSint,
9871    fs: SaSint,
9872    f: SaSint,
9873    threads: SaSint,
9874    thread_state: &mut [ThreadState],
9875) {
9876    let half_n = (n as FastSint) >> 1;
9877    if threads == 1 || n < 131_072 || m >= fs {
9878        let mut l = m as FastSint;
9879        let mut r = n as FastSint + fs as FastSint;
9880        compact_unique_and_nonunique_lms_suffixes_32s(sa, m, &mut l, &mut r, 0, half_n);
9881    } else {
9882        let threads_usize = usize::try_from(threads)
9883            .expect("threads must be non-negative")
9884            .max(1);
9885        let half_n_usize = usize::try_from(half_n).expect("half_n must be non-negative");
9886        let omp_num_threads = threads_usize.min(half_n_usize.max(1));
9887        let omp_block_stride = (half_n_usize / omp_num_threads) & !15usize;
9888
9889        for omp_thread_num in 0..omp_num_threads {
9890            let omp_block_start = omp_thread_num * omp_block_stride;
9891            let omp_block_size = if omp_thread_num + 1 < omp_num_threads {
9892                omp_block_stride
9893            } else {
9894                half_n_usize - omp_block_start
9895            };
9896
9897            thread_state[omp_thread_num].position =
9898                m as FastSint + half_n + omp_block_start as FastSint + omp_block_size as FastSint;
9899            thread_state[omp_thread_num].count =
9900                m as FastSint + omp_block_start as FastSint + omp_block_size as FastSint;
9901
9902            let mut position = thread_state[omp_thread_num].position;
9903            let mut count = thread_state[omp_thread_num].count;
9904            compact_unique_and_nonunique_lms_suffixes_32s(
9905                sa,
9906                m,
9907                &mut position,
9908                &mut count,
9909                omp_block_start as FastSint,
9910                omp_block_size as FastSint,
9911            );
9912            thread_state[omp_thread_num].position = position;
9913            thread_state[omp_thread_num].count = count;
9914        }
9915
9916        let mut position = m as FastSint;
9917        for t in (0..omp_num_threads).rev() {
9918            let omp_block_end = if t + 1 < omp_num_threads {
9919                omp_block_stride * (t + 1)
9920            } else {
9921                half_n_usize
9922            };
9923            let count =
9924                m as FastSint + half_n + omp_block_end as FastSint - thread_state[t].position;
9925            if count > 0 {
9926                position -= count;
9927                let dst = usize::try_from(position).expect("destination must be non-negative");
9928                let src =
9929                    usize::try_from(thread_state[t].position).expect("source must be non-negative");
9930                let len = usize::try_from(count).expect("length must be non-negative");
9931                sa.copy_within(src..src + len, dst);
9932            }
9933        }
9934
9935        let mut position = n as FastSint + fs as FastSint;
9936        for t in (0..omp_num_threads).rev() {
9937            let omp_block_end = if t + 1 < omp_num_threads {
9938                omp_block_stride * (t + 1)
9939            } else {
9940                half_n_usize
9941            };
9942            let count = m as FastSint + omp_block_end as FastSint - thread_state[t].count;
9943            if count > 0 {
9944                position -= count;
9945                let dst = usize::try_from(position).expect("destination must be non-negative");
9946                let src =
9947                    usize::try_from(thread_state[t].count).expect("source must be non-negative");
9948                let len = usize::try_from(count).expect("length must be non-negative");
9949                sa.copy_within(src..src + len, dst);
9950            }
9951        }
9952    }
9953
9954    let copy_dst = usize::try_from(n + fs - m).expect("copy destination must be non-negative");
9955    let copy_src = usize::try_from(m - f).expect("copy source must be non-negative");
9956    let copy_len = usize::try_from(f).expect("copy length must be non-negative");
9957    sa.copy_within(copy_src..copy_src + copy_len, copy_dst);
9958}
9959
/// Internal helper: compact lms suffixes 32s (OpenMP variant).
///
/// Runs the two compaction stages back to back:
/// `renumber_unique_and_nonunique_lms_suffixes_32s_omp` first, then
/// `compact_unique_and_nonunique_lms_suffixes_32s_omp`, which receives the
/// value `f` returned by the first stage.
///
/// Returns that same `f`.
#[doc(hidden)]
pub fn compact_lms_suffixes_32s_omp(
    t: &mut [SaSint],
    sa: &mut [SaSint],
    n: SaSint,
    m: SaSint,
    fs: SaSint,
    threads: SaSint,
    thread_state: &mut [ThreadState],
) -> SaSint {
    // `f` is produced by the renumber pass and consumed by the compaction pass.
    let f = renumber_unique_and_nonunique_lms_suffixes_32s_omp(t, sa, m, threads, thread_state);
    compact_unique_and_nonunique_lms_suffixes_32s_omp(sa, n, m, fs, f, threads, thread_state);
    f
}
9975
9976/// Internal helper: merge unique lms suffixes 32s.
9977#[doc(hidden)]
9978pub fn merge_unique_lms_suffixes_32s(
9979    t: &mut [SaSint],
9980    sa: &mut [SaSint],
9981    n: SaSint,
9982    m: SaSint,
9983    l: FastSint,
9984    omp_block_start: FastSint,
9985    omp_block_size: FastSint,
9986) {
9987    if omp_block_size <= 0 {
9988        return;
9989    }
9990
9991    let n_usize = usize::try_from(n).expect("n must be non-negative");
9992    let m_usize = usize::try_from(m).expect("m must be non-negative");
9993    let mut src_index = n_usize - m_usize - 1 + usize::try_from(l).expect("l must be non-negative");
9994    let mut tmp = sa[src_index] as FastSint;
9995    src_index += 1;
9996
9997    let mut i = usize::try_from(omp_block_start).expect("omp_block_start must be non-negative");
9998    let block_end =
9999        i + usize::try_from(omp_block_size).expect("omp_block_size must be non-negative");
10000    let j = block_end.saturating_sub(6);
10001    while i < j {
10002        let c0 = t[i];
10003        if c0 < 0 {
10004            t[i] = c0 & SAINT_MAX;
10005            sa[usize::try_from(tmp).expect("target slot must be non-negative")] = i as SaSint;
10006            i += 1;
10007            tmp = sa[src_index] as FastSint;
10008            src_index += 1;
10009        }
10010
10011        let c1 = t[i + 1];
10012        if c1 < 0 {
10013            t[i + 1] = c1 & SAINT_MAX;
10014            sa[usize::try_from(tmp).expect("target slot must be non-negative")] = i as SaSint + 1;
10015            i += 1;
10016            tmp = sa[src_index] as FastSint;
10017            src_index += 1;
10018        }
10019
10020        let c2 = t[i + 2];
10021        if c2 < 0 {
10022            t[i + 2] = c2 & SAINT_MAX;
10023            sa[usize::try_from(tmp).expect("target slot must be non-negative")] = i as SaSint + 2;
10024            i += 1;
10025            tmp = sa[src_index] as FastSint;
10026            src_index += 1;
10027        }
10028
10029        let c3 = t[i + 3];
10030        if c3 < 0 {
10031            t[i + 3] = c3 & SAINT_MAX;
10032            sa[usize::try_from(tmp).expect("target slot must be non-negative")] = i as SaSint + 3;
10033            i += 1;
10034            tmp = sa[src_index] as FastSint;
10035            src_index += 1;
10036        }
10037
10038        i += 4;
10039    }
10040
10041    while i < block_end {
10042        let c = t[i];
10043        if c < 0 {
10044            t[i] = c & SAINT_MAX;
10045            sa[usize::try_from(tmp).expect("target slot must be non-negative")] = i as SaSint;
10046            i += 1;
10047            tmp = sa[src_index] as FastSint;
10048            src_index += 1;
10049        }
10050        i += 1;
10051    }
10052}
10053
10054/// Internal helper: merge nonunique lms suffixes 32s.
10055#[doc(hidden)]
10056pub fn merge_nonunique_lms_suffixes_32s(
10057    sa: &mut [SaSint],
10058    n: SaSint,
10059    m: SaSint,
10060    l: FastSint,
10061    omp_block_start: FastSint,
10062    omp_block_size: FastSint,
10063) {
10064    if omp_block_size <= 0 {
10065        return;
10066    }
10067
10068    let n_usize = usize::try_from(n).expect("n must be non-negative");
10069    let m_usize = usize::try_from(m).expect("m must be non-negative");
10070    let mut src_index = n_usize - m_usize - 1 + usize::try_from(l).expect("l must be non-negative");
10071    let mut tmp = sa[src_index];
10072    src_index += 1;
10073
10074    let mut i = usize::try_from(omp_block_start).expect("omp_block_start must be non-negative");
10075    let block_end =
10076        i + usize::try_from(omp_block_size).expect("omp_block_size must be non-negative");
10077    let j = block_end.saturating_sub(3);
10078    while i < j {
10079        if sa[i] == 0 {
10080            sa[i] = tmp;
10081            tmp = sa[src_index];
10082            src_index += 1;
10083        }
10084        if sa[i + 1] == 0 {
10085            sa[i + 1] = tmp;
10086            tmp = sa[src_index];
10087            src_index += 1;
10088        }
10089        if sa[i + 2] == 0 {
10090            sa[i + 2] = tmp;
10091            tmp = sa[src_index];
10092            src_index += 1;
10093        }
10094        if sa[i + 3] == 0 {
10095            sa[i + 3] = tmp;
10096            tmp = sa[src_index];
10097            src_index += 1;
10098        }
10099        i += 4;
10100    }
10101
10102    while i < block_end {
10103        if sa[i] == 0 {
10104            sa[i] = tmp;
10105            tmp = sa[src_index];
10106            src_index += 1;
10107        }
10108        i += 1;
10109    }
10110}
10111
10112/// Internal helper: merge unique lms suffixes 32s (OpenMP variant).
10113#[doc(hidden)]
10114pub fn merge_unique_lms_suffixes_32s_omp(
10115    t: &mut [SaSint],
10116    sa: &mut [SaSint],
10117    n: SaSint,
10118    m: SaSint,
10119    threads: SaSint,
10120    thread_state: &mut [ThreadState],
10121) {
10122    if threads == 1 || n < 65_536 {
10123        merge_unique_lms_suffixes_32s(t, sa, n, m, 0, 0, n as FastSint);
10124        return;
10125    }
10126
10127    let threads_usize = usize::try_from(threads)
10128        .expect("threads must be non-negative")
10129        .max(1);
10130    let n_usize = usize::try_from(n).expect("n must be non-negative");
10131    let omp_num_threads = threads_usize.min(n_usize.max(1));
10132    let omp_block_stride = (n_usize / omp_num_threads) & !15usize;
10133
10134    for omp_thread_num in 0..omp_num_threads {
10135        let omp_block_start = omp_thread_num * omp_block_stride;
10136        let omp_block_size = if omp_thread_num + 1 < omp_num_threads {
10137            omp_block_stride
10138        } else {
10139            n_usize - omp_block_start
10140        };
10141
10142        thread_state[omp_thread_num].count = count_negative_marked_suffixes(
10143            t,
10144            omp_block_start as FastSint,
10145            omp_block_size as FastSint,
10146        ) as FastSint;
10147    }
10148
10149    let mut count = 0 as FastSint;
10150    for omp_thread_num in 0..omp_num_threads {
10151        let omp_block_start = omp_thread_num * omp_block_stride;
10152        let omp_block_size = if omp_thread_num + 1 < omp_num_threads {
10153            omp_block_stride
10154        } else {
10155            n_usize - omp_block_start
10156        };
10157
10158        merge_unique_lms_suffixes_32s(
10159            t,
10160            sa,
10161            n,
10162            m,
10163            count,
10164            omp_block_start as FastSint,
10165            omp_block_size as FastSint,
10166        );
10167        count += thread_state[omp_thread_num].count;
10168    }
10169}
10170
10171/// Internal helper: merge nonunique lms suffixes 32s (OpenMP variant).
10172#[doc(hidden)]
10173pub fn merge_nonunique_lms_suffixes_32s_omp(
10174    sa: &mut [SaSint],
10175    n: SaSint,
10176    m: SaSint,
10177    f: SaSint,
10178    threads: SaSint,
10179    thread_state: &mut [ThreadState],
10180) {
10181    if threads == 1 || m < 65_536 {
10182        merge_nonunique_lms_suffixes_32s(sa, n, m, f as FastSint, 0, m as FastSint);
10183        return;
10184    }
10185
10186    let threads_usize = usize::try_from(threads)
10187        .expect("threads must be non-negative")
10188        .max(1);
10189    let m_usize = usize::try_from(m).expect("m must be non-negative");
10190    let omp_num_threads = threads_usize.min(m_usize.max(1));
10191    let omp_block_stride = (m_usize / omp_num_threads) & !15usize;
10192
10193    for omp_thread_num in 0..omp_num_threads {
10194        let omp_block_start = omp_thread_num * omp_block_stride;
10195        let omp_block_size = if omp_thread_num + 1 < omp_num_threads {
10196            omp_block_stride
10197        } else {
10198            m_usize - omp_block_start
10199        };
10200
10201        thread_state[omp_thread_num].count =
10202            count_zero_marked_suffixes(sa, omp_block_start as FastSint, omp_block_size as FastSint)
10203                as FastSint;
10204    }
10205
10206    let mut count = f as FastSint;
10207    for omp_thread_num in 0..omp_num_threads {
10208        let omp_block_start = omp_thread_num * omp_block_stride;
10209        let omp_block_size = if omp_thread_num + 1 < omp_num_threads {
10210            omp_block_stride
10211        } else {
10212            m_usize - omp_block_start
10213        };
10214
10215        merge_nonunique_lms_suffixes_32s(
10216            sa,
10217            n,
10218            m,
10219            count,
10220            omp_block_start as FastSint,
10221            omp_block_size as FastSint,
10222        );
10223        count += thread_state[omp_thread_num].count;
10224    }
10225}
10226
/// Internal helper: merge compacted lms suffixes 32s (OpenMP variant).
///
/// Runs the two merge stages in a fixed order:
/// `merge_unique_lms_suffixes_32s_omp` over `t`/`sa` first, then
/// `merge_nonunique_lms_suffixes_32s_omp` over `sa` with offset `f`.
/// Both stages receive the same `threads` and `thread_state`.
#[doc(hidden)]
pub fn merge_compacted_lms_suffixes_32s_omp(
    t: &mut [SaSint],
    sa: &mut [SaSint],
    n: SaSint,
    m: SaSint,
    f: SaSint,
    threads: SaSint,
    thread_state: &mut [ThreadState],
) {
    merge_unique_lms_suffixes_32s_omp(t, sa, n, m, threads, thread_state);
    merge_nonunique_lms_suffixes_32s_omp(sa, n, m, f, threads, thread_state);
}
10241
10242/// Internal helper: reconstruct compacted lms suffixes 32s 2k (OpenMP variant).
10243#[doc(hidden)]
10244pub fn reconstruct_compacted_lms_suffixes_32s_2k_omp(
10245    t: &mut [SaSint],
10246    sa: &mut [SaSint],
10247    n: SaSint,
10248    k: SaSint,
10249    m: SaSint,
10250    fs: SaSint,
10251    f: SaSint,
10252    buckets: &mut [SaSint],
10253    local_buckets: SaSint,
10254    threads: SaSint,
10255    thread_state: &mut [ThreadState],
10256) {
10257    if f > 0 {
10258        let dst = usize::try_from(n - m - 1).expect("destination must be non-negative");
10259        let src = usize::try_from(n + fs - m).expect("source must be non-negative");
10260        let len = usize::try_from(f).expect("length must be non-negative");
10261        sa.copy_within(src..src + len, dst);
10262
10263        let _ = count_and_gather_compacted_lms_suffixes_32s_2k_omp(
10264            t,
10265            sa,
10266            n,
10267            k,
10268            buckets,
10269            local_buckets,
10270            threads,
10271            thread_state,
10272        );
10273        reconstruct_lms_suffixes_omp(sa, n, m - f, threads);
10274
10275        let src_copy = 0usize;
10276        let dst_copy = usize::try_from(n - m - 1 + f).expect("destination must be non-negative");
10277        let copy_len = usize::try_from(m - f).expect("copy length must be non-negative");
10278        sa.copy_within(src_copy..src_copy + copy_len, dst_copy);
10279        sa[..usize::try_from(m).expect("m must be non-negative")].fill(0);
10280
10281        merge_compacted_lms_suffixes_32s_omp(t, sa, n, m, f, threads, thread_state);
10282    } else {
10283        let _ = count_and_gather_lms_suffixes_32s_2k(t, sa, n, k, buckets, 0, n as FastSint);
10284        reconstruct_lms_suffixes_omp(sa, n, m, threads);
10285    }
10286}
10287
10288/// Internal helper: reconstruct compacted lms suffixes 32s 1k (OpenMP variant).
10289#[doc(hidden)]
10290pub fn reconstruct_compacted_lms_suffixes_32s_1k_omp(
10291    t: &mut [SaSint],
10292    sa: &mut [SaSint],
10293    n: SaSint,
10294    m: SaSint,
10295    fs: SaSint,
10296    f: SaSint,
10297    threads: SaSint,
10298    thread_state: &mut [ThreadState],
10299) {
10300    if f > 0 {
10301        let dst = usize::try_from(n - m - 1).expect("destination must be non-negative");
10302        let src = usize::try_from(n + fs - m).expect("source must be non-negative");
10303        let len = usize::try_from(f).expect("length must be non-negative");
10304        sa.copy_within(src..src + len, dst);
10305
10306        let _ = gather_compacted_lms_suffixes_32s(t, sa, n);
10307        reconstruct_lms_suffixes_omp(sa, n, m - f, threads);
10308
10309        let dst_copy = usize::try_from(n - m - 1 + f).expect("destination must be non-negative");
10310        let copy_len = usize::try_from(m - f).expect("copy length must be non-negative");
10311        sa.copy_within(0..copy_len, dst_copy);
10312        sa[..usize::try_from(m).expect("m must be non-negative")].fill(0);
10313
10314        merge_compacted_lms_suffixes_32s_omp(t, sa, n, m, f, threads, thread_state);
10315    } else {
10316        let _ = gather_lms_suffixes_32s(t, sa, n);
10317        reconstruct_lms_suffixes_omp(sa, n, m, threads);
10318    }
10319}
10320
10321fn normalize_omp_threads(threads: SaSint) -> SaSint {
10322    if threads > 0 {
10323        threads
10324    } else {
10325        std::thread::available_parallelism()
10326            .map(|value| value.get() as SaSint)
10327            .unwrap_or(1)
10328            .max(1)
10329    }
10330}
10331
10332fn libsais64_main_32s_recursion(
10333    t_ptr: *mut SaSint,
10334    sa_ptr: *mut SaSint,
10335    sa_capacity: usize,
10336    n: SaSint,
10337    k: SaSint,
10338    fs: SaSint,
10339    threads: SaSint,
10340    thread_state: &mut [ThreadState],
10341    _local_buffer: &mut [SaSint],
10342) -> SaSint {
10343    let fs = fs.min(SAINT_MAX - n);
10344    let local_buffer_size = SaSint::try_from(LIBSAIS_LOCAL_BUFFER_SIZE).expect("fits");
10345    let n_usize = usize::try_from(n).expect("n must be non-negative");
10346    let fs_usize = usize::try_from(fs).expect("fs must be non-negative");
10347    let total_len = n_usize + fs_usize;
10348    assert!(total_len <= sa_capacity);
10349
10350    if k > 0 && n <= i32::MAX as SaSint {
10351        let int32_max = i32::MAX as SaSint;
10352        let expanded_space = fs as i128 + fs as i128 + n as i128 + n as i128;
10353        let new_fs = if expanded_space <= int32_max as i128 {
10354            fs + fs + n
10355        } else {
10356            int32_max - n
10357        };
10358
10359        if (new_fs / k >= 6)
10360            || (new_fs / k >= 4 && n <= int32_max / 2)
10361            || (new_fs / k < 4 && new_fs >= fs)
10362        {
10363            let mut t32 = unsafe { std::slice::from_raw_parts(t_ptr, n_usize) }
10364                .iter()
10365                .map(|&value| (value as u64 as u32) as i32)
10366                .collect::<Vec<_>>();
10367            let mut sa32 = vec![0i32; n_usize + new_fs as usize];
10368
10369            let index = crate::libsais_int_omp(
10370                &mut t32,
10371                &mut sa32,
10372                k as i32,
10373                new_fs as i32,
10374                threads as i32,
10375            );
10376            if index >= 0 {
10377                unsafe {
10378                    let t = std::slice::from_raw_parts_mut(t_ptr, n_usize);
10379                    for (dst, src) in t.iter_mut().zip(t32.iter()) {
10380                        *dst = (*src as u32) as SaSint;
10381                    }
10382
10383                    let sa = std::slice::from_raw_parts_mut(sa_ptr, n_usize);
10384                    for (dst, src) in sa.iter_mut().zip(sa32.iter()) {
10385                        *dst = (*src as u32) as SaSint;
10386                    }
10387                }
10388            }
10389
10390            return index as SaSint;
10391        }
10392    }
10393
10394    if k > 0 && ((fs / k) >= 6 || (local_buffer_size / k) >= 6) {
10395        let k_usize = usize::try_from(k).expect("k must be non-negative");
10396        let alignment = if fs >= 1024 && ((fs - 1024) / k) >= 6 {
10397            1024usize
10398        } else {
10399            16usize
10400        };
10401        let need = 6 * k_usize;
10402        let use_local_buffer = local_buffer_size > fs;
10403        let buckets_ptr = if use_local_buffer {
10404            _local_buffer.as_mut_ptr()
10405        } else {
10406            unsafe {
10407                let sa = std::slice::from_raw_parts_mut(sa_ptr, total_len);
10408                let start =
10409                    if fs_usize >= need + alignment && ((fs_usize - alignment) / k_usize) >= 6 {
10410                        let byte_ptr = sa[total_len - need - alignment..].as_mut_ptr() as usize;
10411                        let aligned = align_up(byte_ptr, alignment * mem::size_of::<SaSint>());
10412                        (aligned - sa_ptr as usize) / mem::size_of::<SaSint>()
10413                    } else {
10414                        total_len - need
10415                    };
10416                sa[start..].as_mut_ptr()
10417            }
10418        };
10419
10420        let m = unsafe {
10421            let t = std::slice::from_raw_parts_mut(t_ptr, n_usize);
10422            let sa = std::slice::from_raw_parts_mut(sa_ptr, total_len);
10423            let buckets = std::slice::from_raw_parts_mut(buckets_ptr, need);
10424            count_and_gather_lms_suffixes_32s_4k_omp(
10425                t,
10426                sa,
10427                n,
10428                k,
10429                buckets,
10430                SaSint::from(use_local_buffer),
10431                threads,
10432                thread_state,
10433            )
10434        };
10435        if m > 1 {
10436            let m_usize = usize::try_from(m).expect("m must be non-negative");
10437            unsafe {
10438                let sa = std::slice::from_raw_parts_mut(sa_ptr, total_len);
10439                sa[..n_usize - m_usize].fill(0);
10440            }
10441
10442            let first_lms_suffix = unsafe {
10443                let sa = std::slice::from_raw_parts_mut(sa_ptr, total_len);
10444                sa[n_usize - m_usize]
10445            };
10446            let left_suffixes_count = unsafe {
10447                let buckets = std::slice::from_raw_parts_mut(buckets_ptr, need);
10448                initialize_buckets_for_lms_suffixes_radix_sort_32s_6k(
10449                    std::slice::from_raw_parts_mut(t_ptr, n_usize),
10450                    k,
10451                    buckets,
10452                    first_lms_suffix,
10453                )
10454            };
10455
10456            unsafe {
10457                let t = std::slice::from_raw_parts_mut(t_ptr, n_usize);
10458                let sa = std::slice::from_raw_parts_mut(sa_ptr, total_len);
10459                let buckets = std::slice::from_raw_parts_mut(buckets_ptr, need);
10460                let (_, induction_bucket) = buckets.split_at_mut(4 * k_usize);
10461                radix_sort_lms_suffixes_32s_6k_omp(
10462                    t,
10463                    sa,
10464                    n,
10465                    m,
10466                    induction_bucket,
10467                    threads,
10468                    thread_state,
10469                );
10470                if (n / 8192) < k {
10471                    radix_sort_set_markers_32s_6k_omp(sa, k, induction_bucket, threads);
10472                }
10473                if threads > 1 && n >= 65_536 {
10474                    sa[n_usize - m_usize..n_usize].fill(0);
10475                }
10476                initialize_buckets_for_partial_sorting_32s_6k(
10477                    t,
10478                    k,
10479                    buckets,
10480                    first_lms_suffix,
10481                    left_suffixes_count,
10482                );
10483                induce_partial_order_32s_6k_omp(
10484                    t,
10485                    sa,
10486                    n,
10487                    k,
10488                    buckets,
10489                    first_lms_suffix,
10490                    left_suffixes_count,
10491                    threads,
10492                    thread_state,
10493                );
10494            }
10495
10496            let names = unsafe {
10497                let sa = std::slice::from_raw_parts_mut(sa_ptr, total_len);
10498                if (n / 8192) < k {
10499                    renumber_and_mark_distinct_lms_suffixes_32s_4k_omp(
10500                        sa,
10501                        n,
10502                        m,
10503                        threads,
10504                        thread_state,
10505                    )
10506                } else {
10507                    renumber_and_gather_lms_suffixes_omp(sa, n, m, fs, threads, thread_state)
10508                }
10509            };
10510
10511            if names < m {
10512                let f = if (n / 8192) < k {
10513                    unsafe {
10514                        let t = std::slice::from_raw_parts_mut(t_ptr, n_usize);
10515                        let sa = std::slice::from_raw_parts_mut(sa_ptr, total_len);
10516                        compact_lms_suffixes_32s_omp(t, sa, n, m, fs, threads, thread_state)
10517                    }
10518                } else {
10519                    0
10520                };
10521
10522                let new_t_start =
10523                    total_len - usize::try_from(m - f).expect("m - f must be non-negative");
10524                if libsais64_main_32s_recursion(
10525                    unsafe {
10526                        std::slice::from_raw_parts_mut(sa_ptr, total_len)[new_t_start..]
10527                            .as_mut_ptr()
10528                    },
10529                    sa_ptr,
10530                    sa_capacity,
10531                    m - f,
10532                    names - f,
10533                    fs + n - 2 * m + f,
10534                    threads,
10535                    thread_state,
10536                    _local_buffer,
10537                ) != 0
10538                {
10539                    return -2;
10540                }
10541
10542                unsafe {
10543                    let t = std::slice::from_raw_parts_mut(t_ptr, n_usize);
10544                    let sa = std::slice::from_raw_parts_mut(sa_ptr, total_len);
10545                    let buckets = std::slice::from_raw_parts_mut(buckets_ptr, need);
10546                    reconstruct_compacted_lms_suffixes_32s_2k_omp(
10547                        t,
10548                        sa,
10549                        n,
10550                        k,
10551                        m,
10552                        fs,
10553                        f,
10554                        buckets,
10555                        SaSint::from(use_local_buffer),
10556                        threads,
10557                        thread_state,
10558                    );
10559                }
10560            } else {
10561                unsafe {
10562                    let t = std::slice::from_raw_parts_mut(t_ptr, n_usize);
10563                    let buckets = std::slice::from_raw_parts_mut(buckets_ptr, need);
10564                    count_lms_suffixes_32s_2k(t, n, k, buckets);
10565                }
10566            }
10567
10568            unsafe {
10569                let buckets = std::slice::from_raw_parts_mut(buckets_ptr, need);
10570                initialize_buckets_start_and_end_32s_4k(k, buckets);
10571                let sa = std::slice::from_raw_parts_mut(sa_ptr, total_len);
10572                place_lms_suffixes_histogram_32s_4k(sa, n, k, m, buckets);
10573                let t = std::slice::from_raw_parts_mut(t_ptr, n_usize);
10574                induce_final_order_32s_4k(t, sa, n, k, buckets, threads, thread_state);
10575            }
10576        } else {
10577            unsafe {
10578                let sa = std::slice::from_raw_parts_mut(sa_ptr, total_len);
10579                sa[0] = sa[n_usize - 1];
10580            }
10581
10582            unsafe {
10583                let buckets = std::slice::from_raw_parts_mut(buckets_ptr, need);
10584                initialize_buckets_start_and_end_32s_6k(k, buckets);
10585                let sa = std::slice::from_raw_parts_mut(sa_ptr, total_len);
10586                place_lms_suffixes_histogram_32s_6k(sa, n, k, m, buckets);
10587                let t = std::slice::from_raw_parts_mut(t_ptr, n_usize);
10588                induce_final_order_32s_6k(t, sa, n, k, buckets, threads, thread_state);
10589            }
10590        }
10591
10592        return 0;
10593    } else if k > 0 && n <= SAINT_MAX / 2 && ((fs / k) >= 4 || (local_buffer_size / k) >= 4) {
10594        let k_usize = usize::try_from(k).expect("k must be non-negative");
10595        let alignment = if fs >= 1024 && ((fs - 1024) / k) >= 4 {
10596            1024usize
10597        } else {
10598            16usize
10599        };
10600        let need = 4 * k_usize;
10601        let use_local_buffer = local_buffer_size > fs;
10602        let buckets_ptr = if use_local_buffer {
10603            _local_buffer.as_mut_ptr()
10604        } else {
10605            unsafe {
10606                let sa = std::slice::from_raw_parts_mut(sa_ptr, total_len);
10607                let start =
10608                    if fs_usize >= need + alignment && ((fs_usize - alignment) / k_usize) >= 4 {
10609                        let byte_ptr = sa[total_len - need - alignment..].as_mut_ptr() as usize;
10610                        let aligned = align_up(byte_ptr, alignment * mem::size_of::<SaSint>());
10611                        (aligned - sa_ptr as usize) / mem::size_of::<SaSint>()
10612                    } else {
10613                        total_len - need
10614                    };
10615                sa[start..].as_mut_ptr()
10616            }
10617        };
10618
10619        let m = unsafe {
10620            let t = std::slice::from_raw_parts_mut(t_ptr, n_usize);
10621            let sa = std::slice::from_raw_parts_mut(sa_ptr, total_len);
10622            let buckets = std::slice::from_raw_parts_mut(buckets_ptr, need);
10623            count_and_gather_lms_suffixes_32s_2k_omp(
10624                t,
10625                sa,
10626                n,
10627                k,
10628                buckets,
10629                SaSint::from(use_local_buffer),
10630                threads,
10631                thread_state,
10632            )
10633        };
10634        if m > 1 {
10635            let m_usize = usize::try_from(m).expect("m must be non-negative");
10636            unsafe {
10637                let t = std::slice::from_raw_parts_mut(t_ptr, n_usize);
10638                let buckets = std::slice::from_raw_parts_mut(buckets_ptr, need);
10639                let sa = std::slice::from_raw_parts_mut(sa_ptr, total_len);
10640                initialize_buckets_for_radix_and_partial_sorting_32s_4k(
10641                    t,
10642                    k,
10643                    buckets,
10644                    sa[n_usize - m_usize],
10645                );
10646                let (_, induction_bucket) = buckets.split_at_mut(1);
10647                radix_sort_lms_suffixes_32s_2k_omp(
10648                    t,
10649                    sa,
10650                    n,
10651                    m,
10652                    induction_bucket,
10653                    threads,
10654                    thread_state,
10655                );
10656                radix_sort_set_markers_32s_4k_omp(sa, k, induction_bucket, threads);
10657                place_lms_suffixes_interval_32s_4k(sa, n, k, m - 1, buckets);
10658                induce_partial_order_32s_4k_omp(t, sa, n, k, buckets, threads, thread_state);
10659            }
10660
10661            let names = unsafe {
10662                let sa = std::slice::from_raw_parts_mut(sa_ptr, total_len);
10663                renumber_and_mark_distinct_lms_suffixes_32s_4k_omp(sa, n, m, threads, thread_state)
10664            };
10665            if names < m {
10666                let f = unsafe {
10667                    let t = std::slice::from_raw_parts_mut(t_ptr, n_usize);
10668                    let sa = std::slice::from_raw_parts_mut(sa_ptr, total_len);
10669                    compact_lms_suffixes_32s_omp(t, sa, n, m, fs, threads, thread_state)
10670                };
10671
10672                let new_t_start =
10673                    total_len - usize::try_from(m - f).expect("m - f must be non-negative");
10674                if libsais64_main_32s_recursion(
10675                    unsafe {
10676                        std::slice::from_raw_parts_mut(sa_ptr, total_len)[new_t_start..]
10677                            .as_mut_ptr()
10678                    },
10679                    sa_ptr,
10680                    sa_capacity,
10681                    m - f,
10682                    names - f,
10683                    fs + n - 2 * m + f,
10684                    threads,
10685                    thread_state,
10686                    _local_buffer,
10687                ) != 0
10688                {
10689                    return -2;
10690                }
10691
10692                unsafe {
10693                    let t = std::slice::from_raw_parts_mut(t_ptr, n_usize);
10694                    let sa = std::slice::from_raw_parts_mut(sa_ptr, total_len);
10695                    let buckets = std::slice::from_raw_parts_mut(buckets_ptr, need);
10696                    reconstruct_compacted_lms_suffixes_32s_2k_omp(
10697                        t,
10698                        sa,
10699                        n,
10700                        k,
10701                        m,
10702                        fs,
10703                        f,
10704                        buckets,
10705                        SaSint::from(use_local_buffer),
10706                        threads,
10707                        thread_state,
10708                    );
10709                }
10710            } else {
10711                unsafe {
10712                    let t = std::slice::from_raw_parts_mut(t_ptr, n_usize);
10713                    let buckets = std::slice::from_raw_parts_mut(buckets_ptr, need);
10714                    count_lms_suffixes_32s_2k(t, n, k, buckets);
10715                }
10716            }
10717        } else {
10718            unsafe {
10719                let sa = std::slice::from_raw_parts_mut(sa_ptr, total_len);
10720                sa[0] = sa[n_usize - 1];
10721            }
10722        }
10723
10724        unsafe {
10725            let buckets = std::slice::from_raw_parts_mut(buckets_ptr, need);
10726            initialize_buckets_start_and_end_32s_4k(k, buckets);
10727            let sa = std::slice::from_raw_parts_mut(sa_ptr, total_len);
10728            place_lms_suffixes_histogram_32s_4k(sa, n, k, m, buckets);
10729            let t = std::slice::from_raw_parts_mut(t_ptr, n_usize);
10730            induce_final_order_32s_4k(t, sa, n, k, buckets, threads, thread_state);
10731        }
10732
10733        return 0;
10734    } else if k > 0 && ((fs / k) >= 2 || (local_buffer_size / k) >= 2) {
10735        let k_usize = usize::try_from(k).expect("k must be non-negative");
10736        let alignment = if fs >= 1024 && ((fs - 1024) / k) >= 2 {
10737            1024usize
10738        } else {
10739            16usize
10740        };
10741        let need = 2 * k_usize;
10742        let use_local_buffer = local_buffer_size > fs;
10743        let buckets_ptr = if use_local_buffer {
10744            _local_buffer.as_mut_ptr()
10745        } else {
10746            unsafe {
10747                let sa = std::slice::from_raw_parts_mut(sa_ptr, total_len);
10748                let start =
10749                    if fs_usize >= need + alignment && ((fs_usize - alignment) / k_usize) >= 2 {
10750                        let byte_ptr = sa[total_len - need - alignment..].as_mut_ptr() as usize;
10751                        let aligned = align_up(byte_ptr, alignment * mem::size_of::<SaSint>());
10752                        (aligned - sa_ptr as usize) / mem::size_of::<SaSint>()
10753                    } else {
10754                        total_len - need
10755                    };
10756                sa[start..].as_mut_ptr()
10757            }
10758        };
10759
10760        let m = unsafe {
10761            let t = std::slice::from_raw_parts_mut(t_ptr, n_usize);
10762            let sa = std::slice::from_raw_parts_mut(sa_ptr, total_len);
10763            let buckets = std::slice::from_raw_parts_mut(buckets_ptr, need);
10764            count_and_gather_lms_suffixes_32s_2k_omp(
10765                t,
10766                sa,
10767                n,
10768                k,
10769                buckets,
10770                SaSint::from(use_local_buffer),
10771                threads,
10772                thread_state,
10773            )
10774        };
10775        if m > 1 {
10776            let m_usize = usize::try_from(m).expect("m must be non-negative");
10777            unsafe {
10778                let t = std::slice::from_raw_parts_mut(t_ptr, n_usize);
10779                let buckets = std::slice::from_raw_parts_mut(buckets_ptr, need);
10780                let sa = std::slice::from_raw_parts_mut(sa_ptr, total_len);
10781                initialize_buckets_for_lms_suffixes_radix_sort_32s_2k(
10782                    t,
10783                    k,
10784                    buckets,
10785                    sa[n_usize - m_usize],
10786                );
10787                let (_, induction_bucket) = buckets.split_at_mut(1);
10788                radix_sort_lms_suffixes_32s_2k_omp(
10789                    t,
10790                    sa,
10791                    n,
10792                    m,
10793                    induction_bucket,
10794                    threads,
10795                    thread_state,
10796                );
10797                place_lms_suffixes_interval_32s_2k(sa, n, k, m - 1, buckets);
10798            }
10799
10800            unsafe {
10801                let buckets = std::slice::from_raw_parts_mut(buckets_ptr, need);
10802                initialize_buckets_start_and_end_32s_2k(k, buckets);
10803                let t = std::slice::from_raw_parts_mut(t_ptr, n_usize);
10804                let sa = std::slice::from_raw_parts_mut(sa_ptr, total_len);
10805                induce_partial_order_32s_2k_omp(t, sa, n, k, buckets, threads, thread_state);
10806            }
10807
10808            let names = unsafe {
10809                let t = std::slice::from_raw_parts_mut(t_ptr, n_usize);
10810                let sa = std::slice::from_raw_parts_mut(sa_ptr, total_len);
10811                renumber_and_mark_distinct_lms_suffixes_32s_1k_omp(t, sa, n, m, threads)
10812            };
10813            if names < m {
10814                let f = unsafe {
10815                    let t = std::slice::from_raw_parts_mut(t_ptr, n_usize);
10816                    let sa = std::slice::from_raw_parts_mut(sa_ptr, total_len);
10817                    compact_lms_suffixes_32s_omp(t, sa, n, m, fs, threads, thread_state)
10818                };
10819
10820                let new_t_start =
10821                    total_len - usize::try_from(m - f).expect("m - f must be non-negative");
10822                if libsais64_main_32s_recursion(
10823                    unsafe {
10824                        std::slice::from_raw_parts_mut(sa_ptr, total_len)[new_t_start..]
10825                            .as_mut_ptr()
10826                    },
10827                    sa_ptr,
10828                    sa_capacity,
10829                    m - f,
10830                    names - f,
10831                    fs + n - 2 * m + f,
10832                    threads,
10833                    thread_state,
10834                    _local_buffer,
10835                ) != 0
10836                {
10837                    return -2;
10838                }
10839
10840                unsafe {
10841                    let t = std::slice::from_raw_parts_mut(t_ptr, n_usize);
10842                    let sa = std::slice::from_raw_parts_mut(sa_ptr, total_len);
10843                    let buckets = std::slice::from_raw_parts_mut(buckets_ptr, need);
10844                    reconstruct_compacted_lms_suffixes_32s_2k_omp(
10845                        t,
10846                        sa,
10847                        n,
10848                        k,
10849                        m,
10850                        fs,
10851                        f,
10852                        buckets,
10853                        SaSint::from(use_local_buffer),
10854                        threads,
10855                        thread_state,
10856                    );
10857                }
10858            } else {
10859                unsafe {
10860                    let t = std::slice::from_raw_parts_mut(t_ptr, n_usize);
10861                    let buckets = std::slice::from_raw_parts_mut(buckets_ptr, need);
10862                    count_lms_suffixes_32s_2k(t, n, k, buckets);
10863                }
10864            }
10865        } else {
10866            unsafe {
10867                let sa = std::slice::from_raw_parts_mut(sa_ptr, total_len);
10868                sa[0] = sa[n_usize - 1];
10869            }
10870        }
10871
10872        unsafe {
10873            let buckets = std::slice::from_raw_parts_mut(buckets_ptr, need);
10874            initialize_buckets_end_32s_2k(k, buckets);
10875            let sa = std::slice::from_raw_parts_mut(sa_ptr, total_len);
10876            place_lms_suffixes_histogram_32s_2k(sa, n, k, m, buckets);
10877        }
10878
10879        unsafe {
10880            let buckets = std::slice::from_raw_parts_mut(buckets_ptr, need);
10881            initialize_buckets_start_and_end_32s_2k(k, buckets);
10882            let t = std::slice::from_raw_parts_mut(t_ptr, n_usize);
10883            let sa = std::slice::from_raw_parts_mut(sa_ptr, total_len);
10884            induce_final_order_32s_2k(t, sa, n, k, buckets, threads, thread_state);
10885        }
10886
10887        return 0;
10888    } else {
10889        let k_usize = usize::try_from(k).expect("k must be non-negative");
10890        let mut heap_buckets = if fs < k { Some(vec![0; k_usize]) } else { None };
10891        let alignment = if fs >= 1024 && (fs - 1024) >= k {
10892            1024usize
10893        } else {
10894            16usize
10895        };
10896        let mut buckets_ptr = if let Some(ref mut heap) = heap_buckets {
10897            heap.as_mut_ptr()
10898        } else {
10899            unsafe {
10900                let sa = std::slice::from_raw_parts_mut(sa_ptr, total_len);
10901                let start = if fs_usize >= k_usize + alignment {
10902                    let byte_ptr = sa[total_len - k_usize - alignment..].as_mut_ptr() as usize;
10903                    let aligned = align_up(byte_ptr, alignment * mem::size_of::<SaSint>());
10904                    (aligned - sa_ptr as usize) / mem::size_of::<SaSint>()
10905                } else {
10906                    total_len - k_usize
10907                };
10908                sa[start..].as_mut_ptr()
10909            }
10910        };
10911
10912        if buckets_ptr.is_null() {
10913            return -2;
10914        }
10915
10916        unsafe {
10917            let sa = std::slice::from_raw_parts_mut(sa_ptr, total_len);
10918            sa[..n_usize].fill(0);
10919        }
10920
10921        unsafe {
10922            let t = std::slice::from_raw_parts_mut(t_ptr, n_usize);
10923            let buckets = std::slice::from_raw_parts_mut(buckets_ptr, k_usize);
10924            count_suffixes_32s(t, n, k, buckets);
10925        }
10926        unsafe {
10927            let buckets = std::slice::from_raw_parts_mut(buckets_ptr, k_usize);
10928            initialize_buckets_end_32s_1k(k, buckets);
10929        }
10930
10931        let m = unsafe {
10932            let t = std::slice::from_raw_parts_mut(t_ptr, n_usize);
10933            let sa = std::slice::from_raw_parts_mut(sa_ptr, total_len);
10934            let buckets = std::slice::from_raw_parts_mut(buckets_ptr, k_usize);
10935            radix_sort_lms_suffixes_32s_1k(t, sa, n, buckets)
10936        };
10937        if m > 1 {
10938            unsafe {
10939                let t = std::slice::from_raw_parts_mut(t_ptr, n_usize);
10940                let sa = std::slice::from_raw_parts_mut(sa_ptr, total_len);
10941                let buckets = std::slice::from_raw_parts_mut(buckets_ptr, k_usize);
10942                induce_partial_order_32s_1k_omp(t, sa, n, k, buckets, threads, thread_state);
10943            }
10944
10945            let names = unsafe {
10946                let t = std::slice::from_raw_parts_mut(t_ptr, n_usize);
10947                let sa = std::slice::from_raw_parts_mut(sa_ptr, total_len);
10948                renumber_and_mark_distinct_lms_suffixes_32s_1k_omp(t, sa, n, m, threads)
10949            };
10950            if names < m {
10951                if heap_buckets.is_some() {
10952                    let _ = heap_buckets.take();
10953                    buckets_ptr = std::ptr::null_mut();
10954                }
10955
10956                let f = unsafe {
10957                    let t = std::slice::from_raw_parts_mut(t_ptr, n_usize);
10958                    let sa = std::slice::from_raw_parts_mut(sa_ptr, total_len);
10959                    compact_lms_suffixes_32s_omp(t, sa, n, m, fs, threads, thread_state)
10960                };
10961
10962                let new_t_start =
10963                    total_len - usize::try_from(m - f).expect("m - f must be non-negative");
10964                if libsais64_main_32s_recursion(
10965                    unsafe {
10966                        std::slice::from_raw_parts_mut(sa_ptr, total_len)[new_t_start..]
10967                            .as_mut_ptr()
10968                    },
10969                    sa_ptr,
10970                    sa_capacity,
10971                    m - f,
10972                    names - f,
10973                    fs + n - 2 * m + f,
10974                    threads,
10975                    thread_state,
10976                    _local_buffer,
10977                ) != 0
10978                {
10979                    return -2;
10980                }
10981
10982                unsafe {
10983                    let t = std::slice::from_raw_parts_mut(t_ptr, n_usize);
10984                    let sa = std::slice::from_raw_parts_mut(sa_ptr, total_len);
10985                    reconstruct_compacted_lms_suffixes_32s_1k_omp(
10986                        t,
10987                        sa,
10988                        n,
10989                        m,
10990                        fs,
10991                        f,
10992                        threads,
10993                        thread_state,
10994                    );
10995                }
10996
10997                if buckets_ptr.is_null() {
10998                    heap_buckets = Some(vec![0; k_usize]);
10999                    buckets_ptr = heap_buckets
11000                        .as_mut()
11001                        .expect("heap buckets must exist")
11002                        .as_mut_ptr();
11003                    if buckets_ptr.is_null() {
11004                        return -2;
11005                    }
11006                }
11007            }
11008
11009            unsafe {
11010                let t = std::slice::from_raw_parts_mut(t_ptr, n_usize);
11011                let buckets = std::slice::from_raw_parts_mut(buckets_ptr, k_usize);
11012                count_suffixes_32s(t, n, k, buckets);
11013            }
11014            unsafe {
11015                let buckets = std::slice::from_raw_parts_mut(buckets_ptr, k_usize);
11016                initialize_buckets_end_32s_1k(k, buckets);
11017            }
11018            unsafe {
11019                let t = std::slice::from_raw_parts_mut(t_ptr, n_usize);
11020                let sa = std::slice::from_raw_parts_mut(sa_ptr, total_len);
11021                let buckets = std::slice::from_raw_parts_mut(buckets_ptr, k_usize);
11022                place_lms_suffixes_interval_32s_1k(t, sa, k, m, buckets);
11023            }
11024        }
11025
11026        unsafe {
11027            let t = std::slice::from_raw_parts_mut(t_ptr, n_usize);
11028            let sa = std::slice::from_raw_parts_mut(sa_ptr, total_len);
11029            let buckets = std::slice::from_raw_parts_mut(buckets_ptr, k_usize);
11030            induce_final_order_32s_1k(t, sa, n, k, buckets, threads, thread_state);
11031        }
11032
11033        0
11034    }
11035}
11036
11037fn libsais64_main_32s_entry(
11038    t: &mut [SaSint],
11039    sa: &mut [SaSint],
11040    n: SaSint,
11041    k: SaSint,
11042    fs: SaSint,
11043    threads: SaSint,
11044    thread_state: &mut [ThreadState],
11045) -> SaSint {
11046    let mut local_buffer = [0; 2 * LIBSAIS_LOCAL_BUFFER_SIZE];
11047    libsais64_main_32s_recursion(
11048        t.as_mut_ptr(),
11049        sa.as_mut_ptr(),
11050        sa.len(),
11051        n,
11052        k,
11053        fs,
11054        threads,
11055        thread_state,
11056        &mut local_buffer[LIBSAIS_LOCAL_BUFFER_SIZE..],
11057    )
11058}
11059
/// Core driver for byte-alphabet suffix sorting.
///
/// - `t`: the input bytes; its length is used as `n`.
/// - `sa`: output / working array, indexed up to `n + fs` by this function.
/// - `buckets`: bucket workspace shared by the helper stages.
/// - `flags`: mode bits; only `LIBSAIS_FLAGS_GSA` is inspected directly here,
///   the rest is forwarded to the helpers.
/// - `r`, `i`: forwarded untouched to `induce_final_order_8u_omp`.
/// - `fs`: free space after the first `n` entries of `sa`; clamped below.
/// - `freq`: optional table forwarded to `initialize_buckets_start_and_end_8u`.
/// - `threads`, `thread_state`: forwarded to the `_omp` helpers.
///
/// Returns `-1` when the GSA bucket check fails, `-2` when the nested
/// `libsais64_main_32s_entry` call reports a non-zero status, and otherwise
/// the value returned by `induce_final_order_8u_omp`.
fn libsais64_main_8u(
    t: &[u8],
    sa: &mut [SaSint],
    buckets: &mut [SaSint],
    flags: SaSint,
    r: SaSint,
    i: Option<&mut [SaSint]>,
    fs: SaSint,
    freq: Option<&mut [SaSint]>,
    threads: SaSint,
    thread_state: &mut [ThreadState],
) -> SaSint {
    let n = SaSint::try_from(t.len()).expect("input length must fit SaSint");
    let n_usize = usize::try_from(n).expect("n must be non-negative");
    // Clamp the free space so that `n + fs` can never exceed `SAINT_MAX`.
    let fs = fs.min(SAINT_MAX - n);

    let m = count_and_gather_lms_suffixes_8u_omp(t, sa, n, buckets, threads, thread_state);
    let k = initialize_buckets_start_and_end_8u(buckets, freq);

    // In GSA mode the input is rejected unless these three bucket entries hold
    // exactly 0, 0 and 1.
    // NOTE(review): presumably this validates the separator/terminator layout
    // required for generalized suffix arrays -- confirm against the bucket layout.
    if (flags & LIBSAIS_FLAGS_GSA) != 0 && (buckets[0] != 0 || buckets[2] != 0 || buckets[3] != 1) {
        return -1;
    }

    if m > 0 {
        let m_usize = usize::try_from(m).expect("m must be non-negative");
        // The gathered suffixes are read from the last `m` slots of `sa[..n]`.
        let first_lms_suffix = sa[n_usize - m_usize];
        let left_suffixes_count =
            initialize_buckets_for_lms_suffixes_radix_sort_8u(t, buckets, first_lms_suffix);

        // Only for multi-threaded runs on inputs of at least 65,536 bytes:
        // zero the region in front of the gathered suffixes before sorting...
        if threads > 1 && n >= 65_536 {
            sa[..n_usize - m_usize].fill(0);
        }
        radix_sort_lms_suffixes_8u_omp(t, sa, n, m, flags, buckets, threads, thread_state);
        // ...and zero the gathered region itself afterwards.
        if threads > 1 && n >= 65_536 {
            sa[n_usize - m_usize..n_usize].fill(0);
        }

        initialize_buckets_for_partial_sorting_8u(
            t,
            buckets,
            first_lms_suffix,
            left_suffixes_count,
        );
        induce_partial_order_8u_omp(
            t,
            sa,
            n,
            k,
            flags,
            buckets,
            first_lms_suffix,
            left_suffixes_count,
            threads,
            thread_state,
        );

        let names = renumber_and_gather_lms_suffixes_omp(sa, n, m, fs, threads, thread_state);
        // Fewer names than suffixes: solve the reduced problem of length `m`
        // with alphabet size `names` via the integer-symbol entry point.
        if names < m {
            if libsais64_main_32s_entry(
                // The reduced text is the `m` entries starting at `sa[n + fs - m]`.
                // NOTE(review): this slice aliases the tail of `sa`, which is
                // also passed as `&mut` in the next argument -- confirm this is
                // acceptable under Rust's aliasing rules.
                unsafe {
                    std::slice::from_raw_parts_mut(
                        sa[n_usize + usize::try_from(fs).expect("fs must be non-negative")
                            - m_usize..]
                            .as_mut_ptr(),
                        m_usize,
                    )
                },
                sa,
                m,
                names,
                fs + n - 2 * m,
                threads,
                thread_state,
            ) != 0
            {
                return -2;
            }

            gather_lms_suffixes_8u_omp(t, sa, n, threads, thread_state);
            reconstruct_lms_suffixes_omp(sa, n, m, threads);
        }

        place_lms_suffixes_interval_8u(sa, n, m, flags, buckets);
    } else {
        // No gathered suffixes at all: start the final pass from a zeroed array.
        sa[..n_usize].fill(0);
    }

    induce_final_order_8u_omp(t, sa, n, k, flags, r, i, buckets, threads, thread_state)
}
11149
11150fn libsais64_main(
11151    t: &[u8],
11152    sa: &mut [SaSint],
11153    flags: SaSint,
11154    r: SaSint,
11155    i: Option<&mut [SaSint]>,
11156    fs: SaSint,
11157    freq: Option<&mut [SaSint]>,
11158    threads: SaSint,
11159) -> SaSint {
11160    let threads = normalize_omp_threads(threads);
11161    if threads > 1 {
11162        let mut thread_state = match alloc_thread_state(threads) {
11163            Some(thread_state) => thread_state,
11164            None => return -2,
11165        };
11166        let mut buckets = vec![0; 8 * ALPHABET_SIZE];
11167
11168        libsais64_main_8u(
11169            t,
11170            sa,
11171            &mut buckets,
11172            flags,
11173            r,
11174            i,
11175            fs,
11176            freq,
11177            threads,
11178            &mut thread_state,
11179        )
11180    } else {
11181        let mut thread_state = [];
11182        let mut buckets = [0; 8 * ALPHABET_SIZE];
11183
11184        libsais64_main_8u(
11185            t,
11186            sa,
11187            &mut buckets,
11188            flags,
11189            r,
11190            i,
11191            fs,
11192            freq,
11193            threads,
11194            &mut thread_state,
11195        )
11196    }
11197}
11198
11199fn libsais64_main_int(
11200    t: &mut [SaSint],
11201    sa: &mut [SaSint],
11202    k: SaSint,
11203    fs: SaSint,
11204    threads: SaSint,
11205) -> SaSint {
11206    let threads = normalize_omp_threads(threads);
11207    let mut thread_state = if threads > 1 {
11208        match alloc_thread_state(threads) {
11209            Some(thread_state) => thread_state,
11210            None => return -2,
11211        }
11212    } else {
11213        Vec::new()
11214    };
11215
11216    libsais64_main_32s_entry(
11217        t,
11218        sa,
11219        SaSint::try_from(t.len()).expect("input length must fit SaSint"),
11220        k,
11221        fs,
11222        threads,
11223        &mut thread_state,
11224    )
11225}
11226
/// Short-name alias for `libsais64_main_32s_recursion`.
///
/// Forwards every argument unchanged and returns the callee's status code.
// NOTE(review): `dead_code` is allowed because nothing in the visible code
// calls this alias; presumably it is kept to mirror upstream naming -- confirm
// it is still needed.
#[allow(dead_code)]
fn main_32s_recursion(
    t_ptr: *mut SaSint,
    sa_ptr: *mut SaSint,
    sa_capacity: usize,
    n: SaSint,
    k: SaSint,
    fs: SaSint,
    threads: SaSint,
    thread_state: &mut [ThreadState],
    local_buffer: &mut [SaSint],
) -> SaSint {
    libsais64_main_32s_recursion(
        t_ptr,
        sa_ptr,
        sa_capacity,
        n,
        k,
        fs,
        threads,
        thread_state,
        local_buffer,
    )
}
11251
/// Short-name alias for `libsais64_main_32s_entry`.
///
/// Forwards every argument unchanged and returns the callee's status code.
// NOTE(review): `dead_code` is allowed because nothing in the visible code
// calls this alias; presumably it is kept to mirror upstream naming -- confirm
// it is still needed.
#[allow(dead_code)]
fn main_32s_entry(
    t: &mut [SaSint],
    sa: &mut [SaSint],
    n: SaSint,
    k: SaSint,
    fs: SaSint,
    threads: SaSint,
    thread_state: &mut [ThreadState],
) -> SaSint {
    libsais64_main_32s_entry(t, sa, n, k, fs, threads, thread_state)
}
11264
/// Short-name alias for `libsais64_main_8u`.
///
/// Forwards every argument unchanged and returns the callee's status code.
// NOTE(review): `dead_code` is allowed because nothing in the visible code
// calls this alias; presumably it is kept to mirror upstream naming -- confirm
// it is still needed.
#[allow(dead_code)]
fn main_8u(
    t: &[u8],
    sa: &mut [SaSint],
    buckets: &mut [SaSint],
    flags: SaSint,
    r: SaSint,
    i: Option<&mut [SaSint]>,
    fs: SaSint,
    freq: Option<&mut [SaSint]>,
    threads: SaSint,
    thread_state: &mut [ThreadState],
) -> SaSint {
    libsais64_main_8u(t, sa, buckets, flags, r, i, fs, freq, threads, thread_state)
}
11280
/// Short-name alias for `libsais64_main_int`.
///
/// Forwards every argument unchanged and returns the callee's status code.
// NOTE(review): `dead_code` is allowed because nothing in the visible code
// calls this alias; presumably it is kept to mirror upstream naming -- confirm
// it is still needed.
#[allow(dead_code)]
fn main_long(
    t: &mut [SaSint],
    sa: &mut [SaSint],
    k: SaSint,
    fs: SaSint,
    threads: SaSint,
) -> SaSint {
    libsais64_main_int(t, sa, k, fs, threads)
}
11291
/// Widens the elements `s[block_start..block_start + block_size]` to `u64`
/// and stores each one at the same index in `d`.
///
/// Panics if any index in that range is out of bounds for `s` or `d`.
#[allow(dead_code)]
fn convert_32u_to_64u(s: &[u32], d: &mut [u64], block_start: usize, block_size: usize) {
    let block_end = block_start + block_size;
    (block_start..block_end).for_each(|idx| d[idx] = u64::from(s[idx]));
}
11298
/// Expands a block of `u32` values in place into pairs of `u32` words.
///
/// For every index `i` in `block_start..block_start + block_size`, walking
/// from the highest index down, the value `v[i]` is written to `v[2 * i]` and
/// `v[2 * i + 1]` is set to zero. Going backwards means a value is read before
/// any write from a lower index can land on it.
///
/// Panics if `2 * i + 1` is out of bounds for `v`.
#[allow(dead_code)]
fn convert_inplace_32u_to_64u(v: &mut [u32], block_start: usize, block_size: usize) {
    let mut idx = block_start + block_size;
    while idx > block_start {
        idx -= 1;
        let low_word = v[idx];
        let slot = 2 * idx;
        v[slot] = low_word;
        v[slot + 1] = 0;
    }
}
11307
/// Compacts a block in place by copying `v[2 * i]` into `v[i]` for every `i`
/// in `block_start..block_start + block_size`, in ascending order.
///
/// Panics if `2 * i` is out of bounds for `v`.
#[allow(dead_code)]
fn convert_inplace_64u_to_32u(v: &mut [u32], block_start: usize, block_size: usize) {
    let block_end = block_start + block_size;
    let mut idx = block_start;
    while idx < block_end {
        let low_word = v[2 * idx];
        v[idx] = low_word;
        idx += 1;
    }
}
11314
11315#[allow(dead_code)]
11316fn convert_inplace_32u_to_64u_omp(v: &mut [u32], n: SaSint, threads: SaSint) {
11317    let mut n = usize::try_from(n).expect("n must be non-negative");
11318    let threads = usize::try_from(threads.max(1)).expect("threads must be non-negative");
11319
11320    while n >= 65_536 {
11321        let block_size = n >> 1;
11322        n -= block_size;
11323
11324        let omp_block_stride = (block_size / threads) & !15usize;
11325        for thread in 0..threads {
11326            let block_start = thread * omp_block_stride;
11327            let size = if thread + 1 < threads {
11328                omp_block_stride
11329            } else {
11330                block_size - block_start
11331            };
11332            convert_inplace_32u_to_64u(v, n + block_start, size);
11333        }
11334    }
11335
11336    convert_inplace_32u_to_64u(v, 0, n);
11337}
11338
11339fn libsais64_main_ctx(
11340    ctx: &mut Context,
11341    t: &[u8],
11342    sa: &mut [SaSint],
11343    flags: SaSint,
11344    r: SaSint,
11345    i: Option<&mut [SaSint]>,
11346    fs: SaSint,
11347    freq: Option<&mut [SaSint]>,
11348) -> SaSint {
11349    if ctx.threads <= 0 || ctx.buckets.len() != 8 * ALPHABET_SIZE {
11350        return -2;
11351    }
11352
11353    let mut empty_thread_state = [];
11354    let thread_state = if ctx.threads > 1 {
11355        match ctx.thread_state.as_deref_mut() {
11356            Some(thread_state) if thread_state.len() >= ctx.threads as usize => thread_state,
11357            None => return -2,
11358            Some(_) => return -2,
11359        }
11360    } else {
11361        &mut empty_thread_state
11362    };
11363
11364    libsais64_main_8u(
11365        t,
11366        sa,
11367        &mut ctx.buckets,
11368        flags,
11369        r,
11370        i,
11371        fs,
11372        freq,
11373        ctx.threads as SaSint,
11374        thread_state,
11375    )
11376}
11377
11378/// Constructs the suffix array of a given string.
11379///
11380/// - `t` (`[0..n-1]`): the input string.
11381/// - `sa` (`[0..n-1+fs]`): the output array of suffixes.
11382/// - `fs`: extra space available at the end of `sa` (0 should be enough for most cases).
11383/// - `freq` (`[0..255]`): optional output symbol frequency table.
11384///
11385/// Returns 0 on success, -1 or -2 on error.
11386pub fn libsais64(t: &[u8], sa: &mut [SaSint], fs: SaSint, freq: Option<&mut [SaSint]>) -> SaSint {
11387    if fs < 0
11388        || sa.len()
11389            < t.len()
11390                .saturating_add(usize::try_from(fs).unwrap_or(usize::MAX))
11391    {
11392        return -1;
11393    }
11394    if let Some(freq) = freq.as_ref() {
11395        if freq.len() < ALPHABET_SIZE {
11396            return -1;
11397        }
11398    }
11399
11400    let n = t.len();
11401    if n <= 1 {
11402        if let Some(freq) = freq {
11403            freq[..ALPHABET_SIZE].fill(0);
11404            if n == 1 {
11405                freq[t[0] as usize] += 1;
11406            }
11407        }
11408        if n == 1 {
11409            sa[0] = 0;
11410        }
11411        return 0;
11412    }
11413
11414    if n <= i32::MAX as usize {
11415        return libsais64_run_32bit_omp(t, sa, fs, freq, 1, false)
11416            .expect("n <= INT32_MAX must have 32-bit workspace");
11417    }
11418
11419    libsais64_main(t, sa, LIBSAIS_FLAGS_NONE, 0, None, fs, freq, 1)
11420}
11421
#[cfg(feature = "upstream-c")]
unsafe extern "C" {
    /// Foreign entry point used by the `upstream-c` wrappers in this file.
    ///
    /// Takes raw pointers to the input text `t` (with its length `n`), the
    /// output array `sa`, the extra-space count `fs`, a frequency table
    /// pointer `freq` (the wrappers pass null when no table is supplied) and
    /// a thread count.
    ///
    /// NOTE(review): presumably a thin C shim over upstream `libsais64_omp`;
    /// its definition is not in this file — confirm against the C sources.
    fn probe_public_libsais64_omp_freq(
        t: *const u8,
        sa: *mut SaSint,
        n: SaSint,
        fs: SaSint,
        freq: *mut SaSint,
        threads: SaSint,
    ) -> SaSint;
}
11433
/// Wrapper around the bundled upstream C `libsais64_omp` implementation.
///
/// Available only with the `upstream-c` feature. Validates the arguments in
/// Rust and then defers all work to the C library; intended for the
/// differential test suite and benchmarks.
///
/// - `t` (`[0..n-1]`): the input string.
/// - `sa` (`[0..n-1+fs]`): the output array of suffixes; must hold at least `n + fs` entries.
/// - `fs`: extra space available at the end of `sa`.
/// - `freq` (`[0..255]`): optional output symbol frequency table; must hold at least
///   `ALPHABET_SIZE` entries when supplied.
/// - `threads`: number of worker threads; must be non-negative, and a value of 0 is
///   forwarded to the C side as 1.
///
/// Returns -1 on invalid arguments, otherwise the C function's return value
/// (0 on success, -1 or -2 on error).
#[cfg(feature = "upstream-c")]
pub fn libsais64_upstream_c_omp(
    t: &[u8],
    sa: &mut [SaSint],
    fs: SaSint,
    freq: Option<&mut [SaSint]>,
    threads: SaSint,
) -> SaSint {
    // A negative `fs` fails the conversion and is rejected below like any other bad argument.
    let Ok(extra_space) = usize::try_from(fs) else {
        return -1;
    };
    let arguments_valid = threads >= 0
        && t.len() <= SaSint::MAX as usize
        && sa.len() >= t.len().saturating_add(extra_space)
        && freq
            .as_ref()
            .map_or(true, |table| table.len() >= ALPHABET_SIZE);
    if !arguments_valid {
        return -1;
    }

    let freq_ptr = match freq {
        Some(table) => table.as_mut_ptr(),
        None => std::ptr::null_mut(),
    };
    let n = t.len() as SaSint;
    // SAFETY: `t` and `sa` are live slices for the duration of the call, `sa`
    // was checked to hold at least `n + fs` elements, and `freq_ptr` is either
    // null or points to at least `ALPHABET_SIZE` elements. The C side is
    // expected to stay within those bounds.
    unsafe {
        probe_public_libsais64_omp_freq(
            t.as_ptr(),
            sa.as_mut_ptr(),
            n,
            fs,
            freq_ptr,
            threads.max(1),
        )
    }
}
11483
/// `MaybeUninit` variant of [`libsais64_upstream_c_omp`].
///
/// Available only with the `upstream-c` feature. Performs the same argument
/// validation and the same C call, except that the `sa` output may be
/// uninitialised on entry — the C implementation is relied upon to write every
/// required slot, so the caller may treat the slice as initialised on success.
///
/// A `threads` value of 0 is forwarded to the C side as 1; negative values are
/// rejected with -1.
#[cfg(feature = "upstream-c")]
pub fn libsais64_upstream_c_omp_uninit(
    t: &[u8],
    sa: &mut [MaybeUninit<SaSint>],
    fs: SaSint,
    freq: Option<&mut [SaSint]>,
    threads: SaSint,
) -> SaSint {
    // A negative `fs` fails the conversion and is rejected below like any other bad argument.
    let Ok(extra_space) = usize::try_from(fs) else {
        return -1;
    };
    let arguments_valid = threads >= 0
        && t.len() <= SaSint::MAX as usize
        && sa.len() >= t.len().saturating_add(extra_space)
        && freq
            .as_ref()
            .map_or(true, |table| table.len() >= ALPHABET_SIZE);
    if !arguments_valid {
        return -1;
    }

    let freq_ptr = match freq {
        Some(table) => table.as_mut_ptr(),
        None => std::ptr::null_mut(),
    };
    let n = t.len() as SaSint;
    // SAFETY: `t` and `sa` are live slices for the duration of the call, `sa`
    // was checked to hold at least `n + fs` slots (only written through the
    // cast pointer, never read on the Rust side here), and `freq_ptr` is
    // either null or points to at least `ALPHABET_SIZE` elements.
    unsafe {
        probe_public_libsais64_omp_freq(
            t.as_ptr(),
            sa.as_mut_ptr().cast::<SaSint>(),
            n,
            fs,
            freq_ptr,
            threads.max(1),
        )
    }
}
11525
11526/// Constructs the generalized suffix array (GSA) of a given string set.
11527///
11528/// - `t` (`[0..n-1]`): the input string set using 0 as separators (`t[n-1]` must be 0).
11529/// - `sa` (`[0..n-1+fs]`): the output array of suffixes.
11530/// - `fs`: extra space available at the end of `sa` (0 should be enough for most cases).
11531/// - `freq` (`[0..255]`): optional output symbol frequency table.
11532///
11533/// Returns 0 on success, -1 or -2 on error.
11534pub fn libsais64_gsa(
11535    t: &[u8],
11536    sa: &mut [SaSint],
11537    fs: SaSint,
11538    freq: Option<&mut [SaSint]>,
11539) -> SaSint {
11540    if fs < 0
11541        || sa.len()
11542            < t.len()
11543                .saturating_add(usize::try_from(fs).unwrap_or(usize::MAX))
11544    {
11545        return -1;
11546    }
11547    if let Some(freq) = freq.as_ref() {
11548        if freq.len() < ALPHABET_SIZE {
11549            return -1;
11550        }
11551    }
11552
11553    let n = t.len();
11554    if n > 0 && t[n - 1] != 0 {
11555        return -1;
11556    }
11557
11558    if n <= 1 {
11559        if let Some(freq) = freq {
11560            freq[..ALPHABET_SIZE].fill(0);
11561            if n == 1 {
11562                freq[t[0] as usize] += 1;
11563            }
11564        }
11565        if n == 1 {
11566            sa[0] = 0;
11567        }
11568        return 0;
11569    }
11570
11571    if n <= i32::MAX as usize {
11572        return libsais64_run_32bit_omp(t, sa, fs, freq, 1, true)
11573            .expect("n <= INT32_MAX must have 32-bit workspace");
11574    }
11575
11576    libsais64_main(t, sa, LIBSAIS_FLAGS_GSA, 0, None, fs, freq, 1)
11577}
11578
11579/// Alias for `libsais64_long`. See its documentation.
11580pub fn libsais64_int(t: &mut [SaSint], sa: &mut [SaSint], k: SaSint, fs: SaSint) -> SaSint {
11581    if fs < 0
11582        || sa.len()
11583            < t.len()
11584                .saturating_add(usize::try_from(fs).unwrap_or(usize::MAX))
11585    {
11586        return -1;
11587    }
11588
11589    if t.len() <= 1 {
11590        if t.len() == 1 {
11591            sa[0] = 0;
11592        }
11593        return 0;
11594    }
11595
11596    libsais64_main_int(t, sa, k, fs, 1)
11597}
11598
11599/// Constructs the suffix array of a given integer array.
11600///
11601/// During construction the input array is modified, but restored at the end if no error occurred.
11602///
11603/// - `t` (`[0..n-1]`): the input integer array.
11604/// - `sa` (`[0..n-1+fs]`): the output array of suffixes.
11605/// - `k`: the alphabet size of the input integer array.
11606/// - `fs`: extra space available at the end of `sa` (can be 0, but 4k or better 6k is recommended for optimal performance).
11607///
11608/// Returns 0 on success, -1 or -2 on error.
11609pub fn libsais64_long(t: &mut [SaSint], sa: &mut [SaSint], k: SaSint, fs: SaSint) -> SaSint {
11610    libsais64_int(t, sa, k, fs)
11611}
11612
11613/// Constructs the suffix array of a given string using a libsais64 context.
11614///
11615/// - `ctx`: the libsais64 context.
11616/// - `t` (`[0..n-1]`): the input string.
11617/// - `sa` (`[0..n-1+fs]`): the output array of suffixes.
11618/// - `fs`: extra space available at the end of `sa` (0 should be enough for most cases).
11619/// - `freq` (`[0..255]`): optional output symbol frequency table.
11620///
11621/// Returns 0 on success, -1 or -2 on error.
11622pub fn libsais64_ctx(
11623    ctx: &mut Context,
11624    t: &[u8],
11625    sa: &mut [SaSint],
11626    fs: SaSint,
11627    freq: Option<&mut [SaSint]>,
11628) -> SaSint {
11629    if fs < 0
11630        || sa.len()
11631            < t.len()
11632                .saturating_add(usize::try_from(fs).unwrap_or(usize::MAX))
11633    {
11634        return -1;
11635    }
11636    if let Some(freq) = freq.as_ref() {
11637        if freq.len() < ALPHABET_SIZE {
11638            return -1;
11639        }
11640    }
11641
11642    let n = t.len();
11643    if n <= 1 {
11644        if let Some(freq) = freq {
11645            freq[..ALPHABET_SIZE].fill(0);
11646            if n == 1 {
11647                freq[t[0] as usize] += 1;
11648            }
11649        }
11650        if n == 1 {
11651            sa[0] = 0;
11652        }
11653        return 0;
11654    }
11655
11656    libsais64_main_ctx(ctx, t, sa, LIBSAIS_FLAGS_NONE, 0, None, fs, freq)
11657}
11658
11659/// Constructs the generalized suffix array (GSA) of a given string set using a libsais64 context.
11660///
11661/// - `ctx`: the libsais64 context.
11662/// - `t` (`[0..n-1]`): the input string set using 0 as separators (`t[n-1]` must be 0).
11663/// - `sa` (`[0..n-1+fs]`): the output array of suffixes.
11664/// - `fs`: extra space available at the end of `sa` (0 should be enough for most cases).
11665/// - `freq` (`[0..255]`): optional output symbol frequency table.
11666///
11667/// Returns 0 on success, -1 or -2 on error.
11668pub fn libsais64_gsa_ctx(
11669    ctx: &mut Context,
11670    t: &[u8],
11671    sa: &mut [SaSint],
11672    fs: SaSint,
11673    freq: Option<&mut [SaSint]>,
11674) -> SaSint {
11675    if fs < 0
11676        || sa.len()
11677            < t.len()
11678                .saturating_add(usize::try_from(fs).unwrap_or(usize::MAX))
11679    {
11680        return -1;
11681    }
11682    if let Some(freq) = freq.as_ref() {
11683        if freq.len() < ALPHABET_SIZE {
11684            return -1;
11685        }
11686    }
11687
11688    let n = t.len();
11689    if n > 0 && t[n - 1] != 0 {
11690        return -1;
11691    }
11692
11693    if n <= 1 {
11694        if let Some(freq) = freq {
11695            freq[..ALPHABET_SIZE].fill(0);
11696            if n == 1 {
11697                freq[t[0] as usize] += 1;
11698            }
11699        }
11700        if n == 1 {
11701            sa[0] = 0;
11702        }
11703        return 0;
11704    }
11705
11706    libsais64_main_ctx(ctx, t, sa, LIBSAIS_FLAGS_GSA, 0, None, fs, freq)
11707}
11708
11709/// Constructs the Burrows-Wheeler transformed string (BWT) of a given string.
11710///
11711/// - `t` (`[0..n-1]`): the input string.
11712/// - `u` (`[0..n-1]`): the output string (can alias `t`).
11713/// - `a` (`[0..n-1+fs]`): the temporary array.
11714/// - `fs`: extra space available at the end of `a` (0 should be enough for most cases).
11715/// - `freq` (`[0..255]`): optional output symbol frequency table.
11716///
11717/// Returns the primary index on success, -1 or -2 on error.
11718pub fn libsais64_bwt(
11719    t: &[u8],
11720    u: &mut [u8],
11721    a: &mut [SaSint],
11722    fs: SaSint,
11723    freq: Option<&mut [SaSint]>,
11724) -> SaSint {
11725    if fs < 0
11726        || u.len() < t.len()
11727        || a.len()
11728            < t.len()
11729                .saturating_add(usize::try_from(fs).unwrap_or(usize::MAX))
11730    {
11731        return -1;
11732    }
11733    if let Some(freq) = freq.as_ref() {
11734        if freq.len() < ALPHABET_SIZE {
11735            return -1;
11736        }
11737    }
11738
11739    let n = t.len();
11740    if n <= 1 {
11741        if let Some(freq) = freq {
11742            freq[..ALPHABET_SIZE].fill(0);
11743            if n == 1 {
11744                u[0] = t[0];
11745                freq[t[0] as usize] += 1;
11746            }
11747        } else if n == 1 {
11748            u[0] = t[0];
11749        }
11750        return n as SaSint;
11751    }
11752
11753    if n <= i32::MAX as usize {
11754        return libsais64_bwt_run_32bit_omp(t, u, fs, freq, 1)
11755            .expect("n <= INT32_MAX must have 32-bit workspace");
11756    }
11757
11758    let mut index = libsais64_main(t, a, LIBSAIS_FLAGS_BWT, 0, None, fs, freq, 1);
11759    if index >= 0 {
11760        index += 1;
11761        let split = usize::try_from(index).expect("index must be non-negative");
11762        u[0] = t[n - 1];
11763        bwt_copy_8u_omp(&mut u[1..split], &a[..split - 1], index - 1, 1);
11764        bwt_copy_8u_omp(
11765            &mut u[split..n],
11766            &a[split..n],
11767            SaSint::try_from(n - split).expect("fits"),
11768            1,
11769        );
11770    }
11771    index
11772}
11773
11774/// Constructs the Burrows-Wheeler transformed string (BWT) of a given string with auxiliary indexes.
11775///
11776/// - `t` (`[0..n-1]`): the input string.
11777/// - `u` (`[0..n-1]`): the output string (can alias `t`).
11778/// - `a` (`[0..n-1+fs]`): the temporary array.
11779/// - `fs`: extra space available at the end of `a` (0 should be enough for most cases).
11780/// - `freq` (`[0..255]`): optional output symbol frequency table.
11781/// - `r`: sampling rate for the auxiliary indexes (must be a power of two).
11782/// - `i` (`[0..(n-1)/r]`): output auxiliary indexes.
11783///
11784/// Returns 0 on success, -1 or -2 on error.
11785pub fn libsais64_bwt_aux(
11786    t: &[u8],
11787    u: &mut [u8],
11788    a: &mut [SaSint],
11789    fs: SaSint,
11790    freq: Option<&mut [SaSint]>,
11791    r: SaSint,
11792    i: &mut [SaSint],
11793) -> SaSint {
11794    let n = t.len();
11795    if fs < 0
11796        || r < 2
11797        || (r & (r - 1)) != 0
11798        || u.len() < n
11799        || a.len() < n.saturating_add(usize::try_from(fs).unwrap_or(usize::MAX))
11800        || freq.as_ref().is_some_and(|freq| freq.len() < ALPHABET_SIZE)
11801    {
11802        return -1;
11803    }
11804    let sample_count = if n == 0 {
11805        1
11806    } else {
11807        usize::try_from((SaSint::try_from(n).expect("input length must fit SaSint") - 1) / r)
11808            .expect("sample count must be non-negative")
11809            + 1
11810    };
11811    if i.len() < sample_count {
11812        return -1;
11813    }
11814
11815    if n <= 1 {
11816        if let Some(freq) = freq {
11817            freq[..ALPHABET_SIZE].fill(0);
11818            if n == 1 {
11819                u[0] = t[0];
11820                freq[t[0] as usize] += 1;
11821            }
11822        } else if n == 1 {
11823            u[0] = t[0];
11824        }
11825        i[0] = n as SaSint;
11826        return 0;
11827    }
11828
11829    if n <= i32::MAX as usize && r <= i32::MAX as SaSint {
11830        return libsais64_bwt_aux_run_32bit_omp(t, u, fs, freq, r, i, 1)
11831            .expect("n/r <= INT32_MAX must have 32-bit workspace");
11832    }
11833
11834    let index = libsais64_main(t, a, LIBSAIS_FLAGS_BWT, r, Some(i), fs, freq, 1);
11835    if index == 0 {
11836        let split = usize::try_from(i[0]).expect("primary index must be non-negative");
11837        u[0] = t[n - 1];
11838        bwt_copy_8u_omp(&mut u[1..split], &a[..split - 1], i[0] - 1, 1);
11839        bwt_copy_8u_omp(
11840            &mut u[split..n],
11841            &a[split..n],
11842            SaSint::try_from(n - split).expect("fits"),
11843            1,
11844        );
11845    }
11846    index
11847}
11848
11849/// Constructs the Burrows-Wheeler transformed string (BWT) of a given string using a libsais64 context.
11850///
11851/// - `ctx`: the libsais64 context.
11852/// - `t` (`[0..n-1]`): the input string.
11853/// - `u` (`[0..n-1]`): the output string (can alias `t`).
11854/// - `a` (`[0..n-1+fs]`): the temporary array.
11855/// - `fs`: extra space available at the end of `a` (0 should be enough for most cases).
11856/// - `freq` (`[0..255]`): optional output symbol frequency table.
11857///
11858/// Returns the primary index on success, -1 or -2 on error.
11859pub fn libsais64_bwt_ctx(
11860    ctx: &mut Context,
11861    t: &[u8],
11862    u: &mut [u8],
11863    a: &mut [SaSint],
11864    fs: SaSint,
11865    freq: Option<&mut [SaSint]>,
11866) -> SaSint {
11867    if fs < 0
11868        || u.len() < t.len()
11869        || a.len()
11870            < t.len()
11871                .saturating_add(usize::try_from(fs).unwrap_or(usize::MAX))
11872    {
11873        return -1;
11874    }
11875    if let Some(freq) = freq.as_ref() {
11876        if freq.len() < ALPHABET_SIZE {
11877            return -1;
11878        }
11879    }
11880
11881    let n = t.len();
11882    if n <= 1 {
11883        if let Some(freq) = freq {
11884            freq[..ALPHABET_SIZE].fill(0);
11885            if n == 1 {
11886                u[0] = t[0];
11887                freq[t[0] as usize] += 1;
11888            }
11889        } else if n == 1 {
11890            u[0] = t[0];
11891        }
11892        return n as SaSint;
11893    }
11894
11895    let mut index = libsais64_main_ctx(ctx, t, a, LIBSAIS_FLAGS_BWT, 0, None, fs, freq);
11896    if index >= 0 {
11897        index += 1;
11898        let split = usize::try_from(index).expect("index must be non-negative");
11899        u[0] = t[n - 1];
11900        bwt_copy_8u_omp(
11901            &mut u[1..split],
11902            &a[..split - 1],
11903            index - 1,
11904            ctx.threads as SaSint,
11905        );
11906        bwt_copy_8u_omp(
11907            &mut u[split..n],
11908            &a[split..n],
11909            SaSint::try_from(n - split).expect("fits"),
11910            ctx.threads as SaSint,
11911        );
11912    }
11913    index
11914}
11915
11916/// Constructs the BWT of a given string with auxiliary indexes using a libsais64 context.
11917///
11918/// - `ctx`: the libsais64 context.
11919/// - `t` (`[0..n-1]`): the input string.
11920/// - `u` (`[0..n-1]`): the output string (can alias `t`).
11921/// - `a` (`[0..n-1+fs]`): the temporary array.
11922/// - `fs`: extra space available at the end of `a` (0 should be enough for most cases).
11923/// - `freq` (`[0..255]`): optional output symbol frequency table.
11924/// - `r`: sampling rate for the auxiliary indexes (must be a power of two).
11925/// - `i` (`[0..(n-1)/r]`): output auxiliary indexes.
11926///
11927/// Returns 0 on success, -1 or -2 on error.
11928pub fn libsais64_bwt_aux_ctx(
11929    ctx: &mut Context,
11930    t: &[u8],
11931    u: &mut [u8],
11932    a: &mut [SaSint],
11933    fs: SaSint,
11934    freq: Option<&mut [SaSint]>,
11935    r: SaSint,
11936    i: &mut [SaSint],
11937) -> SaSint {
11938    let n = t.len();
11939    if fs < 0
11940        || r < 2
11941        || (r & (r - 1)) != 0
11942        || u.len() < n
11943        || a.len() < n.saturating_add(usize::try_from(fs).unwrap_or(usize::MAX))
11944    {
11945        return -1;
11946    }
11947    if let Some(freq) = freq.as_ref() {
11948        if freq.len() < ALPHABET_SIZE {
11949            return -1;
11950        }
11951    }
11952    let sample_count = if n == 0 {
11953        1
11954    } else {
11955        usize::try_from((SaSint::try_from(n).expect("input length must fit SaSint") - 1) / r)
11956            .expect("sample count must be non-negative")
11957            + 1
11958    };
11959    if i.len() < sample_count {
11960        return -1;
11961    }
11962
11963    if n <= 1 {
11964        if let Some(freq) = freq {
11965            freq[..ALPHABET_SIZE].fill(0);
11966            if n == 1 {
11967                u[0] = t[0];
11968                freq[t[0] as usize] += 1;
11969            }
11970        } else if n == 1 {
11971            u[0] = t[0];
11972        }
11973        i[0] = n as SaSint;
11974        return 0;
11975    }
11976
11977    let index = libsais64_main_ctx(ctx, t, a, LIBSAIS_FLAGS_BWT, r, Some(i), fs, freq);
11978    if index == 0 {
11979        let split = usize::try_from(i[0]).expect("primary index must be non-negative");
11980        u[0] = t[n - 1];
11981        bwt_copy_8u_omp(
11982            &mut u[1..split],
11983            &a[..split - 1],
11984            i[0] - 1,
11985            ctx.threads as SaSint,
11986        );
11987        bwt_copy_8u_omp(
11988            &mut u[split..n],
11989            &a[split..n],
11990            SaSint::try_from(n - split).expect("fits"),
11991            ctx.threads as SaSint,
11992        );
11993    }
11994    index
11995}
11996
11997/// Creates the libsais64 context for parallel operations using OpenMP-style threading.
11998///
11999/// In multi-threaded environments, use one context per thread for parallel executions.
12000///
12001/// - `threads`: number of worker threads (can be 0 for the implementation default).
12002///
12003/// Returns the context, or `None` on allocation failure.
12004pub fn create_ctx_omp(threads: SaSint) -> Option<Context> {
12005    if threads < 0 {
12006        return None;
12007    }
12008
12009    create_ctx_main(normalize_omp_threads(threads))
12010}
12011
12012fn libsais64_new_32bit_fs(n: usize, fs: SaSint) -> Option<i32> {
12013    if n > i32::MAX as usize {
12014        return None;
12015    }
12016
12017    let n = n as SaSint;
12018    let int32_max = i32::MAX as SaSint;
12019    let expanded_space = fs as i128 + fs as i128 + n as i128 + n as i128;
12020    let new_fs = if expanded_space <= int32_max as i128 {
12021        fs + fs + n
12022    } else {
12023        int32_max - n
12024    };
12025
12026    i32::try_from(new_fs).ok()
12027}
12028
12029fn libsais64_run_32bit_omp(
12030    t: &[u8],
12031    sa: &mut [SaSint],
12032    fs: SaSint,
12033    freq: Option<&mut [SaSint]>,
12034    threads: SaSint,
12035    gsa: bool,
12036) -> Option<SaSint> {
12037    let new_fs = libsais64_new_32bit_fs(t.len(), fs)?;
12038    let mut sa32 = vec![0i32; t.len() + usize::try_from(new_fs).expect("new_fs is non-negative")];
12039
12040    let index = if let Some(freq) = freq {
12041        let mut freq32 = vec![0i32; ALPHABET_SIZE];
12042        let index = if gsa {
12043            crate::libsais_gsa_omp(t, &mut sa32, new_fs, Some(&mut freq32), threads as i32)
12044        } else {
12045            crate::libsais_omp(t, &mut sa32, new_fs, Some(&mut freq32), threads as i32)
12046        };
12047        if index >= 0 {
12048            for (dst, src) in freq.iter_mut().zip(freq32.iter()) {
12049                *dst = SaSint::from(*src);
12050            }
12051        }
12052        index
12053    } else if gsa {
12054        crate::libsais_gsa_omp(t, &mut sa32, new_fs, None, threads as i32)
12055    } else {
12056        crate::libsais_omp(t, &mut sa32, new_fs, None, threads as i32)
12057    };
12058
12059    if index >= 0 {
12060        for (dst, src) in sa.iter_mut().zip(sa32.iter()).take(t.len()) {
12061            *dst = SaSint::from(*src as u32);
12062        }
12063    }
12064
12065    Some(SaSint::from(index))
12066}
12067
12068fn copy_freq32_to_64(freq: &mut [SaSint], freq32: &[i32]) {
12069    for (dst, src) in freq.iter_mut().zip(freq32.iter()).take(ALPHABET_SIZE) {
12070        *dst = SaSint::from(*src);
12071    }
12072}
12073
12074fn libsais64_bwt_run_32bit_omp(
12075    t: &[u8],
12076    u: &mut [u8],
12077    fs: SaSint,
12078    freq: Option<&mut [SaSint]>,
12079    threads: SaSint,
12080) -> Option<SaSint> {
12081    let new_fs = libsais64_new_32bit_fs(t.len(), fs)?;
12082    let mut a32 = vec![0i32; t.len() + usize::try_from(new_fs).expect("new_fs is non-negative")];
12083
12084    let index = if let Some(freq) = freq {
12085        let mut freq32 = vec![0i32; ALPHABET_SIZE];
12086        let index =
12087            crate::libsais_bwt_omp(t, u, &mut a32, new_fs, Some(&mut freq32), threads as i32);
12088        if index >= 0 {
12089            copy_freq32_to_64(freq, &freq32);
12090        }
12091        index
12092    } else {
12093        crate::libsais_bwt_omp(t, u, &mut a32, new_fs, None, threads as i32)
12094    };
12095
12096    Some(SaSint::from(index))
12097}
12098
12099fn libsais64_bwt_aux_run_32bit_omp(
12100    t: &[u8],
12101    u: &mut [u8],
12102    fs: SaSint,
12103    freq: Option<&mut [SaSint]>,
12104    r: SaSint,
12105    i: &mut [SaSint],
12106    threads: SaSint,
12107) -> Option<SaSint> {
12108    if r > i32::MAX as SaSint {
12109        return None;
12110    }
12111
12112    let new_fs = libsais64_new_32bit_fs(t.len(), fs)?;
12113    let mut a32 = vec![0i32; t.len() + usize::try_from(new_fs).expect("new_fs is non-negative")];
12114    let sample_count = if t.is_empty() {
12115        1
12116    } else {
12117        (t.len() - 1) / usize::try_from(r).expect("r must be positive") + 1
12118    };
12119    let mut i32_out = vec![0i32; sample_count];
12120
12121    let index = if let Some(freq) = freq {
12122        let mut freq32 = vec![0i32; ALPHABET_SIZE];
12123        let index = crate::libsais_bwt_aux_omp(
12124            t,
12125            u,
12126            &mut a32,
12127            new_fs,
12128            Some(&mut freq32),
12129            r as i32,
12130            &mut i32_out,
12131            threads as i32,
12132        );
12133        if index >= 0 {
12134            copy_freq32_to_64(freq, &freq32);
12135        }
12136        index
12137    } else {
12138        crate::libsais_bwt_aux_omp(
12139            t,
12140            u,
12141            &mut a32,
12142            new_fs,
12143            None,
12144            r as i32,
12145            &mut i32_out,
12146            threads as i32,
12147        )
12148    };
12149
12150    if index >= 0 {
12151        for (dst, src) in i.iter_mut().zip(i32_out.iter()).take(sample_count) {
12152            *dst = SaSint::from(*src);
12153        }
12154    }
12155
12156    Some(SaSint::from(index))
12157}
12158
12159/// Constructs the suffix array of a given string in parallel using OpenMP-style threading.
12160///
12161/// - `t` (`[0..n-1]`): the input string.
12162/// - `sa` (`[0..n-1+fs]`): the output array of suffixes.
12163/// - `fs`: extra space available at the end of `sa` (0 should be enough for most cases).
12164/// - `freq` (`[0..255]`): optional output symbol frequency table.
12165/// - `threads`: number of worker threads (can be 0 for the implementation default).
12166///
12167/// Returns 0 on success, -1 or -2 on error.
12168pub fn libsais64_omp(
12169    t: &[u8],
12170    sa: &mut [SaSint],
12171    fs: SaSint,
12172    freq: Option<&mut [SaSint]>,
12173    threads: SaSint,
12174) -> SaSint {
12175    if threads < 0 {
12176        return -1;
12177    }
12178    if fs < 0
12179        || sa.len()
12180            < t.len()
12181                .saturating_add(usize::try_from(fs).unwrap_or(usize::MAX))
12182    {
12183        return -1;
12184    }
12185    if let Some(freq) = freq.as_ref() {
12186        if freq.len() < ALPHABET_SIZE {
12187            return -1;
12188        }
12189    }
12190    let n = t.len();
12191    if n <= 1 {
12192        if let Some(freq) = freq {
12193            freq[..ALPHABET_SIZE].fill(0);
12194            if n == 1 {
12195                sa[0] = 0;
12196                freq[t[0] as usize] += 1;
12197            }
12198        } else if n == 1 {
12199            sa[0] = 0;
12200        }
12201        return 0;
12202    }
12203
12204    let threads = normalize_omp_threads(threads);
12205    if n <= i32::MAX as usize {
12206        return libsais64_run_32bit_omp(t, sa, fs, freq, threads, false)
12207            .expect("n <= INT32_MAX must have 32-bit workspace");
12208    }
12209
12210    libsais64_main(t, sa, LIBSAIS_FLAGS_NONE, 0, None, fs, freq, threads)
12211}
12212
12213/// Constructs the generalized suffix array (GSA) of a given string set in parallel using OpenMP-style threading.
12214///
12215/// - `t` (`[0..n-1]`): the input string set using 0 as separators (`t[n-1]` must be 0).
12216/// - `sa` (`[0..n-1+fs]`): the output array of suffixes.
12217/// - `fs`: extra space available at the end of `sa` (0 should be enough for most cases).
12218/// - `freq` (`[0..255]`): optional output symbol frequency table.
12219/// - `threads`: number of worker threads (can be 0 for the implementation default).
12220///
12221/// Returns 0 on success, -1 or -2 on error.
12222pub fn libsais64_gsa_omp(
12223    t: &[u8],
12224    sa: &mut [SaSint],
12225    fs: SaSint,
12226    freq: Option<&mut [SaSint]>,
12227    threads: SaSint,
12228) -> SaSint {
12229    if threads < 0
12230        || t.last().copied().unwrap_or(0) != 0
12231        || fs < 0
12232        || sa.len()
12233            < t.len()
12234                .saturating_add(usize::try_from(fs).unwrap_or(usize::MAX))
12235    {
12236        return -1;
12237    }
12238    if let Some(freq) = freq.as_ref() {
12239        if freq.len() < ALPHABET_SIZE {
12240            return -1;
12241        }
12242    }
12243    let n = t.len();
12244    if n <= 1 {
12245        if let Some(freq) = freq {
12246            freq[..ALPHABET_SIZE].fill(0);
12247            if n == 1 {
12248                sa[0] = 0;
12249                freq[t[0] as usize] += 1;
12250            }
12251        } else if n == 1 {
12252            sa[0] = 0;
12253        }
12254        return 0;
12255    }
12256
12257    let threads = normalize_omp_threads(threads);
12258    if n <= i32::MAX as usize {
12259        return libsais64_run_32bit_omp(t, sa, fs, freq, threads, true)
12260            .expect("n <= INT32_MAX must have 32-bit workspace");
12261    }
12262
12263    libsais64_main(t, sa, LIBSAIS_FLAGS_GSA, 0, None, fs, freq, threads)
12264}
12265
12266/// Alias for `libsais64_long_omp`. See its documentation.
12267pub fn libsais64_int_omp(
12268    t: &mut [SaSint],
12269    sa: &mut [SaSint],
12270    k: SaSint,
12271    fs: SaSint,
12272    threads: SaSint,
12273) -> SaSint {
12274    if threads < 0 {
12275        return -1;
12276    }
12277    if fs < 0
12278        || sa.len()
12279            < t.len()
12280                .saturating_add(usize::try_from(fs).unwrap_or(usize::MAX))
12281    {
12282        return -1;
12283    }
12284    if t.len() <= 1 {
12285        if t.len() == 1 {
12286            sa[0] = 0;
12287        }
12288        return 0;
12289    }
12290
12291    libsais64_main_int(t, sa, k, fs, normalize_omp_threads(threads))
12292}
12293
12294/// Constructs the suffix array of a given integer array in parallel using OpenMP-style threading.
12295///
12296/// During construction the input array is modified, but restored at the end if no error occurred.
12297///
12298/// - `t` (`[0..n-1]`): the input integer array.
12299/// - `sa` (`[0..n-1+fs]`): the output array of suffixes.
12300/// - `k`: the alphabet size of the input integer array.
12301/// - `fs`: extra space available at the end of `sa` (can be 0, but 4k or better 6k is recommended for optimal performance).
12302/// - `threads`: number of worker threads (can be 0 for the implementation default).
12303///
12304/// Returns 0 on success, -1 or -2 on error.
12305pub fn libsais64_long_omp(
12306    t: &mut [SaSint],
12307    sa: &mut [SaSint],
12308    k: SaSint,
12309    fs: SaSint,
12310    threads: SaSint,
12311) -> SaSint {
12312    libsais64_int_omp(t, sa, k, fs, threads)
12313}
12314
12315/// Constructs the Burrows-Wheeler transformed string (BWT) of a given string in parallel using OpenMP-style threading.
12316///
12317/// - `t` (`[0..n-1]`): the input string.
12318/// - `u` (`[0..n-1]`): the output string (can alias `t`).
12319/// - `a` (`[0..n-1+fs]`): the temporary array.
12320/// - `fs`: extra space available at the end of `a` (0 should be enough for most cases).
12321/// - `freq` (`[0..255]`): optional output symbol frequency table.
12322/// - `threads`: number of worker threads (can be 0 for the implementation default).
12323///
12324/// Returns the primary index on success, -1 or -2 on error.
12325pub fn libsais64_bwt_omp(
12326    t: &[u8],
12327    u: &mut [u8],
12328    a: &mut [SaSint],
12329    fs: SaSint,
12330    freq: Option<&mut [SaSint]>,
12331    threads: SaSint,
12332) -> SaSint {
12333    let n = t.len();
12334    if threads < 0
12335        || fs < 0
12336        || u.len() < n
12337        || a.len() < n.saturating_add(usize::try_from(fs).unwrap_or(usize::MAX))
12338        || freq.as_ref().is_some_and(|freq| freq.len() < ALPHABET_SIZE)
12339    {
12340        return -1;
12341    }
12342
12343    if n <= 1 {
12344        if let Some(freq) = freq {
12345            freq[..ALPHABET_SIZE].fill(0);
12346            if n == 1 {
12347                u[0] = t[0];
12348                freq[t[0] as usize] += 1;
12349            }
12350        } else if n == 1 {
12351            u[0] = t[0];
12352        }
12353        return n as SaSint;
12354    }
12355
12356    let threads = normalize_omp_threads(threads);
12357    if n <= i32::MAX as usize {
12358        return libsais64_bwt_run_32bit_omp(t, u, fs, freq, threads)
12359            .expect("n <= INT32_MAX must have 32-bit workspace");
12360    }
12361
12362    let mut index = libsais64_main(t, a, LIBSAIS_FLAGS_BWT, 0, None, fs, freq, threads);
12363    if index >= 0 {
12364        index += 1;
12365        let index_usize = usize::try_from(index).expect("index must be non-negative");
12366        u[0] = t[n - 1];
12367        bwt_copy_8u_omp(
12368            &mut u[1..index_usize],
12369            &a[..index_usize - 1],
12370            index - 1,
12371            threads,
12372        );
12373        bwt_copy_8u_omp(
12374            &mut u[index_usize..n],
12375            &a[index_usize..n],
12376            SaSint::try_from(n - index_usize).expect("fits"),
12377            threads,
12378        );
12379    }
12380    index
12381}
12382
12383/// Constructs the BWT of a given string with auxiliary indexes in parallel using OpenMP-style threading.
12384///
12385/// - `t` (`[0..n-1]`): the input string.
12386/// - `u` (`[0..n-1]`): the output string (can alias `t`).
12387/// - `a` (`[0..n-1+fs]`): the temporary array.
12388/// - `fs`: extra space available at the end of `a` (0 should be enough for most cases).
12389/// - `freq` (`[0..255]`): optional output symbol frequency table.
12390/// - `r`: sampling rate for the auxiliary indexes (must be a power of two).
12391/// - `i` (`[0..(n-1)/r]`): output auxiliary indexes.
12392/// - `threads`: number of worker threads (can be 0 for the implementation default).
12393///
12394/// Returns 0 on success, -1 or -2 on error.
12395pub fn libsais64_bwt_aux_omp(
12396    t: &[u8],
12397    u: &mut [u8],
12398    a: &mut [SaSint],
12399    fs: SaSint,
12400    freq: Option<&mut [SaSint]>,
12401    r: SaSint,
12402    i: &mut [SaSint],
12403    threads: SaSint,
12404) -> SaSint {
12405    let n = t.len();
12406    if threads < 0
12407        || fs < 0
12408        || r < 2
12409        || (r & (r - 1)) != 0
12410        || u.len() < n
12411        || a.len() < n.saturating_add(usize::try_from(fs).unwrap_or(usize::MAX))
12412    {
12413        return -1;
12414    }
12415    if let Some(freq) = freq.as_ref() {
12416        if freq.len() < ALPHABET_SIZE {
12417            return -1;
12418        }
12419    }
12420    let sample_count = if n == 0 {
12421        1
12422    } else {
12423        usize::try_from((SaSint::try_from(n).expect("input length must fit SaSint") - 1) / r)
12424            .expect("sample count must be non-negative")
12425            + 1
12426    };
12427    if i.len() < sample_count {
12428        return -1;
12429    }
12430    if n <= 1 {
12431        if let Some(freq) = freq {
12432            freq[..ALPHABET_SIZE].fill(0);
12433            if n == 1 {
12434                u[0] = t[0];
12435                freq[t[0] as usize] += 1;
12436            }
12437        } else if n == 1 {
12438            u[0] = t[0];
12439        }
12440        i[0] = n as SaSint;
12441        return 0;
12442    }
12443
12444    let threads = normalize_omp_threads(threads);
12445    if n <= i32::MAX as usize && r <= i32::MAX as SaSint {
12446        return libsais64_bwt_aux_run_32bit_omp(t, u, fs, freq, r, i, threads)
12447            .expect("n/r <= INT32_MAX must have 32-bit workspace");
12448    }
12449
12450    let index = libsais64_main(t, a, LIBSAIS_FLAGS_BWT, r, Some(i), fs, freq, threads);
12451    if index == 0 {
12452        let split = usize::try_from(i[0]).expect("primary index must be non-negative");
12453        u[0] = t[n - 1];
12454        bwt_copy_8u_omp(&mut u[1..split], &a[..split - 1], i[0] - 1, threads);
12455        bwt_copy_8u_omp(
12456            &mut u[split..n],
12457            &a[split..n],
12458            SaSint::try_from(n - split).expect("fits"),
12459            threads,
12460        );
12461    }
12462    index
12463}
12464
12465/// Internal helper: compute phi.
12466#[doc(hidden)]
12467pub fn compute_phi(
12468    sa: &[SaSint],
12469    plcp: &mut [SaSint],
12470    n: SaSint,
12471    omp_block_start: FastSint,
12472    omp_block_size: FastSint,
12473) {
12474    let start = usize::try_from(omp_block_start).expect("omp_block_start must be non-negative");
12475    let size = usize::try_from(omp_block_size).expect("omp_block_size must be non-negative");
12476    let end = start + size;
12477    let n_usize = usize::try_from(n).expect("n must be non-negative");
12478    let mut i = start;
12479    let mut k = if omp_block_start > 0 {
12480        sa[start - 1]
12481    } else {
12482        n
12483    };
12484
12485    let fast_end = omp_block_start + omp_block_size - 64 - 3;
12486    while (i as FastSint) < fast_end {
12487        plcp[usize::try_from(sa[i]).expect("suffix index must be non-negative")] = k;
12488        k = sa[i];
12489        plcp[usize::try_from(sa[i + 1]).expect("suffix index must be non-negative")] = k;
12490        k = sa[i + 1];
12491        plcp[usize::try_from(sa[i + 2]).expect("suffix index must be non-negative")] = k;
12492        k = sa[i + 2];
12493        plcp[usize::try_from(sa[i + 3]).expect("suffix index must be non-negative")] = k;
12494        k = sa[i + 3];
12495        i += 4;
12496    }
12497
12498    while i < end.min(n_usize) {
12499        plcp[usize::try_from(sa[i]).expect("suffix index must be non-negative")] = k;
12500        k = sa[i];
12501        i += 1;
12502    }
12503}
12504
12505/// Internal helper: compute phi (OpenMP variant).
12506#[doc(hidden)]
12507pub fn compute_phi_omp(sa: &[SaSint], plcp: &mut [SaSint], n: SaSint, threads: SaSint) {
12508    if threads == 1 || n < 65_536 {
12509        compute_phi(sa, plcp, n, 0, n as FastSint);
12510        return;
12511    }
12512
12513    let threads_usize = usize::try_from(threads).expect("threads must be non-negative");
12514    let block_stride = ((n as FastSint) / (threads as FastSint)) & !15;
12515    let plcp_addr = plcp.as_mut_ptr() as usize;
12516    let n_usize = usize::try_from(n).expect("n must be non-negative");
12517
12518    run_rayon_with_threads(threads_usize, || {
12519        (0..threads_usize).into_par_iter().for_each(|thread| {
12520            let block_start = thread as FastSint * block_stride;
12521            let block_size = if thread + 1 < threads_usize {
12522                block_stride
12523            } else {
12524                n as FastSint - block_start
12525            };
12526            let start = usize::try_from(block_start).expect("omp_block_start must be non-negative");
12527            let size = usize::try_from(block_size).expect("omp_block_size must be non-negative");
12528            let end = start + size;
12529            let mut i = start;
12530            let mut k = if block_start > 0 { sa[start - 1] } else { n };
12531            let plcp_ptr = plcp_addr as *mut SaSint;
12532
12533            let fast_end = block_start + block_size - 64 - 3;
12534            while (i as FastSint) < fast_end {
12535                unsafe {
12536                    // SA is a suffix-array permutation, so each thread writes a disjoint PLCP slot.
12537                    *plcp_ptr
12538                        .add(usize::try_from(sa[i]).expect("suffix index must be non-negative")) =
12539                        k;
12540                    k = sa[i];
12541                    *plcp_ptr.add(
12542                        usize::try_from(sa[i + 1]).expect("suffix index must be non-negative"),
12543                    ) = k;
12544                    k = sa[i + 1];
12545                    *plcp_ptr.add(
12546                        usize::try_from(sa[i + 2]).expect("suffix index must be non-negative"),
12547                    ) = k;
12548                    k = sa[i + 2];
12549                    *plcp_ptr.add(
12550                        usize::try_from(sa[i + 3]).expect("suffix index must be non-negative"),
12551                    ) = k;
12552                    k = sa[i + 3];
12553                }
12554                i += 4;
12555            }
12556
12557            while i < end.min(n_usize) {
12558                unsafe {
12559                    // SA is a suffix-array permutation, so each thread writes a disjoint PLCP slot.
12560                    *plcp_ptr
12561                        .add(usize::try_from(sa[i]).expect("suffix index must be non-negative")) =
12562                        k;
12563                }
12564                k = sa[i];
12565                i += 1;
12566            }
12567        });
12568    });
12569}
12570
12571/// Internal helper: compute plcp.
12572#[doc(hidden)]
12573pub fn compute_plcp(
12574    t: &[u8],
12575    plcp: &mut [SaSint],
12576    n: FastSint,
12577    omp_block_start: FastSint,
12578    omp_block_size: FastSint,
12579) {
12580    let start = usize::try_from(omp_block_start).expect("omp_block_start must be non-negative");
12581    let size = usize::try_from(omp_block_size).expect("omp_block_size must be non-negative");
12582    let end = start + size;
12583    let n_usize = usize::try_from(n).expect("n must be non-negative");
12584    let mut l = 0usize;
12585
12586    for i in start..end.min(n_usize) {
12587        let k = usize::try_from(plcp[i]).expect("phi entry must be non-negative");
12588        let m = n_usize - i.max(k);
12589        while l < m && t[i + l] == t[k + l] {
12590            l += 1;
12591        }
12592        plcp[i] = SaSint::try_from(l).expect("LCP length must fit SaSint");
12593        l = l.saturating_sub(1);
12594    }
12595}
12596
12597/// Internal helper: compute plcp (OpenMP variant).
12598#[doc(hidden)]
12599pub fn compute_plcp_omp(t: &[u8], plcp: &mut [SaSint], n: SaSint, threads: SaSint) {
12600    if threads == 1 || n < 65_536 {
12601        compute_plcp(t, plcp, n as FastSint, 0, n as FastSint);
12602        return;
12603    }
12604
12605    let n_usize = usize::try_from(n).expect("n must be non-negative");
12606    let threads_usize = usize::try_from(threads).expect("threads must be non-negative");
12607    let chunk_size = ((n_usize / threads_usize) & !15usize).max(16);
12608    run_rayon_with_threads(threads_usize, || {
12609        plcp[..n_usize]
12610            .par_chunks_mut(chunk_size)
12611            .enumerate()
12612            .for_each(|(chunk_index, chunk)| {
12613                let start = chunk_index * chunk_size;
12614                let mut l = 0usize;
12615                for (offset, value) in chunk.iter_mut().enumerate() {
12616                    let i = start + offset;
12617                    let k = usize::try_from(*value).expect("phi entry must be non-negative");
12618                    let m = n_usize - i.max(k);
12619                    while l < m && t[i + l] == t[k + l] {
12620                        l += 1;
12621                    }
12622                    *value = SaSint::try_from(l).expect("LCP length must fit SaSint");
12623                    l = l.saturating_sub(1);
12624                }
12625            });
12626    });
12627}
12628
12629/// Internal helper: compute plcp gsa.
12630#[doc(hidden)]
12631pub fn compute_plcp_gsa(
12632    t: &[u8],
12633    plcp: &mut [SaSint],
12634    omp_block_start: FastSint,
12635    omp_block_size: FastSint,
12636) {
12637    let start = usize::try_from(omp_block_start).expect("omp_block_start must be non-negative");
12638    let size = usize::try_from(omp_block_size).expect("omp_block_size must be non-negative");
12639    let end = start + size;
12640    let mut l = 0usize;
12641
12642    for i in start..end.min(t.len()) {
12643        let k = usize::try_from(plcp[i]).expect("phi entry must be non-negative");
12644        while t[i + l] > 0 && t[i + l] == t[k + l] {
12645            l += 1;
12646        }
12647        plcp[i] = SaSint::try_from(l).expect("LCP length must fit SaSint");
12648        l = l.saturating_sub(1);
12649    }
12650}
12651
12652/// Internal helper: compute plcp gsa (OpenMP variant).
12653#[doc(hidden)]
12654pub fn compute_plcp_gsa_omp(t: &[u8], plcp: &mut [SaSint], n: SaSint, threads: SaSint) {
12655    if threads == 1 || n < 65_536 {
12656        compute_plcp_gsa(t, plcp, 0, n as FastSint);
12657        return;
12658    }
12659
12660    let n_usize = usize::try_from(n).expect("n must be non-negative");
12661    let threads_usize = usize::try_from(threads).expect("threads must be non-negative");
12662    let chunk_size = ((n_usize / threads_usize) & !15usize).max(16);
12663    run_rayon_with_threads(threads_usize, || {
12664        plcp[..n_usize]
12665            .par_chunks_mut(chunk_size)
12666            .enumerate()
12667            .for_each(|(chunk_index, chunk)| {
12668                let start = chunk_index * chunk_size;
12669                let mut l = 0usize;
12670                for (offset, value) in chunk.iter_mut().enumerate() {
12671                    let i = start + offset;
12672                    let k = usize::try_from(*value).expect("phi entry must be non-negative");
12673                    while t[i + l] > 0 && t[i + l] == t[k + l] {
12674                        l += 1;
12675                    }
12676                    *value = SaSint::try_from(l).expect("LCP length must fit SaSint");
12677                    l = l.saturating_sub(1);
12678                }
12679            });
12680    });
12681}
12682
12683/// Internal helper: compute plcp int.
12684#[doc(hidden)]
12685pub fn compute_plcp_int(
12686    t: &[SaSint],
12687    plcp: &mut [SaSint],
12688    n: FastSint,
12689    omp_block_start: FastSint,
12690    omp_block_size: FastSint,
12691) {
12692    let start = usize::try_from(omp_block_start).expect("omp_block_start must be non-negative");
12693    let size = usize::try_from(omp_block_size).expect("omp_block_size must be non-negative");
12694    let end = start + size;
12695    let n_usize = usize::try_from(n).expect("n must be non-negative");
12696    let mut l = 0usize;
12697
12698    for i in start..end.min(n_usize) {
12699        let k = usize::try_from(plcp[i]).expect("phi entry must be non-negative");
12700        let m = n_usize - i.max(k);
12701        while l < m && t[i + l] == t[k + l] {
12702            l += 1;
12703        }
12704        plcp[i] = SaSint::try_from(l).expect("LCP length must fit SaSint");
12705        l = l.saturating_sub(1);
12706    }
12707}
12708
12709/// Internal helper: compute plcp int (OpenMP variant).
12710#[doc(hidden)]
12711pub fn compute_plcp_int_omp(t: &[SaSint], plcp: &mut [SaSint], n: SaSint, threads: SaSint) {
12712    if threads == 1 || n < 65_536 {
12713        compute_plcp_int(t, plcp, n as FastSint, 0, n as FastSint);
12714        return;
12715    }
12716
12717    let n_usize = usize::try_from(n).expect("n must be non-negative");
12718    let threads_usize = usize::try_from(threads).expect("threads must be non-negative");
12719    let chunk_size = ((n_usize / threads_usize) & !15usize).max(16);
12720    run_rayon_with_threads(threads_usize, || {
12721        plcp[..n_usize]
12722            .par_chunks_mut(chunk_size)
12723            .enumerate()
12724            .for_each(|(chunk_index, chunk)| {
12725                let start = chunk_index * chunk_size;
12726                let mut l = 0usize;
12727                for (offset, value) in chunk.iter_mut().enumerate() {
12728                    let i = start + offset;
12729                    let k = usize::try_from(*value).expect("phi entry must be non-negative");
12730                    let m = n_usize - i.max(k);
12731                    while l < m && t[i + l] == t[k + l] {
12732                        l += 1;
12733                    }
12734                    *value = SaSint::try_from(l).expect("LCP length must fit SaSint");
12735                    l = l.saturating_sub(1);
12736                }
12737            });
12738    });
12739}
12740
12741/// Internal helper: compute lcp.
12742#[doc(hidden)]
12743pub fn compute_lcp(
12744    plcp: &[SaSint],
12745    sa: &[SaSint],
12746    lcp: &mut [SaSint],
12747    omp_block_start: FastSint,
12748    omp_block_size: FastSint,
12749) {
12750    let start = usize::try_from(omp_block_start).expect("omp_block_start must be non-negative");
12751    let size = usize::try_from(omp_block_size).expect("omp_block_size must be non-negative");
12752    let end = start + size;
12753
12754    for i in start..end.min(sa.len()) {
12755        lcp[i] = plcp[usize::try_from(sa[i]).expect("suffix index must be non-negative")];
12756    }
12757}
12758
12759/// Internal helper: compute lcp (OpenMP variant).
12760#[doc(hidden)]
12761pub fn compute_lcp_omp(
12762    plcp: &[SaSint],
12763    sa: &[SaSint],
12764    lcp: &mut [SaSint],
12765    n: SaSint,
12766    threads: SaSint,
12767) {
12768    if threads == 1 || n < 65_536 {
12769        compute_lcp(plcp, sa, lcp, 0, n as FastSint);
12770        return;
12771    }
12772
12773    let n_usize = usize::try_from(n).expect("n must be non-negative");
12774    assert!(plcp.len() >= n_usize);
12775    assert!(sa.len() >= n_usize);
12776    assert!(lcp.len() >= n_usize);
12777    let threads_usize = usize::try_from(threads).expect("threads must be non-negative");
12778    let chunk_size = ((n_usize / threads_usize) & !15usize).max(16);
12779    let plcp_ptr = plcp.as_ptr() as usize;
12780    let sa_ptr = sa.as_ptr() as usize;
12781    run_rayon_with_threads(threads_usize, || {
12782        lcp[..n_usize]
12783            .par_chunks_mut(chunk_size)
12784            .enumerate()
12785            .for_each(|(chunk_index, chunk)| {
12786                let start = chunk_index * chunk_size;
12787                let dst_ptr = chunk.as_mut_ptr();
12788                let sa_ptr = sa_ptr as *const SaSint;
12789                let plcp_ptr = plcp_ptr as *const SaSint;
12790                for offset in 0..chunk.len() {
12791                    let i = start + offset;
12792                    let suffix = unsafe { *sa_ptr.add(i) };
12793                    let suffix =
12794                        usize::try_from(suffix).expect("suffix index must be non-negative");
12795                    assert!(suffix < plcp.len());
12796                    unsafe {
12797                        *dst_ptr.add(offset) = *plcp_ptr.add(suffix);
12798                    }
12799                }
12800            });
12801    });
12802}
12803
12804/// Constructs the permuted longest common prefix array (PLCP) of a given string and suffix array.
12805///
12806/// - `t` (`[0..n-1]`): the input string.
12807/// - `sa` (`[0..n-1]`): the input suffix array.
12808/// - `plcp` (`[0..n-1]`): the output permuted longest common prefix array.
12809///
12810/// Returns 0 on success, -1 on error.
12811pub fn libsais64_plcp(t: &[u8], sa: &[SaSint], plcp: &mut [SaSint]) -> SaSint {
12812    if sa.len() != t.len() || plcp.len() != t.len() {
12813        return -1;
12814    }
12815    if !suffix_entries_in_bounds(sa, t.len()) {
12816        return -1;
12817    }
12818    if t.len() <= 1 {
12819        if t.len() == 1 {
12820            plcp[0] = 0;
12821        }
12822        return 0;
12823    }
12824
12825    let n = SaSint::try_from(t.len()).expect("input length must fit SaSint");
12826    compute_phi_omp(sa, plcp, n, 1);
12827    compute_plcp_omp(t, plcp, n, 1);
12828    0
12829}
12830
12831/// Constructs the PLCP of a given string set and generalized suffix array (GSA).
12832///
12833/// - `t` (`[0..n-1]`): the input string set using 0 as separators (`t[n-1]` must be 0).
12834/// - `sa` (`[0..n-1]`): the input generalized suffix array.
12835/// - `plcp` (`[0..n-1]`): the output permuted longest common prefix array.
12836///
12837/// Returns 0 on success, -1 on error.
12838pub fn libsais64_plcp_gsa(t: &[u8], sa: &[SaSint], plcp: &mut [SaSint]) -> SaSint {
12839    if t.last().copied().unwrap_or(0) != 0 {
12840        return -1;
12841    }
12842    if sa.len() != t.len() || plcp.len() != t.len() {
12843        return -1;
12844    }
12845    if !suffix_entries_in_bounds(sa, t.len()) {
12846        return -1;
12847    }
12848    if t.len() <= 1 {
12849        if t.len() == 1 {
12850            plcp[0] = 0;
12851        }
12852        return 0;
12853    }
12854
12855    let n = SaSint::try_from(t.len()).expect("input length must fit SaSint");
12856    compute_phi_omp(sa, plcp, n, 1);
12857    compute_plcp_gsa_omp(t, plcp, n, 1);
12858    0
12859}
12860
12861/// Constructs the PLCP of a given integer array and suffix array.
12862///
12863/// - `t` (`[0..n-1]`): the input integer array.
12864/// - `sa` (`[0..n-1]`): the input suffix array.
12865/// - `plcp` (`[0..n-1]`): the output permuted longest common prefix array.
12866///
12867/// Returns 0 on success, -1 on error.
12868pub fn libsais64_plcp_int(t: &[SaSint], sa: &[SaSint], plcp: &mut [SaSint]) -> SaSint {
12869    if sa.len() != t.len() || plcp.len() != t.len() {
12870        return -1;
12871    }
12872    if !suffix_entries_in_bounds(sa, t.len()) {
12873        return -1;
12874    }
12875    if t.len() <= 1 {
12876        if t.len() == 1 {
12877            plcp[0] = 0;
12878        }
12879        return 0;
12880    }
12881
12882    let n = SaSint::try_from(t.len()).expect("input length must fit SaSint");
12883    compute_phi_omp(sa, plcp, n, 1);
12884    compute_plcp_int_omp(t, plcp, n, 1);
12885    0
12886}
12887
12888/// Constructs the longest common prefix array (LCP) from a PLCP and suffix array.
12889///
12890/// - `plcp` (`[0..n-1]`): the input permuted longest common prefix array.
12891/// - `sa` (`[0..n-1]`): the input suffix array or generalized suffix array (GSA).
12892/// - `lcp` (`[0..n-1]`): the output longest common prefix array (can alias `sa`).
12893///
12894/// Returns 0 on success, -1 on error.
12895pub fn libsais64_lcp(plcp: &[SaSint], sa: &[SaSint], lcp: &mut [SaSint]) -> SaSint {
12896    if plcp.len() != sa.len() || lcp.len() != sa.len() {
12897        return -1;
12898    }
12899    if !suffix_entries_in_bounds(sa, plcp.len()) {
12900        return -1;
12901    }
12902    if sa.len() <= 1 {
12903        if sa.len() == 1 {
12904            lcp[0] = plcp[usize::try_from(sa[0]).expect("suffix index must be non-negative")];
12905        }
12906        return 0;
12907    }
12908
12909    compute_lcp_omp(
12910        plcp,
12911        sa,
12912        lcp,
12913        SaSint::try_from(sa.len()).expect("suffix array length must fit SaSint"),
12914        1,
12915    );
12916    0
12917}
12918
12919/// Constructs the PLCP of a given string and suffix array in parallel using OpenMP-style threading.
12920///
12921/// - `t` (`[0..n-1]`): the input string.
12922/// - `sa` (`[0..n-1]`): the input suffix array.
12923/// - `plcp` (`[0..n-1]`): the output permuted longest common prefix array.
12924/// - `threads`: number of worker threads (can be 0 for the implementation default).
12925///
12926/// Returns 0 on success, -1 on error.
12927pub fn libsais64_plcp_omp(t: &[u8], sa: &[SaSint], plcp: &mut [SaSint], threads: SaSint) -> SaSint {
12928    if threads < 0 {
12929        return -1;
12930    }
12931    if sa.len() != t.len() || plcp.len() != t.len() {
12932        return -1;
12933    }
12934    if !suffix_entries_in_bounds(sa, t.len()) {
12935        return -1;
12936    }
12937    if t.len() <= 1 {
12938        if t.len() == 1 {
12939            plcp[0] = 0;
12940        }
12941        return 0;
12942    }
12943
12944    let n = SaSint::try_from(t.len()).expect("input length must fit SaSint");
12945    let threads = normalize_omp_threads(threads);
12946    compute_phi_omp(sa, plcp, n, threads);
12947    compute_plcp_omp(t, plcp, n, threads);
12948    0
12949}
12950
12951/// Constructs the PLCP of a given string set and GSA in parallel using OpenMP-style threading.
12952///
12953/// - `t` (`[0..n-1]`): the input string set using 0 as separators (`t[n-1]` must be 0).
12954/// - `sa` (`[0..n-1]`): the input generalized suffix array.
12955/// - `plcp` (`[0..n-1]`): the output permuted longest common prefix array.
12956/// - `threads`: number of worker threads (can be 0 for the implementation default).
12957///
12958/// Returns 0 on success, -1 on error.
12959pub fn libsais64_plcp_gsa_omp(
12960    t: &[u8],
12961    sa: &[SaSint],
12962    plcp: &mut [SaSint],
12963    threads: SaSint,
12964) -> SaSint {
12965    if threads < 0 || t.last().copied().unwrap_or(0) != 0 {
12966        return -1;
12967    }
12968    if sa.len() != t.len() || plcp.len() != t.len() {
12969        return -1;
12970    }
12971    if !suffix_entries_in_bounds(sa, t.len()) {
12972        return -1;
12973    }
12974    if t.len() <= 1 {
12975        if t.len() == 1 {
12976            plcp[0] = 0;
12977        }
12978        return 0;
12979    }
12980
12981    let n = SaSint::try_from(t.len()).expect("input length must fit SaSint");
12982    let threads = normalize_omp_threads(threads);
12983    compute_phi_omp(sa, plcp, n, threads);
12984    compute_plcp_gsa_omp(t, plcp, n, threads);
12985    0
12986}
12987
12988/// Constructs the PLCP of a given integer array and suffix array in parallel using OpenMP-style threading.
12989///
12990/// - `t` (`[0..n-1]`): the input integer array.
12991/// - `sa` (`[0..n-1]`): the input suffix array.
12992/// - `plcp` (`[0..n-1]`): the output permuted longest common prefix array.
12993/// - `threads`: number of worker threads (can be 0 for the implementation default).
12994///
12995/// Returns 0 on success, -1 on error.
12996pub fn libsais64_plcp_int_omp(
12997    t: &[SaSint],
12998    sa: &[SaSint],
12999    plcp: &mut [SaSint],
13000    threads: SaSint,
13001) -> SaSint {
13002    if threads < 0 {
13003        return -1;
13004    }
13005    if sa.len() != t.len() || plcp.len() != t.len() {
13006        return -1;
13007    }
13008    if !suffix_entries_in_bounds(sa, t.len()) {
13009        return -1;
13010    }
13011    if t.len() <= 1 {
13012        if t.len() == 1 {
13013            plcp[0] = 0;
13014        }
13015        return 0;
13016    }
13017
13018    let n = SaSint::try_from(t.len()).expect("input length must fit SaSint");
13019    let threads = normalize_omp_threads(threads);
13020    compute_phi_omp(sa, plcp, n, threads);
13021    compute_plcp_int_omp(t, plcp, n, threads);
13022    0
13023}
13024
13025/// Constructs the LCP from a PLCP and suffix array in parallel using OpenMP-style threading.
13026///
13027/// - `plcp` (`[0..n-1]`): the input permuted longest common prefix array.
13028/// - `sa` (`[0..n-1]`): the input suffix array or generalized suffix array (GSA).
13029/// - `lcp` (`[0..n-1]`): the output longest common prefix array (can alias `sa`).
13030/// - `threads`: number of worker threads (can be 0 for the implementation default).
13031///
13032/// Returns 0 on success, -1 on error.
13033pub fn libsais64_lcp_omp(
13034    plcp: &[SaSint],
13035    sa: &[SaSint],
13036    lcp: &mut [SaSint],
13037    threads: SaSint,
13038) -> SaSint {
13039    if threads < 0 {
13040        return -1;
13041    }
13042    if plcp.len() != sa.len() || lcp.len() != sa.len() {
13043        return -1;
13044    }
13045    if !suffix_entries_in_bounds(sa, plcp.len()) {
13046        return -1;
13047    }
13048    if sa.len() <= 1 {
13049        if sa.len() == 1 {
13050            lcp[0] = plcp[usize::try_from(sa[0]).expect("suffix index must be non-negative")];
13051        }
13052        return 0;
13053    }
13054
13055    compute_lcp_omp(
13056        plcp,
13057        sa,
13058        lcp,
13059        SaSint::try_from(sa.len()).expect("suffix array length must fit SaSint"),
13060        normalize_omp_threads(threads),
13061    );
13062    0
13063}
13064
13065fn suffix_entries_in_bounds(sa: &[SaSint], len: usize) -> bool {
13066    sa.iter()
13067        .all(|&value| usize::try_from(value).is_ok_and(|index| index < len))
13068}
13069
13070/// Internal helper: unbwt compute histogram.
13071#[doc(hidden)]
13072pub fn unbwt_compute_histogram(t: &[u8], n: FastSint, count: &mut [SaUint]) {
13073    let n = usize::try_from(n).expect("n must be non-negative");
13074    assert!(count.len() >= ALPHABET_SIZE);
13075    for &byte in &t[..n] {
13076        count[byte as usize] += 1;
13077    }
13078}
13079
13080/// Internal helper: unbwt transpose bucket2.
13081#[doc(hidden)]
13082pub fn unbwt_transpose_bucket2(bucket2: &mut [SaUint]) {
13083    assert!(bucket2.len() >= ALPHABET_SIZE * ALPHABET_SIZE);
13084    for x in 0..ALPHABET_SIZE {
13085        for y in x + 1..ALPHABET_SIZE {
13086            bucket2.swap((y << 8) + x, (x << 8) + y);
13087        }
13088    }
13089}
13090
13091/// Internal helper: unbwt compute bigram histogram single.
13092#[doc(hidden)]
13093pub fn unbwt_compute_bigram_histogram_single(
13094    t: &[u8],
13095    bucket1: &mut [SaUint],
13096    bucket2: &mut [SaUint],
13097    index: FastUint,
13098) {
13099    let mut sum = 1usize;
13100    for c in 0..ALPHABET_SIZE {
13101        let prev = sum;
13102        sum += bucket1[c] as usize;
13103        bucket1[c] = prev as SaUint;
13104        if prev != sum {
13105            let bucket2_p = &mut bucket2[c << 8..(c + 1) << 8];
13106
13107            let hi = sum.min(index);
13108            if hi > prev {
13109                unbwt_compute_histogram(&t[prev..], (hi - prev) as FastSint, bucket2_p);
13110            }
13111
13112            let lo = prev.max(index + 1);
13113            if sum > lo {
13114                unbwt_compute_histogram(&t[lo - 1..], (sum - lo) as FastSint, bucket2_p);
13115            }
13116        }
13117    }
13118
13119    unbwt_transpose_bucket2(bucket2);
13120}
13121
13122/// Internal helper: unbwt calculate fastbits.
13123#[doc(hidden)]
13124pub fn unbwt_calculate_fastbits(
13125    bucket2: &mut [SaUint],
13126    fastbits: &mut [u16],
13127    lastc: FastUint,
13128    shift: FastUint,
13129) {
13130    let mut v = 0usize;
13131    let mut w = 0usize;
13132    let mut sum = 1usize;
13133
13134    for c in 0..ALPHABET_SIZE {
13135        if c == lastc {
13136            sum += 1;
13137        }
13138
13139        for _d in 0..ALPHABET_SIZE {
13140            let prev = sum;
13141            sum += bucket2[w] as usize;
13142            bucket2[w] = prev as SaUint;
13143            if prev != sum {
13144                while v <= ((sum - 1) >> shift) {
13145                    fastbits[v] = w as u16;
13146                    v += 1;
13147                }
13148            }
13149            w += 1;
13150        }
13151    }
13152}
13153
13154/// Internal helper: unbwt calculate bi psi.
13155#[doc(hidden)]
13156pub fn unbwt_calculate_bi_psi(
13157    t: &[u8],
13158    p: &mut [SaUint],
13159    bucket1: &mut [SaUint],
13160    bucket2: &mut [SaUint],
13161    index: FastUint,
13162    omp_block_start: FastSint,
13163    omp_block_end: FastSint,
13164) {
13165    let mut i = usize::try_from(omp_block_start).expect("omp_block_start must be non-negative");
13166    let mut j = index;
13167    let block_end = usize::try_from(omp_block_end).expect("omp_block_end must be non-negative");
13168    if block_end < j {
13169        j = block_end;
13170    }
13171    while i < j {
13172        let c = t[i] as usize;
13173        let pidx = bucket1[c] as usize;
13174        bucket1[c] += 1;
13175        let tidx = index.wrapping_sub(pidx) as i64;
13176        if tidx != 0 {
13177            let src = pidx.wrapping_add((tidx >> 63) as usize);
13178            let w = ((t[src] as usize) << 8) + c;
13179            let dst = bucket2[w] as usize;
13180            p[dst] = i as SaUint;
13181            bucket2[w] += 1;
13182        }
13183        i += 1;
13184    }
13185
13186    let mut i = index;
13187    if usize::try_from(omp_block_start).expect("omp_block_start must be non-negative") > i {
13188        i = usize::try_from(omp_block_start).expect("omp_block_start must be non-negative");
13189    }
13190    i += 1;
13191    while i <= block_end {
13192        let c = t[i - 1] as usize;
13193        let pidx = bucket1[c] as usize;
13194        bucket1[c] += 1;
13195        let tidx = index.wrapping_sub(pidx) as i64;
13196        if tidx != 0 {
13197            let src = pidx.wrapping_add((tidx >> 63) as usize);
13198            let w = ((t[src] as usize) << 8) + c;
13199            let dst = bucket2[w] as usize;
13200            p[dst] = i as SaUint;
13201            bucket2[w] += 1;
13202        }
13203        i += 1;
13204    }
13205}
13206
13207/// Internal helper: unbwt calculate biPSI.
13208#[doc(hidden)]
13209#[allow(dead_code, non_snake_case)]
13210pub fn unbwt_calculate_biPSI(
13211    t: &[u8],
13212    p: &mut [SaUint],
13213    bucket1: &mut [SaUint],
13214    bucket2: &mut [SaUint],
13215    index: FastUint,
13216    omp_block_start: FastSint,
13217    omp_block_end: FastSint,
13218) {
13219    unbwt_calculate_bi_psi(
13220        t,
13221        p,
13222        bucket1,
13223        bucket2,
13224        index,
13225        omp_block_start,
13226        omp_block_end,
13227    );
13228}
13229
13230/// Internal helper: unbwt init single.
13231#[doc(hidden)]
13232pub fn unbwt_init_single(
13233    t: &[u8],
13234    p: &mut [SaUint],
13235    n: SaSint,
13236    freq: Option<&[SaSint]>,
13237    i: &[SaUint],
13238    bucket2: &mut [SaUint],
13239    fastbits: &mut [u16],
13240) {
13241    let mut bucket1 = vec![0u64; ALPHABET_SIZE];
13242    let index = i[0] as usize;
13243    let lastc = t[0] as usize;
13244    let mut shift = 0usize;
13245    while (usize::try_from(n).expect("n must be non-negative") >> shift)
13246        > (1usize << UNBWT_FASTBITS)
13247    {
13248        shift += 1;
13249    }
13250
13251    if let Some(freq) = freq {
13252        for c in 0..ALPHABET_SIZE {
13253            bucket1[c] = freq[c] as SaUint;
13254        }
13255    } else {
13256        unbwt_compute_histogram(t, n as FastSint, &mut bucket1);
13257    }
13258
13259    bucket2.fill(0);
13260    unbwt_compute_bigram_histogram_single(t, &mut bucket1, bucket2, index);
13261    unbwt_calculate_fastbits(bucket2, fastbits, lastc, shift);
13262    unbwt_calculate_bi_psi(t, p, &mut bucket1, bucket2, index, 0, n as FastSint);
13263}
13264
13265/// Internal helper: unbwt compute bigram histogram parallel.
13266#[doc(hidden)]
13267pub fn unbwt_compute_bigram_histogram_parallel(
13268    t: &[u8],
13269    index: FastUint,
13270    bucket1: &mut [SaUint],
13271    bucket2: &mut [SaUint],
13272    omp_block_start: FastSint,
13273    omp_block_size: FastSint,
13274) {
13275    let start = usize::try_from(omp_block_start).expect("omp_block_start must be non-negative");
13276    let end = start + usize::try_from(omp_block_size).expect("omp_block_size must be non-negative");
13277    for &c_u8 in &t[start..end] {
13278        let c = c_u8 as usize;
13279        let p = bucket1[c] as usize;
13280        bucket1[c] += 1;
13281        let tidx = index.wrapping_sub(p) as i64;
13282        if tidx != 0 {
13283            let src = p.wrapping_add((tidx >> 63) as usize);
13284            let w = ((t[src] as usize) << 8) + c;
13285            bucket2[w] += 1;
13286        }
13287    }
13288}
13289
13290/// Internal helper: unbwt init parallel.
13291#[doc(hidden)]
13292pub fn unbwt_init_parallel(
13293    t: &[u8],
13294    p: &mut [SaUint],
13295    n: SaSint,
13296    freq: Option<&[SaSint]>,
13297    i: &[SaUint],
13298    bucket2: &mut [SaUint],
13299    fastbits: &mut [u16],
13300    buckets: Option<&mut [SaUint]>,
13301    threads: SaSint,
13302) {
13303    let num_threads = usize::try_from(threads.max(1)).expect("threads must be non-negative");
13304    if num_threads <= 1 || usize::try_from(n).expect("n must be non-negative") < 65_536 {
13305        unbwt_init_single(t, p, n, freq, i, bucket2, fastbits);
13306        return;
13307    }
13308
13309    let buckets = match buckets {
13310        Some(buckets) => buckets,
13311        None => {
13312            unbwt_init_single(t, p, n, freq, i, bucket2, fastbits);
13313            return;
13314        }
13315    };
13316
13317    let segment_len = ALPHABET_SIZE + ALPHABET_SIZE * ALPHABET_SIZE;
13318    assert!(buckets.len() >= num_threads * segment_len);
13319
13320    let index = i[0] as usize;
13321    let lastc = t[0] as usize;
13322    let mut shift = 0usize;
13323    while (usize::try_from(n).expect("n must be non-negative") >> shift)
13324        > (1usize << UNBWT_FASTBITS)
13325    {
13326        shift += 1;
13327    }
13328
13329    let mut bucket1 = vec![0u64; ALPHABET_SIZE];
13330    bucket2.fill(0);
13331
13332    let n_fast = n as FastSint;
13333    let block_stride = (n_fast / num_threads as FastSint) & (-16);
13334    let mut block_starts = vec![0usize; num_threads];
13335    let mut block_sizes = vec![0usize; num_threads];
13336
13337    for thread in 0..num_threads {
13338        let start = usize::try_from(thread as FastSint * block_stride)
13339            .expect("block start must be non-negative");
13340        let size = if thread + 1 < num_threads {
13341            usize::try_from(block_stride).expect("block stride must be non-negative")
13342        } else {
13343            usize::try_from(n_fast - thread as FastSint * block_stride)
13344                .expect("block size must be non-negative")
13345        };
13346        block_starts[thread] = start;
13347        block_sizes[thread] = size;
13348
13349        let segment = &mut buckets[thread * segment_len..(thread + 1) * segment_len];
13350        let (bucket1_local, _) = segment.split_at_mut(ALPHABET_SIZE);
13351        bucket1_local.fill(0);
13352        unbwt_compute_histogram(&t[start..], size as FastSint, bucket1_local);
13353    }
13354
13355    for thread in 0..num_threads {
13356        let segment = &mut buckets[thread * segment_len..(thread + 1) * segment_len];
13357        let (bucket1_temp, _) = segment.split_at_mut(ALPHABET_SIZE);
13358        for c in 0..ALPHABET_SIZE {
13359            let a = bucket1[c];
13360            let b = bucket1_temp[c];
13361            bucket1[c] = a + b;
13362            bucket1_temp[c] = a;
13363        }
13364    }
13365
13366    let mut sum = 1usize;
13367    for c in 0..ALPHABET_SIZE {
13368        let prev = sum;
13369        sum += bucket1[c] as usize;
13370        bucket1[c] = prev as SaUint;
13371    }
13372
13373    for thread in 0..num_threads {
13374        let start = block_starts[thread];
13375        let size = block_sizes[thread];
13376        let segment = &mut buckets[thread * segment_len..(thread + 1) * segment_len];
13377        let (bucket1_local, bucket2_local) = segment.split_at_mut(ALPHABET_SIZE);
13378        for c in 0..ALPHABET_SIZE {
13379            bucket1_local[c] += bucket1[c];
13380        }
13381        bucket2_local.fill(0);
13382        unbwt_compute_bigram_histogram_parallel(
13383            t,
13384            index,
13385            bucket1_local,
13386            bucket2_local,
13387            start as FastSint,
13388            size as FastSint,
13389        );
13390    }
13391
13392    for thread in 0..num_threads {
13393        let segment = &mut buckets[thread * segment_len..(thread + 1) * segment_len];
13394        let (_, bucket2_temp) = segment.split_at_mut(ALPHABET_SIZE);
13395        for c in 0..ALPHABET_SIZE * ALPHABET_SIZE {
13396            let a = bucket2[c];
13397            let b = bucket2_temp[c];
13398            bucket2[c] = a + b;
13399            bucket2_temp[c] = a;
13400        }
13401    }
13402
13403    unbwt_calculate_fastbits(bucket2, fastbits, lastc, shift);
13404
13405    for thread in (1..num_threads).rev() {
13406        let src_start = (thread - 1) * segment_len;
13407        let dst_start = thread * segment_len;
13408        let (head, tail) = buckets.split_at_mut(dst_start);
13409        let src = &head[src_start..src_start + ALPHABET_SIZE];
13410        let dst = &mut tail[..ALPHABET_SIZE];
13411        dst.copy_from_slice(src);
13412    }
13413    buckets[..ALPHABET_SIZE].copy_from_slice(&bucket1);
13414
13415    for thread in 0..num_threads {
13416        let start = block_starts[thread];
13417        let size = block_sizes[thread];
13418        let segment = &mut buckets[thread * segment_len..(thread + 1) * segment_len];
13419        let (bucket1_local, bucket2_local) = segment.split_at_mut(ALPHABET_SIZE);
13420        for c in 0..ALPHABET_SIZE * ALPHABET_SIZE {
13421            bucket2_local[c] += bucket2[c];
13422        }
13423        unbwt_calculate_bi_psi(
13424            t,
13425            p,
13426            bucket1_local,
13427            bucket2_local,
13428            index,
13429            start as FastSint,
13430            (start + size) as FastSint,
13431        );
13432    }
13433
13434    let last_segment = &buckets[(num_threads - 1) * segment_len..num_threads * segment_len];
13435    let (_, last_bucket2) = last_segment.split_at(ALPHABET_SIZE);
13436    bucket2.copy_from_slice(last_bucket2);
13437}
13438
/// Returns `value` arranged so that storing it in native byte order writes the high
/// byte first.
///
/// On little-endian targets this swaps the two bytes; on big-endian targets the value
/// is returned unchanged. An unconditional swap followed by a native-order store would
/// emit the bytes reversed on big-endian hosts.
fn bswap16(value: u16) -> u16 {
    value.to_be()
}
13442
13443fn unbwt_resolve_symbol(bucket2: &[SaUint], fastbits: &[u16], shift: FastUint, p: SaUint) -> u16 {
13444    let mut c = fastbits[(p as usize) >> shift];
13445    while bucket2[c as usize] <= p {
13446        c += 1;
13447    }
13448    c
13449}
13450
13451/// Internal helper: unbwt decode 1.
13452#[doc(hidden)]
13453pub fn unbwt_decode_1(
13454    u: &mut [u8],
13455    p: &[SaUint],
13456    bucket2: &[SaUint],
13457    fastbits: &[u16],
13458    shift: FastUint,
13459    i0: &mut FastUint,
13460    k: FastUint,
13461) {
13462    let words = &mut u[..2 * k];
13463    let mut p0 = *i0 as SaUint;
13464
13465    for i in 0..k {
13466        let c0 = unbwt_resolve_symbol(bucket2, fastbits, shift, p0);
13467        p0 = p[p0 as usize];
13468        let bytes = bswap16(c0).to_ne_bytes();
13469        words[2 * i] = bytes[0];
13470        words[2 * i + 1] = bytes[1];
13471    }
13472
13473    *i0 = p0 as FastUint;
13474}
13475
13476/// Internal helper: unbwt decode 2.
13477#[doc(hidden)]
13478pub fn unbwt_decode_2(
13479    u: &mut [u8],
13480    p: &[SaUint],
13481    bucket2: &[SaUint],
13482    fastbits: &[u16],
13483    shift: FastUint,
13484    r: FastUint,
13485    i0: &mut FastUint,
13486    i1: &mut FastUint,
13487    k: FastUint,
13488) {
13489    let width = 2 * k;
13490    unbwt_decode_1(&mut u[0..width], p, bucket2, fastbits, shift, i0, k);
13491    unbwt_decode_1(&mut u[r..r + width], p, bucket2, fastbits, shift, i1, k);
13492}
13493
13494/// Internal helper: unbwt decode 3.
13495#[doc(hidden)]
13496pub fn unbwt_decode_3(
13497    u: &mut [u8],
13498    p: &[SaUint],
13499    bucket2: &[SaUint],
13500    fastbits: &[u16],
13501    shift: FastUint,
13502    r: FastUint,
13503    i0: &mut FastUint,
13504    i1: &mut FastUint,
13505    i2: &mut FastUint,
13506    k: FastUint,
13507) {
13508    let width = 2 * k;
13509    unbwt_decode_1(&mut u[0..width], p, bucket2, fastbits, shift, i0, k);
13510    unbwt_decode_1(&mut u[r..r + width], p, bucket2, fastbits, shift, i1, k);
13511    unbwt_decode_1(
13512        &mut u[2 * r..2 * r + width],
13513        p,
13514        bucket2,
13515        fastbits,
13516        shift,
13517        i2,
13518        k,
13519    );
13520}
13521
13522/// Internal helper: unbwt decode 4.
13523#[doc(hidden)]
13524pub fn unbwt_decode_4(
13525    u: &mut [u8],
13526    p: &[SaUint],
13527    bucket2: &[SaUint],
13528    fastbits: &[u16],
13529    shift: FastUint,
13530    r: FastUint,
13531    i0: &mut FastUint,
13532    i1: &mut FastUint,
13533    i2: &mut FastUint,
13534    i3: &mut FastUint,
13535    k: FastUint,
13536) {
13537    let width = 2 * k;
13538    unbwt_decode_1(&mut u[0..width], p, bucket2, fastbits, shift, i0, k);
13539    unbwt_decode_1(&mut u[r..r + width], p, bucket2, fastbits, shift, i1, k);
13540    unbwt_decode_1(
13541        &mut u[2 * r..2 * r + width],
13542        p,
13543        bucket2,
13544        fastbits,
13545        shift,
13546        i2,
13547        k,
13548    );
13549    unbwt_decode_1(
13550        &mut u[3 * r..3 * r + width],
13551        p,
13552        bucket2,
13553        fastbits,
13554        shift,
13555        i3,
13556        k,
13557    );
13558}
13559
13560/// Internal helper: unbwt decode 5.
13561#[doc(hidden)]
13562pub fn unbwt_decode_5(
13563    u: &mut [u8],
13564    p: &[SaUint],
13565    bucket2: &[SaUint],
13566    fastbits: &[u16],
13567    shift: FastUint,
13568    r: FastUint,
13569    i0: &mut FastUint,
13570    i1: &mut FastUint,
13571    i2: &mut FastUint,
13572    i3: &mut FastUint,
13573    i4: &mut FastUint,
13574    k: FastUint,
13575) {
13576    let width = 2 * k;
13577    unbwt_decode_1(&mut u[0..width], p, bucket2, fastbits, shift, i0, k);
13578    unbwt_decode_1(&mut u[r..r + width], p, bucket2, fastbits, shift, i1, k);
13579    unbwt_decode_1(
13580        &mut u[2 * r..2 * r + width],
13581        p,
13582        bucket2,
13583        fastbits,
13584        shift,
13585        i2,
13586        k,
13587    );
13588    unbwt_decode_1(
13589        &mut u[3 * r..3 * r + width],
13590        p,
13591        bucket2,
13592        fastbits,
13593        shift,
13594        i3,
13595        k,
13596    );
13597    unbwt_decode_1(
13598        &mut u[4 * r..4 * r + width],
13599        p,
13600        bucket2,
13601        fastbits,
13602        shift,
13603        i4,
13604        k,
13605    );
13606}
13607
13608/// Internal helper: unbwt decode 6.
13609#[doc(hidden)]
13610pub fn unbwt_decode_6(
13611    u: &mut [u8],
13612    p: &[SaUint],
13613    bucket2: &[SaUint],
13614    fastbits: &[u16],
13615    shift: FastUint,
13616    r: FastUint,
13617    i0: &mut FastUint,
13618    i1: &mut FastUint,
13619    i2: &mut FastUint,
13620    i3: &mut FastUint,
13621    i4: &mut FastUint,
13622    i5: &mut FastUint,
13623    k: FastUint,
13624) {
13625    let width = 2 * k;
13626    unbwt_decode_1(&mut u[0..width], p, bucket2, fastbits, shift, i0, k);
13627    unbwt_decode_1(&mut u[r..r + width], p, bucket2, fastbits, shift, i1, k);
13628    unbwt_decode_1(
13629        &mut u[2 * r..2 * r + width],
13630        p,
13631        bucket2,
13632        fastbits,
13633        shift,
13634        i2,
13635        k,
13636    );
13637    unbwt_decode_1(
13638        &mut u[3 * r..3 * r + width],
13639        p,
13640        bucket2,
13641        fastbits,
13642        shift,
13643        i3,
13644        k,
13645    );
13646    unbwt_decode_1(
13647        &mut u[4 * r..4 * r + width],
13648        p,
13649        bucket2,
13650        fastbits,
13651        shift,
13652        i4,
13653        k,
13654    );
13655    unbwt_decode_1(
13656        &mut u[5 * r..5 * r + width],
13657        p,
13658        bucket2,
13659        fastbits,
13660        shift,
13661        i5,
13662        k,
13663    );
13664}
13665
13666/// Internal helper: unbwt decode 7.
13667#[doc(hidden)]
13668pub fn unbwt_decode_7(
13669    u: &mut [u8],
13670    p: &[SaUint],
13671    bucket2: &[SaUint],
13672    fastbits: &[u16],
13673    shift: FastUint,
13674    r: FastUint,
13675    i0: &mut FastUint,
13676    i1: &mut FastUint,
13677    i2: &mut FastUint,
13678    i3: &mut FastUint,
13679    i4: &mut FastUint,
13680    i5: &mut FastUint,
13681    i6: &mut FastUint,
13682    k: FastUint,
13683) {
13684    let width = 2 * k;
13685    unbwt_decode_1(&mut u[0..width], p, bucket2, fastbits, shift, i0, k);
13686    unbwt_decode_1(&mut u[r..r + width], p, bucket2, fastbits, shift, i1, k);
13687    unbwt_decode_1(
13688        &mut u[2 * r..2 * r + width],
13689        p,
13690        bucket2,
13691        fastbits,
13692        shift,
13693        i2,
13694        k,
13695    );
13696    unbwt_decode_1(
13697        &mut u[3 * r..3 * r + width],
13698        p,
13699        bucket2,
13700        fastbits,
13701        shift,
13702        i3,
13703        k,
13704    );
13705    unbwt_decode_1(
13706        &mut u[4 * r..4 * r + width],
13707        p,
13708        bucket2,
13709        fastbits,
13710        shift,
13711        i4,
13712        k,
13713    );
13714    unbwt_decode_1(
13715        &mut u[5 * r..5 * r + width],
13716        p,
13717        bucket2,
13718        fastbits,
13719        shift,
13720        i5,
13721        k,
13722    );
13723    unbwt_decode_1(
13724        &mut u[6 * r..6 * r + width],
13725        p,
13726        bucket2,
13727        fastbits,
13728        shift,
13729        i6,
13730        k,
13731    );
13732}
13733
13734/// Internal helper: unbwt decode 8.
13735#[doc(hidden)]
13736pub fn unbwt_decode_8(
13737    u: &mut [u8],
13738    p: &[SaUint],
13739    bucket2: &[SaUint],
13740    fastbits: &[u16],
13741    shift: FastUint,
13742    r: FastUint,
13743    i0: &mut FastUint,
13744    i1: &mut FastUint,
13745    i2: &mut FastUint,
13746    i3: &mut FastUint,
13747    i4: &mut FastUint,
13748    i5: &mut FastUint,
13749    i6: &mut FastUint,
13750    i7: &mut FastUint,
13751    k: FastUint,
13752) {
13753    let width = 2 * k;
13754    unbwt_decode_1(&mut u[0..width], p, bucket2, fastbits, shift, i0, k);
13755    unbwt_decode_1(&mut u[r..r + width], p, bucket2, fastbits, shift, i1, k);
13756    unbwt_decode_1(
13757        &mut u[2 * r..2 * r + width],
13758        p,
13759        bucket2,
13760        fastbits,
13761        shift,
13762        i2,
13763        k,
13764    );
13765    unbwt_decode_1(
13766        &mut u[3 * r..3 * r + width],
13767        p,
13768        bucket2,
13769        fastbits,
13770        shift,
13771        i3,
13772        k,
13773    );
13774    unbwt_decode_1(
13775        &mut u[4 * r..4 * r + width],
13776        p,
13777        bucket2,
13778        fastbits,
13779        shift,
13780        i4,
13781        k,
13782    );
13783    unbwt_decode_1(
13784        &mut u[5 * r..5 * r + width],
13785        p,
13786        bucket2,
13787        fastbits,
13788        shift,
13789        i5,
13790        k,
13791    );
13792    unbwt_decode_1(
13793        &mut u[6 * r..6 * r + width],
13794        p,
13795        bucket2,
13796        fastbits,
13797        shift,
13798        i6,
13799        k,
13800    );
13801    unbwt_decode_1(
13802        &mut u[7 * r..7 * r + width],
13803        p,
13804        bucket2,
13805        fastbits,
13806        shift,
13807        i7,
13808        k,
13809    );
13810}
13811
13812/// Internal helper: unbwt decode.
13813#[doc(hidden)]
13814pub fn unbwt_decode(
13815    u: &mut [u8],
13816    p: &[SaUint],
13817    n: SaSint,
13818    r: SaSint,
13819    i: &[SaUint],
13820    bucket2: &[SaUint],
13821    fastbits: &[u16],
13822    mut blocks: FastSint,
13823    remainder: FastUint,
13824) {
13825    let mut shift = 0usize;
13826    while (usize::try_from(n).expect("n must be non-negative") >> shift)
13827        > (1usize << UNBWT_FASTBITS)
13828    {
13829        shift += 1;
13830    }
13831    let mut offset = 0usize;
13832    let mut i_index = 0usize;
13833    let r_usize = usize::try_from(r).expect("r must be non-negative");
13834
13835    while blocks > 8 {
13836        let mut i0 = i[i_index] as FastUint;
13837        let mut i1 = i[i_index + 1] as FastUint;
13838        let mut i2 = i[i_index + 2] as FastUint;
13839        let mut i3 = i[i_index + 3] as FastUint;
13840        let mut i4 = i[i_index + 4] as FastUint;
13841        let mut i5 = i[i_index + 5] as FastUint;
13842        let mut i6 = i[i_index + 6] as FastUint;
13843        let mut i7 = i[i_index + 7] as FastUint;
13844        unbwt_decode_8(
13845            &mut u[offset..],
13846            p,
13847            bucket2,
13848            fastbits,
13849            shift,
13850            r_usize,
13851            &mut i0,
13852            &mut i1,
13853            &mut i2,
13854            &mut i3,
13855            &mut i4,
13856            &mut i5,
13857            &mut i6,
13858            &mut i7,
13859            r_usize >> 1,
13860        );
13861        i_index += 8;
13862        blocks -= 8;
13863        offset += 8 * r_usize;
13864    }
13865
13866    match blocks {
13867        1 => {
13868            let mut i0 = i[i_index] as FastUint;
13869            unbwt_decode_1(
13870                &mut u[offset..],
13871                p,
13872                bucket2,
13873                fastbits,
13874                shift,
13875                &mut i0,
13876                remainder >> 1,
13877            );
13878        }
13879        2 => {
13880            let mut i0 = i[i_index] as FastUint;
13881            let mut i1 = i[i_index + 1] as FastUint;
13882            unbwt_decode_2(
13883                &mut u[offset..],
13884                p,
13885                bucket2,
13886                fastbits,
13887                shift,
13888                r_usize,
13889                &mut i0,
13890                &mut i1,
13891                remainder >> 1,
13892            );
13893            unbwt_decode_1(
13894                &mut u[offset + 2 * (remainder >> 1)..],
13895                p,
13896                bucket2,
13897                fastbits,
13898                shift,
13899                &mut i0,
13900                (r_usize >> 1) - (remainder >> 1),
13901            );
13902        }
13903        3 => {
13904            let mut i0 = i[i_index] as FastUint;
13905            let mut i1 = i[i_index + 1] as FastUint;
13906            let mut i2 = i[i_index + 2] as FastUint;
13907            unbwt_decode_3(
13908                &mut u[offset..],
13909                p,
13910                bucket2,
13911                fastbits,
13912                shift,
13913                r_usize,
13914                &mut i0,
13915                &mut i1,
13916                &mut i2,
13917                remainder >> 1,
13918            );
13919            unbwt_decode_2(
13920                &mut u[offset + 2 * (remainder >> 1)..],
13921                p,
13922                bucket2,
13923                fastbits,
13924                shift,
13925                r_usize,
13926                &mut i0,
13927                &mut i1,
13928                (r_usize >> 1) - (remainder >> 1),
13929            );
13930        }
13931        4 => {
13932            let mut i0 = i[i_index] as FastUint;
13933            let mut i1 = i[i_index + 1] as FastUint;
13934            let mut i2 = i[i_index + 2] as FastUint;
13935            let mut i3 = i[i_index + 3] as FastUint;
13936            unbwt_decode_4(
13937                &mut u[offset..],
13938                p,
13939                bucket2,
13940                fastbits,
13941                shift,
13942                r_usize,
13943                &mut i0,
13944                &mut i1,
13945                &mut i2,
13946                &mut i3,
13947                remainder >> 1,
13948            );
13949            unbwt_decode_3(
13950                &mut u[offset + 2 * (remainder >> 1)..],
13951                p,
13952                bucket2,
13953                fastbits,
13954                shift,
13955                r_usize,
13956                &mut i0,
13957                &mut i1,
13958                &mut i2,
13959                (r_usize >> 1) - (remainder >> 1),
13960            );
13961        }
13962        5 => {
13963            let mut i0 = i[i_index] as FastUint;
13964            let mut i1 = i[i_index + 1] as FastUint;
13965            let mut i2 = i[i_index + 2] as FastUint;
13966            let mut i3 = i[i_index + 3] as FastUint;
13967            let mut i4 = i[i_index + 4] as FastUint;
13968            unbwt_decode_5(
13969                &mut u[offset..],
13970                p,
13971                bucket2,
13972                fastbits,
13973                shift,
13974                r_usize,
13975                &mut i0,
13976                &mut i1,
13977                &mut i2,
13978                &mut i3,
13979                &mut i4,
13980                remainder >> 1,
13981            );
13982            unbwt_decode_4(
13983                &mut u[offset + 2 * (remainder >> 1)..],
13984                p,
13985                bucket2,
13986                fastbits,
13987                shift,
13988                r_usize,
13989                &mut i0,
13990                &mut i1,
13991                &mut i2,
13992                &mut i3,
13993                (r_usize >> 1) - (remainder >> 1),
13994            );
13995        }
13996        6 => {
13997            let mut i0 = i[i_index] as FastUint;
13998            let mut i1 = i[i_index + 1] as FastUint;
13999            let mut i2 = i[i_index + 2] as FastUint;
14000            let mut i3 = i[i_index + 3] as FastUint;
14001            let mut i4 = i[i_index + 4] as FastUint;
14002            let mut i5 = i[i_index + 5] as FastUint;
14003            unbwt_decode_6(
14004                &mut u[offset..],
14005                p,
14006                bucket2,
14007                fastbits,
14008                shift,
14009                r_usize,
14010                &mut i0,
14011                &mut i1,
14012                &mut i2,
14013                &mut i3,
14014                &mut i4,
14015                &mut i5,
14016                remainder >> 1,
14017            );
14018            unbwt_decode_5(
14019                &mut u[offset + 2 * (remainder >> 1)..],
14020                p,
14021                bucket2,
14022                fastbits,
14023                shift,
14024                r_usize,
14025                &mut i0,
14026                &mut i1,
14027                &mut i2,
14028                &mut i3,
14029                &mut i4,
14030                (r_usize >> 1) - (remainder >> 1),
14031            );
14032        }
14033        7 => {
14034            let mut i0 = i[i_index] as FastUint;
14035            let mut i1 = i[i_index + 1] as FastUint;
14036            let mut i2 = i[i_index + 2] as FastUint;
14037            let mut i3 = i[i_index + 3] as FastUint;
14038            let mut i4 = i[i_index + 4] as FastUint;
14039            let mut i5 = i[i_index + 5] as FastUint;
14040            let mut i6 = i[i_index + 6] as FastUint;
14041            unbwt_decode_7(
14042                &mut u[offset..],
14043                p,
14044                bucket2,
14045                fastbits,
14046                shift,
14047                r_usize,
14048                &mut i0,
14049                &mut i1,
14050                &mut i2,
14051                &mut i3,
14052                &mut i4,
14053                &mut i5,
14054                &mut i6,
14055                remainder >> 1,
14056            );
14057            unbwt_decode_6(
14058                &mut u[offset + 2 * (remainder >> 1)..],
14059                p,
14060                bucket2,
14061                fastbits,
14062                shift,
14063                r_usize,
14064                &mut i0,
14065                &mut i1,
14066                &mut i2,
14067                &mut i3,
14068                &mut i4,
14069                &mut i5,
14070                (r_usize >> 1) - (remainder >> 1),
14071            );
14072        }
14073        8 => {
14074            let mut i0 = i[i_index] as FastUint;
14075            let mut i1 = i[i_index + 1] as FastUint;
14076            let mut i2 = i[i_index + 2] as FastUint;
14077            let mut i3 = i[i_index + 3] as FastUint;
14078            let mut i4 = i[i_index + 4] as FastUint;
14079            let mut i5 = i[i_index + 5] as FastUint;
14080            let mut i6 = i[i_index + 6] as FastUint;
14081            let mut i7 = i[i_index + 7] as FastUint;
14082            unbwt_decode_8(
14083                &mut u[offset..],
14084                p,
14085                bucket2,
14086                fastbits,
14087                shift,
14088                r_usize,
14089                &mut i0,
14090                &mut i1,
14091                &mut i2,
14092                &mut i3,
14093                &mut i4,
14094                &mut i5,
14095                &mut i6,
14096                &mut i7,
14097                remainder >> 1,
14098            );
14099            unbwt_decode_7(
14100                &mut u[offset + 2 * (remainder >> 1)..],
14101                p,
14102                bucket2,
14103                fastbits,
14104                shift,
14105                r_usize,
14106                &mut i0,
14107                &mut i1,
14108                &mut i2,
14109                &mut i3,
14110                &mut i4,
14111                &mut i5,
14112                &mut i6,
14113                (r_usize >> 1) - (remainder >> 1),
14114            );
14115        }
14116        _ => {}
14117    }
14118}
14119
14120/// Internal helper: unbwt decode (OpenMP variant).
14121#[doc(hidden)]
14122pub fn unbwt_decode_omp(
14123    t: &[u8],
14124    u: &mut [u8],
14125    p: &[SaUint],
14126    n: SaSint,
14127    r: SaSint,
14128    i: &[SaUint],
14129    bucket2: &[SaUint],
14130    fastbits: &[u16],
14131    threads: SaSint,
14132) {
14133    let lastc = t[0];
14134    let blocks = 1 + ((n as FastSint - 1) / r as FastSint);
14135    let remainder = usize::try_from(n).expect("n must be non-negative")
14136        - usize::try_from(r).expect("r must be non-negative")
14137            * (usize::try_from(blocks).expect("blocks") - 1);
14138    let max_threads = usize::try_from(blocks.min(threads.max(1) as FastSint))
14139        .expect("thread count must fit usize");
14140    let block_stride = usize::try_from(blocks).expect("blocks must be non-negative") / max_threads;
14141    let block_remainder =
14142        usize::try_from(blocks).expect("blocks must be non-negative") % max_threads;
14143    let r_usize = usize::try_from(r).expect("r must be non-negative");
14144
14145    for thread in 0..max_threads {
14146        let block_size = block_stride + usize::from(thread < block_remainder);
14147        let block_start = block_stride * thread + thread.min(block_remainder);
14148        unbwt_decode(
14149            &mut u[r_usize * block_start..],
14150            p,
14151            n,
14152            r,
14153            &i[block_start..],
14154            bucket2,
14155            fastbits,
14156            block_size as FastSint,
14157            if thread + 1 < max_threads {
14158                r_usize
14159            } else {
14160                remainder
14161            },
14162        );
14163    }
14164    u[usize::try_from(n).expect("n must be non-negative") - 1] = lastc;
14165}
14166
14167/// Internal helper: unbwt core.
14168#[doc(hidden)]
14169pub fn unbwt_core(
14170    t: &[u8],
14171    u: &mut [u8],
14172    p: &mut [SaUint],
14173    n: SaSint,
14174    freq: Option<&[SaSint]>,
14175    r: SaSint,
14176    i: &[SaUint],
14177    bucket2: &mut [SaUint],
14178    fastbits: &mut [u16],
14179    buckets: Option<&mut [SaUint]>,
14180    threads: SaSint,
14181) -> SaSint {
14182    if threads > 1 && n >= 262_144 {
14183        unbwt_init_parallel(t, p, n, freq, i, bucket2, fastbits, buckets, threads);
14184    } else {
14185        unbwt_init_single(t, p, n, freq, i, bucket2, fastbits);
14186    }
14187
14188    unbwt_decode_omp(t, u, p, n, r, i, bucket2, fastbits, threads);
14189    0
14190}
14191
14192/// Internal helper: unbwt main.
14193#[doc(hidden)]
14194pub fn unbwt_main(
14195    t: &[u8],
14196    u: &mut [u8],
14197    p: &mut [SaUint],
14198    n: SaSint,
14199    freq: Option<&[SaSint]>,
14200    r: SaSint,
14201    i: &[SaUint],
14202    threads: SaSint,
14203) -> SaSint {
14204    let mut shift = 0usize;
14205    while (usize::try_from(n).expect("n must be non-negative") >> shift)
14206        > (1usize << UNBWT_FASTBITS)
14207    {
14208        shift += 1;
14209    }
14210
14211    let mut bucket2 = vec![0u64; ALPHABET_SIZE * ALPHABET_SIZE];
14212    let mut fastbits =
14213        vec![0u16; 1 + (usize::try_from(n).expect("n must be non-negative") >> shift)];
14214    let mut buckets = if threads > 1 && n >= 262_144 {
14215        Some(vec![
14216            0u64;
14217            usize::try_from(threads)
14218                .expect("threads must be non-negative")
14219                * (ALPHABET_SIZE + ALPHABET_SIZE * ALPHABET_SIZE)
14220        ])
14221    } else {
14222        None
14223    };
14224
14225    unbwt_core(
14226        t,
14227        u,
14228        p,
14229        n,
14230        freq,
14231        r,
14232        i,
14233        &mut bucket2,
14234        &mut fastbits,
14235        buckets.as_deref_mut(),
14236        threads,
14237    )
14238}
14239
14240/// Internal helper: unbwt main ctx.
14241#[doc(hidden)]
14242pub fn unbwt_main_ctx(
14243    ctx: &mut UnbwtContext,
14244    t: &[u8],
14245    u: &mut [u8],
14246    p: &mut [SaUint],
14247    n: SaSint,
14248    freq: Option<&[SaSint]>,
14249    r: SaSint,
14250    i: &[SaUint],
14251) -> SaSint {
14252    if ctx.threads <= 0 {
14253        return -2;
14254    }
14255    let mut shift = 0usize;
14256    while (usize::try_from(n).expect("n must be non-negative") >> shift)
14257        > (1usize << UNBWT_FASTBITS)
14258    {
14259        shift += 1;
14260    }
14261    let required_fastbits = 1 + (usize::try_from(n).expect("n must be non-negative") >> shift);
14262    if ctx.bucket2.len() < ALPHABET_SIZE * ALPHABET_SIZE
14263        || ctx.fastbits.len() < required_fastbits
14264        || (ctx.threads > 1 && ctx.buckets.is_none())
14265    {
14266        return -2;
14267    }
14268
14269    unbwt_core(
14270        t,
14271        u,
14272        p,
14273        n,
14274        freq,
14275        r,
14276        i,
14277        &mut ctx.bucket2,
14278        &mut ctx.fastbits,
14279        ctx.buckets.as_deref_mut(),
14280        ctx.threads as SaSint,
14281    )
14282}
14283
14284/// Reconstructs the original string from a given BWT and primary index.
14285///
14286/// - `t` (`[0..n-1]`): the input string.
14287/// - `u` (`[0..n-1]`): the output string (can alias `t`).
14288/// - `a` (`[0..n]`): the temporary array (must have length `n + 1`).
14289/// - `freq` (`[0..255]`): optional input symbol frequency table.
14290/// - `i`: the primary index.
14291///
14292/// Returns 0 on success, -1 or -2 on error.
14293pub fn libsais64_unbwt(
14294    t: &[u8],
14295    u: &mut [u8],
14296    a: &mut [SaSint],
14297    freq: Option<&[SaSint]>,
14298    i: SaSint,
14299) -> SaSint {
14300    libsais64_unbwt_aux(
14301        t,
14302        u,
14303        a,
14304        freq,
14305        SaSint::try_from(t.len()).expect("input length must fit SaSint"),
14306        &[i],
14307    )
14308}
14309
14310/// Reconstructs the original string from a given BWT and primary index using a libsais64 reverse-BWT context.
14311///
14312/// - `ctx`: the libsais64 reverse-BWT context.
14313/// - `t` (`[0..n-1]`): the input string.
14314/// - `u` (`[0..n-1]`): the output string (can alias `t`).
14315/// - `a` (`[0..n]`): the temporary array (must have length `n + 1`).
14316/// - `freq` (`[0..255]`): optional input symbol frequency table.
14317/// - `i`: the primary index.
14318///
14319/// Returns 0 on success, -1 or -2 on error.
14320pub fn libsais64_unbwt_ctx(
14321    ctx: &mut UnbwtContext,
14322    t: &[u8],
14323    u: &mut [u8],
14324    a: &mut [SaSint],
14325    freq: Option<&[SaSint]>,
14326    i: SaSint,
14327) -> SaSint {
14328    libsais64_unbwt_aux_ctx(
14329        ctx,
14330        t,
14331        u,
14332        a,
14333        freq,
14334        SaSint::try_from(t.len()).expect("input length must fit SaSint"),
14335        &[i],
14336    )
14337}
14338
14339/// Reconstructs the original string from a given BWT with auxiliary indexes.
14340///
14341/// - `t` (`[0..n-1]`): the input string.
14342/// - `u` (`[0..n-1]`): the output string (can alias `t`).
14343/// - `a` (`[0..n]`): the temporary array (must have length `n + 1`).
14344/// - `freq` (`[0..255]`): optional input symbol frequency table.
14345/// - `r`: sampling rate for the auxiliary indexes (must be a power of two).
14346/// - `i` (`[0..(n-1)/r]`): input auxiliary indexes.
14347///
14348/// Returns 0 on success, -1 or -2 on error.
14349pub fn libsais64_unbwt_aux(
14350    t: &[u8],
14351    u: &mut [u8],
14352    a: &mut [SaSint],
14353    freq: Option<&[SaSint]>,
14354    r: SaSint,
14355    i: &[SaSint],
14356) -> SaSint {
14357    let t_len = t.len();
14358    let n = SaSint::try_from(t_len).expect("input length must fit SaSint");
14359    if u.len() < t_len
14360        || a.len() < t_len
14361        || freq.is_some_and(|freq| freq.len() < ALPHABET_SIZE)
14362        || (r != n && (r < 2 || (r & (r - 1)) != 0))
14363    {
14364        return -1;
14365    }
14366    let sample_count = if n == 0 {
14367        1
14368    } else {
14369        ((n - 1) / r + 1) as usize
14370    };
14371    if i.len() < sample_count {
14372        return -1;
14373    }
14374
14375    if n <= 1 {
14376        if i[0] != n {
14377            return -1;
14378        }
14379        if n == 1 {
14380            u[0] = t[0];
14381        }
14382        return 0;
14383    }
14384
14385    for t in 0..sample_count {
14386        let sample = i[t];
14387        if sample <= 0 || sample > n {
14388            return -1;
14389        }
14390    }
14391
14392    let i_uint: Vec<SaUint> = i
14393        .iter()
14394        .take(sample_count)
14395        .map(|&sample| SaUint::try_from(sample).expect("sample was validated positive"))
14396        .collect();
14397    let mut p = vec![0u64; t_len + 1];
14398    let result = unbwt_main(t, u, &mut p, n, freq, r, &i_uint, 1);
14399    for t in 0..t_len {
14400        a[t] = p[t] as SaSint;
14401    }
14402    result
14403}
14404
14405/// Reconstructs the original string from a given BWT with auxiliary indexes using a libsais64 reverse-BWT context.
14406///
14407/// - `ctx`: the libsais64 reverse-BWT context.
14408/// - `t` (`[0..n-1]`): the input string.
14409/// - `u` (`[0..n-1]`): the output string (can alias `t`).
14410/// - `a` (`[0..n]`): the temporary array (must have length `n + 1`).
14411/// - `freq` (`[0..255]`): optional input symbol frequency table.
14412/// - `r`: sampling rate for the auxiliary indexes (must be a power of two).
14413/// - `i` (`[0..(n-1)/r]`): input auxiliary indexes.
14414///
14415/// Returns 0 on success, -1 or -2 on error.
14416pub fn libsais64_unbwt_aux_ctx(
14417    ctx: &mut UnbwtContext,
14418    t: &[u8],
14419    u: &mut [u8],
14420    a: &mut [SaSint],
14421    freq: Option<&[SaSint]>,
14422    r: SaSint,
14423    i: &[SaSint],
14424) -> SaSint {
14425    let t_len = t.len();
14426    let n = SaSint::try_from(t_len).expect("input length must fit SaSint");
14427    if u.len() < t_len
14428        || a.len() < t_len
14429        || freq.is_some_and(|freq| freq.len() < ALPHABET_SIZE)
14430        || (r != n && (r < 2 || (r & (r - 1)) != 0))
14431    {
14432        return -1;
14433    }
14434    let sample_count = if n == 0 {
14435        1
14436    } else {
14437        ((n - 1) / r + 1) as usize
14438    };
14439    if i.len() < sample_count {
14440        return -1;
14441    }
14442
14443    if n <= 1 {
14444        if i[0] != n {
14445            return -1;
14446        }
14447        if n == 1 {
14448            u[0] = t[0];
14449        }
14450        return 0;
14451    }
14452
14453    for t in 0..sample_count {
14454        let sample = i[t];
14455        if sample <= 0 || sample > n {
14456            return -1;
14457        }
14458    }
14459
14460    let i_uint: Vec<SaUint> = i
14461        .iter()
14462        .take(sample_count)
14463        .map(|&sample| SaUint::try_from(sample).expect("sample was validated positive"))
14464        .collect();
14465    let mut p = vec![0u64; t_len + 1];
14466    let result = unbwt_main_ctx(ctx, t, u, &mut p, n, freq, r, &i_uint);
14467    for t in 0..t_len {
14468        a[t] = p[t] as SaSint;
14469    }
14470    result
14471}
14472
14473/// Creates the libsais64 reverse-BWT context for parallel `libsais64_unbwt_*` operations using OpenMP-style threading.
14474///
14475/// In multi-threaded environments, use one context per thread for parallel executions.
14476///
14477/// - `threads`: number of worker threads (can be 0 for the implementation default).
14478///
14479/// Returns the context, or `None` on allocation failure.
14480pub fn unbwt_create_ctx_omp(threads: SaSint) -> Option<UnbwtContext> {
14481    if threads < 0 {
14482        return None;
14483    }
14484    unbwt_create_ctx_main(normalize_omp_threads(threads))
14485}
14486
14487/// Reconstructs the original string from a given BWT and primary index in parallel using OpenMP-style threading.
14488///
14489/// - `t` (`[0..n-1]`): the input string.
14490/// - `u` (`[0..n-1]`): the output string (can alias `t`).
14491/// - `a` (`[0..n]`): the temporary array (must have length `n + 1`).
14492/// - `freq` (`[0..255]`): optional input symbol frequency table.
14493/// - `i`: the primary index.
14494/// - `threads`: number of worker threads (can be 0 for the implementation default).
14495///
14496/// Returns 0 on success, -1 or -2 on error.
14497pub fn libsais64_unbwt_omp(
14498    t: &[u8],
14499    u: &mut [u8],
14500    a: &mut [SaSint],
14501    freq: Option<&[SaSint]>,
14502    i: SaSint,
14503    threads: SaSint,
14504) -> SaSint {
14505    libsais64_unbwt_aux_omp(
14506        t,
14507        u,
14508        a,
14509        freq,
14510        SaSint::try_from(t.len()).expect("input length must fit SaSint"),
14511        &[i],
14512        threads,
14513    )
14514}
14515
14516/// Reconstructs the original string from a given BWT with auxiliary indexes in parallel using OpenMP-style threading.
14517///
14518/// - `t` (`[0..n-1]`): the input string.
14519/// - `u` (`[0..n-1]`): the output string (can alias `t`).
14520/// - `a` (`[0..n]`): the temporary array (must have length `n + 1`).
14521/// - `freq` (`[0..255]`): optional input symbol frequency table.
14522/// - `r`: sampling rate for the auxiliary indexes (must be a power of two).
14523/// - `i` (`[0..(n-1)/r]`): input auxiliary indexes.
14524/// - `threads`: number of worker threads (can be 0 for the implementation default).
14525///
14526/// Returns 0 on success, -1 or -2 on error.
14527pub fn libsais64_unbwt_aux_omp(
14528    t: &[u8],
14529    u: &mut [u8],
14530    a: &mut [SaSint],
14531    freq: Option<&[SaSint]>,
14532    r: SaSint,
14533    i: &[SaSint],
14534    threads: SaSint,
14535) -> SaSint {
14536    let t_len = t.len();
14537    let n = SaSint::try_from(t_len).expect("input length must fit SaSint");
14538    if threads < 0
14539        || u.len() < t_len
14540        || a.len() < t_len
14541        || freq.is_some_and(|freq| freq.len() < ALPHABET_SIZE)
14542        || (r != n && (r < 2 || (r & (r - 1)) != 0))
14543    {
14544        return -1;
14545    }
14546    let sample_count = if n == 0 {
14547        1
14548    } else {
14549        ((n - 1) / r + 1) as usize
14550    };
14551    if i.len() < sample_count {
14552        return -1;
14553    }
14554
14555    if n <= 1 {
14556        if i[0] != n {
14557            return -1;
14558        }
14559        if n == 1 {
14560            u[0] = t[0];
14561        }
14562        return 0;
14563    }
14564
14565    for sample in i.iter().take(sample_count) {
14566        let sample = *sample;
14567        if sample <= 0 || sample > n {
14568            return -1;
14569        }
14570    }
14571
14572    let threads = if threads > 0 { threads } else { 1 };
14573    let i_uint: Vec<SaUint> = i
14574        .iter()
14575        .take(sample_count)
14576        .map(|&sample| SaUint::try_from(sample).expect("sample was validated positive"))
14577        .collect();
14578    let mut p = vec![0u64; t_len + 1];
14579    let result = unbwt_main(t, u, &mut p, n, freq, r, &i_uint, threads);
14580    for idx in 0..t_len {
14581        a[idx] = p[idx] as SaSint;
14582    }
14583    result
14584}
14585
14586/// Internal helper: bwt copy 8u.
14587#[doc(hidden)]
14588pub fn bwt_copy_8u(u: &mut [u8], a: &[SaSint], n: SaSint) {
14589    if n <= 0 {
14590        return;
14591    }
14592
14593    let n_usize = usize::try_from(n).expect("n must be non-negative");
14594    for i in 0..n_usize {
14595        u[i] = a[i] as u8;
14596    }
14597}
14598
14599/// Internal helper: bwt copy 8u (OpenMP variant).
14600#[doc(hidden)]
14601pub fn bwt_copy_8u_omp(u: &mut [u8], a: &[SaSint], n: SaSint, threads: SaSint) {
14602    if threads == 1 || n < 65_536 {
14603        bwt_copy_8u(u, a, n);
14604        return;
14605    }
14606
14607    let n_usize = usize::try_from(n).expect("n must be non-negative");
14608    assert!(u.len() >= n_usize);
14609    assert!(a.len() >= n_usize);
14610    let threads_usize = usize::try_from(threads).expect("threads must be non-negative");
14611    let chunk_size = ((n_usize / threads_usize) & !15usize).max(16);
14612    let a_ptr = a.as_ptr() as usize;
14613    run_rayon_with_threads(threads_usize, || {
14614        u[..n_usize]
14615            .par_chunks_mut(chunk_size)
14616            .enumerate()
14617            .for_each(|(chunk_index, chunk)| {
14618                let start = chunk_index * chunk_size;
14619                let dst_ptr = chunk.as_mut_ptr();
14620                let src_ptr = unsafe { (a_ptr as *const SaSint).add(start) };
14621                for offset in 0..chunk.len() {
14622                    unsafe {
14623                        *dst_ptr.add(offset) = *src_ptr.add(offset) as u8;
14624                    }
14625                }
14626            });
14627    });
14628}
14629
14630/// Internal helper: accumulate counts s32 2.
14631#[doc(hidden)]
14632pub fn accumulate_counts_s32_2(bucket00: &mut [SaSint], bucket01: &[SaSint]) {
14633    assert_eq!(bucket00.len(), bucket01.len());
14634    for (dst, src) in bucket00.iter_mut().zip(bucket01.iter()) {
14635        *dst += *src;
14636    }
14637}
14638
14639/// Internal helper: accumulate counts s32 3.
14640#[doc(hidden)]
14641pub fn accumulate_counts_s32_3(bucket00: &mut [SaSint], bucket01: &[SaSint], bucket02: &[SaSint]) {
14642    assert_eq!(bucket00.len(), bucket01.len());
14643    assert_eq!(bucket00.len(), bucket02.len());
14644    for ((dst, src1), src2) in bucket00
14645        .iter_mut()
14646        .zip(bucket01.iter())
14647        .zip(bucket02.iter())
14648    {
14649        *dst += *src1 + *src2;
14650    }
14651}
14652
14653/// Internal helper: accumulate counts s32 4.
14654#[doc(hidden)]
14655pub fn accumulate_counts_s32_4(
14656    bucket00: &mut [SaSint],
14657    bucket01: &[SaSint],
14658    bucket02: &[SaSint],
14659    bucket03: &[SaSint],
14660) {
14661    assert_eq!(bucket00.len(), bucket01.len());
14662    assert_eq!(bucket00.len(), bucket02.len());
14663    assert_eq!(bucket00.len(), bucket03.len());
14664    for (((dst, src1), src2), src3) in bucket00
14665        .iter_mut()
14666        .zip(bucket01.iter())
14667        .zip(bucket02.iter())
14668        .zip(bucket03.iter())
14669    {
14670        *dst += *src1 + *src2 + *src3;
14671    }
14672}
14673
14674/// Internal helper: accumulate counts s32 5.
14675#[doc(hidden)]
14676pub fn accumulate_counts_s32_5(
14677    bucket00: &mut [SaSint],
14678    bucket01: &[SaSint],
14679    bucket02: &[SaSint],
14680    bucket03: &[SaSint],
14681    bucket04: &[SaSint],
14682) {
14683    assert_eq!(bucket00.len(), bucket01.len());
14684    assert_eq!(bucket00.len(), bucket02.len());
14685    assert_eq!(bucket00.len(), bucket03.len());
14686    assert_eq!(bucket00.len(), bucket04.len());
14687    for ((((dst, src1), src2), src3), src4) in bucket00
14688        .iter_mut()
14689        .zip(bucket01.iter())
14690        .zip(bucket02.iter())
14691        .zip(bucket03.iter())
14692        .zip(bucket04.iter())
14693    {
14694        *dst += *src1 + *src2 + *src3 + *src4;
14695    }
14696}
14697
14698/// Internal helper: accumulate counts s32 6.
14699#[doc(hidden)]
14700pub fn accumulate_counts_s32_6(
14701    bucket00: &mut [SaSint],
14702    bucket01: &[SaSint],
14703    bucket02: &[SaSint],
14704    bucket03: &[SaSint],
14705    bucket04: &[SaSint],
14706    bucket05: &[SaSint],
14707) {
14708    assert_eq!(bucket00.len(), bucket01.len());
14709    assert_eq!(bucket00.len(), bucket02.len());
14710    assert_eq!(bucket00.len(), bucket03.len());
14711    assert_eq!(bucket00.len(), bucket04.len());
14712    assert_eq!(bucket00.len(), bucket05.len());
14713    for (((((dst, src1), src2), src3), src4), src5) in bucket00
14714        .iter_mut()
14715        .zip(bucket01.iter())
14716        .zip(bucket02.iter())
14717        .zip(bucket03.iter())
14718        .zip(bucket04.iter())
14719        .zip(bucket05.iter())
14720    {
14721        *dst += *src1 + *src2 + *src3 + *src4 + *src5;
14722    }
14723}
14724
14725/// Internal helper: accumulate counts s32 7.
14726#[doc(hidden)]
14727pub fn accumulate_counts_s32_7(
14728    bucket00: &mut [SaSint],
14729    bucket01: &[SaSint],
14730    bucket02: &[SaSint],
14731    bucket03: &[SaSint],
14732    bucket04: &[SaSint],
14733    bucket05: &[SaSint],
14734    bucket06: &[SaSint],
14735) {
14736    assert_eq!(bucket00.len(), bucket01.len());
14737    assert_eq!(bucket00.len(), bucket02.len());
14738    assert_eq!(bucket00.len(), bucket03.len());
14739    assert_eq!(bucket00.len(), bucket04.len());
14740    assert_eq!(bucket00.len(), bucket05.len());
14741    assert_eq!(bucket00.len(), bucket06.len());
14742    for ((((((dst, src1), src2), src3), src4), src5), src6) in bucket00
14743        .iter_mut()
14744        .zip(bucket01.iter())
14745        .zip(bucket02.iter())
14746        .zip(bucket03.iter())
14747        .zip(bucket04.iter())
14748        .zip(bucket05.iter())
14749        .zip(bucket06.iter())
14750    {
14751        *dst += *src1 + *src2 + *src3 + *src4 + *src5 + *src6;
14752    }
14753}
14754
14755/// Internal helper: accumulate counts s32 8.
14756#[doc(hidden)]
14757pub fn accumulate_counts_s32_8(
14758    bucket00: &mut [SaSint],
14759    bucket01: &[SaSint],
14760    bucket02: &[SaSint],
14761    bucket03: &[SaSint],
14762    bucket04: &[SaSint],
14763    bucket05: &[SaSint],
14764    bucket06: &[SaSint],
14765    bucket07: &[SaSint],
14766) {
14767    assert_eq!(bucket00.len(), bucket01.len());
14768    assert_eq!(bucket00.len(), bucket02.len());
14769    assert_eq!(bucket00.len(), bucket03.len());
14770    assert_eq!(bucket00.len(), bucket04.len());
14771    assert_eq!(bucket00.len(), bucket05.len());
14772    assert_eq!(bucket00.len(), bucket06.len());
14773    assert_eq!(bucket00.len(), bucket07.len());
14774    for (((((((dst, src1), src2), src3), src4), src5), src6), src7) in bucket00
14775        .iter_mut()
14776        .zip(bucket01.iter())
14777        .zip(bucket02.iter())
14778        .zip(bucket03.iter())
14779        .zip(bucket04.iter())
14780        .zip(bucket05.iter())
14781        .zip(bucket06.iter())
14782        .zip(bucket07.iter())
14783    {
14784        *dst += *src1 + *src2 + *src3 + *src4 + *src5 + *src6 + *src7;
14785    }
14786}
14787
14788/// Internal helper: accumulate counts s32 9.
14789#[doc(hidden)]
14790pub fn accumulate_counts_s32_9(
14791    bucket00: &mut [SaSint],
14792    bucket01: &[SaSint],
14793    bucket02: &[SaSint],
14794    bucket03: &[SaSint],
14795    bucket04: &[SaSint],
14796    bucket05: &[SaSint],
14797    bucket06: &[SaSint],
14798    bucket07: &[SaSint],
14799    bucket08: &[SaSint],
14800) {
14801    assert_eq!(bucket00.len(), bucket01.len());
14802    assert_eq!(bucket00.len(), bucket02.len());
14803    assert_eq!(bucket00.len(), bucket03.len());
14804    assert_eq!(bucket00.len(), bucket04.len());
14805    assert_eq!(bucket00.len(), bucket05.len());
14806    assert_eq!(bucket00.len(), bucket06.len());
14807    assert_eq!(bucket00.len(), bucket07.len());
14808    assert_eq!(bucket00.len(), bucket08.len());
14809    for ((((((((dst, src1), src2), src3), src4), src5), src6), src7), src8) in bucket00
14810        .iter_mut()
14811        .zip(bucket01.iter())
14812        .zip(bucket02.iter())
14813        .zip(bucket03.iter())
14814        .zip(bucket04.iter())
14815        .zip(bucket05.iter())
14816        .zip(bucket06.iter())
14817        .zip(bucket07.iter())
14818        .zip(bucket08.iter())
14819    {
14820        *dst += *src1 + *src2 + *src3 + *src4 + *src5 + *src6 + *src7 + *src8;
14821    }
14822}
14823
14824/// Internal helper: accumulate counts s32.
14825#[doc(hidden)]
14826pub fn accumulate_counts_s32(
14827    buckets: &mut [SaSint],
14828    bucket_size: FastSint,
14829    bucket_stride: FastSint,
14830    mut num_buckets: FastSint,
14831) {
14832    if num_buckets <= 1 {
14833        return;
14834    }
14835
14836    let bucket_size = usize::try_from(bucket_size).expect("bucket_size must be non-negative");
14837    let bucket_stride = usize::try_from(bucket_stride).expect("bucket_stride must be non-negative");
14838    let num_buckets_usize = usize::try_from(num_buckets).expect("num_buckets must be non-negative");
14839    assert!(buckets.len() >= bucket_size + (num_buckets_usize - 1) * bucket_stride);
14840    let bucket00_start = (num_buckets_usize - 1) * bucket_stride;
14841
14842    while num_buckets >= 9 {
14843        let start = bucket00_start
14844            - usize::try_from(num_buckets - 9).expect("non-negative") * bucket_stride;
14845        accumulate_counts_at(buckets, start, bucket_size, bucket_stride, 9);
14846        num_buckets -= 8;
14847    }
14848
14849    match num_buckets {
14850        1 => {}
14851        2..=8 => accumulate_counts_at(
14852            buckets,
14853            bucket00_start,
14854            bucket_size,
14855            bucket_stride,
14856            usize::try_from(num_buckets).expect("non-negative"),
14857        ),
14858        _ => {}
14859    }
14860}
14861
14862fn block_slice<T>(slice: &[T], block_start: FastSint, block_size: FastSint) -> &[T] {
14863    let start = usize::try_from(block_start).expect("block_start must be non-negative");
14864    let len = usize::try_from(block_size).expect("block_size must be non-negative");
14865    &slice[start..start + len]
14866}
14867
14868#[allow(dead_code)]
14869struct SharedMutArray<'a> {
14870    ptr: *mut SaSint,
14871    len: usize,
14872    _marker: PhantomData<&'a mut [SaSint]>,
14873}
14874
14875#[allow(dead_code)]
14876impl<'a> SharedMutArray<'a> {
14877    fn new(slice: &'a mut [SaSint]) -> Self {
14878        Self {
14879            ptr: slice.as_mut_ptr(),
14880            len: slice.len(),
14881            _marker: PhantomData,
14882        }
14883    }
14884
14885    fn len(&self) -> usize {
14886        self.len
14887    }
14888
14889    fn slice_mut(&mut self, start: usize, len: usize) -> &mut [SaSint] {
14890        assert!(start <= self.len);
14891        assert!(len <= self.len - start);
14892        unsafe {
14893            // The recursive driver aliases multiple logical views into one SA backing store.
14894            // This helper centralizes that checked projection so the driver can be translated
14895            // without pretending those regions are independent Rust slices.
14896            std::slice::from_raw_parts_mut(self.ptr.add(start), len)
14897        }
14898    }
14899}
14900
14901fn accumulate_counts_at(
14902    buckets: &mut [SaSint],
14903    bucket00_start: usize,
14904    bucket_size: usize,
14905    bucket_stride: usize,
14906    count: usize,
14907) {
14908    assert!((2..=9).contains(&count));
14909    assert!(bucket00_start >= (count - 1) * bucket_stride);
14910
14911    let dst_end = bucket00_start + bucket_size;
14912    let mut sums = vec![0; bucket_size];
14913
14914    for i in 0..count {
14915        let start = bucket00_start - i * bucket_stride;
14916        let end = start + bucket_size;
14917        for (sum, value) in sums.iter_mut().zip(buckets[start..end].iter()) {
14918            *sum += *value;
14919        }
14920    }
14921
14922    buckets[bucket00_start..dst_end].copy_from_slice(&sums);
14923}
14924
14925/// Internal helper: thread state size.
14926#[doc(hidden)]
14927pub fn thread_state_size() -> usize {
14928    mem::size_of::<ThreadState>()
14929}
14930
14931#[cfg(all(test, feature = "upstream-c"))]
14932mod tests {
14933    use super::*;
14934
14935    unsafe extern "C" {
14936        fn probe_public_libsais64(t: *const u8, sa: *mut SaSint, n: SaSint, fs: SaSint) -> SaSint;
14937        fn probe_public_libsais64_freq(
14938            t: *const u8,
14939            sa: *mut SaSint,
14940            n: SaSint,
14941            fs: SaSint,
14942            freq: *mut SaSint,
14943        ) -> SaSint;
14944        fn probe_public_libsais64_gsa(
14945            t: *const u8,
14946            sa: *mut SaSint,
14947            n: SaSint,
14948            fs: SaSint,
14949        ) -> SaSint;
14950        fn probe_public_libsais64_gsa_freq(
14951            t: *const u8,
14952            sa: *mut SaSint,
14953            n: SaSint,
14954            fs: SaSint,
14955            freq: *mut SaSint,
14956        ) -> SaSint;
14957        fn probe_public_libsais64_long(
14958            t: *mut SaSint,
14959            sa: *mut SaSint,
14960            n: SaSint,
14961            k: SaSint,
14962            fs: SaSint,
14963        ) -> SaSint;
14964        fn probe_public_libsais64_bwt(
14965            t: *const u8,
14966            u: *mut u8,
14967            a: *mut SaSint,
14968            n: SaSint,
14969            fs: SaSint,
14970        ) -> SaSint;
14971        fn probe_public_libsais64_bwt_freq(
14972            t: *const u8,
14973            u: *mut u8,
14974            a: *mut SaSint,
14975            n: SaSint,
14976            fs: SaSint,
14977            freq: *mut SaSint,
14978        ) -> SaSint;
14979        fn probe_public_libsais64_bwt_aux(
14980            t: *const u8,
14981            u: *mut u8,
14982            a: *mut SaSint,
14983            n: SaSint,
14984            fs: SaSint,
14985            r: SaSint,
14986            i: *mut SaSint,
14987        ) -> SaSint;
14988        fn probe_public_libsais64_bwt_aux_freq(
14989            t: *const u8,
14990            u: *mut u8,
14991            a: *mut SaSint,
14992            n: SaSint,
14993            fs: SaSint,
14994            freq: *mut SaSint,
14995            r: SaSint,
14996            i: *mut SaSint,
14997        ) -> SaSint;
14998        fn probe_public_libsais64_unbwt(
14999            t: *const u8,
15000            u: *mut u8,
15001            a: *mut SaSint,
15002            n: SaSint,
15003            i: SaSint,
15004        ) -> SaSint;
15005        fn probe_public_libsais64_unbwt_freq(
15006            t: *const u8,
15007            u: *mut u8,
15008            a: *mut SaSint,
15009            n: SaSint,
15010            freq: *const SaSint,
15011            i: SaSint,
15012        ) -> SaSint;
15013        fn probe_public_libsais64_unbwt_aux(
15014            t: *const u8,
15015            u: *mut u8,
15016            a: *mut SaSint,
15017            n: SaSint,
15018            r: SaSint,
15019            i: *const SaSint,
15020        ) -> SaSint;
15021        fn probe_public_libsais64_unbwt_aux_freq(
15022            t: *const u8,
15023            u: *mut u8,
15024            a: *mut SaSint,
15025            n: SaSint,
15026            freq: *const SaSint,
15027            r: SaSint,
15028            i: *const SaSint,
15029        ) -> SaSint;
15030        fn probe_public_libsais64_plcp(
15031            t: *const u8,
15032            sa: *const SaSint,
15033            plcp: *mut SaSint,
15034            n: SaSint,
15035        ) -> SaSint;
15036        fn probe_public_libsais64_plcp_gsa(
15037            t: *const u8,
15038            sa: *const SaSint,
15039            plcp: *mut SaSint,
15040            n: SaSint,
15041        ) -> SaSint;
15042        fn probe_public_libsais64_lcp(
15043            plcp: *const SaSint,
15044            sa: *const SaSint,
15045            lcp: *mut SaSint,
15046            n: SaSint,
15047        ) -> SaSint;
15048        fn probe_libsais64_renumber_lms_suffixes_8u(
15049            sa: *mut SaSint,
15050            m: SaSint,
15051            name: SaSint,
15052            omp_block_start: FastSint,
15053            omp_block_size: FastSint,
15054        ) -> SaSint;
15055        fn probe_libsais64_gather_marked_lms_suffixes(
15056            sa: *mut SaSint,
15057            m: SaSint,
15058            l: FastSint,
15059            omp_block_start: FastSint,
15060            omp_block_size: FastSint,
15061        ) -> FastSint;
15062        fn probe_libsais64_renumber_and_gather_lms_suffixes_omp(
15063            sa: *mut SaSint,
15064            n: SaSint,
15065            m: SaSint,
15066            fs: SaSint,
15067            threads: SaSint,
15068        ) -> SaSint;
15069        fn probe_libsais64_renumber_distinct_lms_suffixes_32s_4k(
15070            sa: *mut SaSint,
15071            m: SaSint,
15072            name: SaSint,
15073            omp_block_start: FastSint,
15074            omp_block_size: FastSint,
15075        ) -> SaSint;
15076        fn probe_libsais64_renumber_and_mark_distinct_lms_suffixes_32s_4k_omp(
15077            sa: *mut SaSint,
15078            n: SaSint,
15079            m: SaSint,
15080            threads: SaSint,
15081        ) -> SaSint;
15082        fn probe_libsais64_renumber_unique_and_nonunique_lms_suffixes_32s(
15083            t: *mut SaSint,
15084            sa: *mut SaSint,
15085            m: SaSint,
15086            f: SaSint,
15087            omp_block_start: FastSint,
15088            omp_block_size: FastSint,
15089        ) -> SaSint;
15090        fn probe_libsais64_renumber_unique_and_nonunique_lms_suffixes_32s_omp(
15091            t: *mut SaSint,
15092            sa: *mut SaSint,
15093            m: SaSint,
15094            threads: SaSint,
15095        ) -> SaSint;
15096    }
15097
15098    #[test]
15099    fn libsais64_align_up_matches_power_of_two_alignment() {
15100        assert_eq!(align_up(0, 4096), 0);
15101        assert_eq!(align_up(1, 4096), 4096);
15102        assert_eq!(align_up(4095, 4096), 4096);
15103        assert_eq!(align_up(4096, 4096), 4096);
15104        assert_eq!(align_up(4097, 4096), 8192);
15105        assert_eq!(align_up(65, 64), 128);
15106    }
15107
15108    #[test]
15109    fn libsais64_shared_mut_array_projects_mutable_spans_from_one_backing_buffer() {
15110        let mut backing = vec![1, 2, 3, 4, 5, 6];
15111        let len;
15112        {
15113            let mut shared = SharedMutArray::new(&mut backing);
15114            shared.slice_mut(1, 3).copy_from_slice(&[20, 30, 40]);
15115            shared.slice_mut(4, 2).copy_from_slice(&[50, 60]);
15116            len = shared.len();
15117        }
15118        assert_eq!(backing, vec![1, 20, 30, 40, 50, 60]);
15119        assert_eq!(len, 6);
15120    }
15121
15122    #[test]
15123    fn libsais64_create_ctx_main_matches_single_thread_layout() {
15124        let ctx = create_ctx_main(1).expect("context");
15125        assert_eq!(ctx.buckets.len(), 8 * ALPHABET_SIZE);
15126        assert_eq!(ctx.threads, 1);
15127        assert!(ctx.thread_state.is_none());
15128    }
15129
15130    #[test]
15131    fn libsais64_create_ctx_main_allocates_thread_state_for_multi_threaded_mode() {
15132        let ctx = create_ctx_main(3).expect("context");
15133        let states = ctx.thread_state.expect("thread state");
15134        assert_eq!(states.len(), 3);
15135        assert!(states
15136            .iter()
15137            .all(|state| state.buckets.len() == 4 * ALPHABET_SIZE));
15138        assert!(states
15139            .iter()
15140            .all(|state| state.cache.len() == LIBSAIS_PER_THREAD_CACHE_SIZE));
15141    }
15142
15143    #[test]
15144    fn libsais64_create_ctx_wraps_single_thread_main_context() {
15145        let ctx = create_ctx().expect("context");
15146        assert_eq!(ctx.threads, 1);
15147        assert_eq!(ctx.buckets.len(), 8 * ALPHABET_SIZE);
15148        assert!(ctx.thread_state.is_none());
15149    }
15150
15151    #[test]
15152    fn libsais64_free_ctx_accepts_context_value() {
15153        let ctx = create_ctx().expect("context");
15154        free_ctx(ctx);
15155    }
15156
15157    #[test]
15158    fn libsais64_unbwt_create_ctx_main_allocates_expected_buffers() {
15159        let ctx = unbwt_create_ctx_main(3).expect("context");
15160        assert_eq!(ctx.bucket2.len(), ALPHABET_SIZE * ALPHABET_SIZE);
15161        assert_eq!(ctx.fastbits.len(), 1 + (1 << UNBWT_FASTBITS));
15162        assert_eq!(
15163            ctx.buckets.as_ref().expect("parallel buckets").len(),
15164            3 * (ALPHABET_SIZE + ALPHABET_SIZE * ALPHABET_SIZE)
15165        );
15166        assert_eq!(ctx.threads, 3);
15167    }
15168
15169    #[test]
15170    fn libsais64_unbwt_compute_histogram_counts_bytes() {
15171        let t = b"banana";
15172        let mut count = vec![0u64; ALPHABET_SIZE];
15173        unbwt_compute_histogram(t, t.len() as FastSint, &mut count);
15174        assert_eq!(count[b'a' as usize], 3);
15175        assert_eq!(count[b'b' as usize], 1);
15176        assert_eq!(count[b'n' as usize], 2);
15177    }
15178
15179    #[test]
15180    fn libsais64_unbwt_transpose_bucket2_swaps_matrix_entries() {
15181        let mut bucket2 = vec![0u64; ALPHABET_SIZE * ALPHABET_SIZE];
15182        bucket2[(2 << 8) + 1] = 7;
15183        bucket2[(1 << 8) + 2] = 9;
15184        unbwt_transpose_bucket2(&mut bucket2);
15185        assert_eq!(bucket2[(1 << 8) + 2], 7);
15186        assert_eq!(bucket2[(2 << 8) + 1], 9);
15187    }
15188
15189    #[test]
15190    fn libsais64_unbwt_init_single_builds_monotone_fastbits_and_writes_psi() {
15191        let t = b"annb\x00aa";
15192        let mut p = vec![0u64; t.len() + 1];
15193        let mut bucket2 = vec![0u64; ALPHABET_SIZE * ALPHABET_SIZE];
15194        let mut fastbits = vec![0u16; 1 + (1 << UNBWT_FASTBITS)];
15195        let i = vec![4u64];
15196
15197        unbwt_init_single(
15198            t,
15199            &mut p,
15200            t.len() as SaSint,
15201            None,
15202            &i,
15203            &mut bucket2,
15204            &mut fastbits,
15205        );
15206
15207        assert!(fastbits
15208            .iter()
15209            .all(|&value| usize::from(value) < ALPHABET_SIZE * ALPHABET_SIZE));
15210        assert!(fastbits.iter().any(|&value| value != 0));
15211        assert!(p.iter().any(|&value| value != 0));
15212    }
15213
15214    #[test]
15215    fn libsais64_unbwt_init_parallel_currently_matches_single_initializer() {
15216        let t = b"annb\x00aa";
15217        let mut p_single = vec![0u64; t.len() + 1];
15218        let mut p_parallel = vec![0u64; t.len() + 1];
15219        let mut bucket2_single = vec![0u64; ALPHABET_SIZE * ALPHABET_SIZE];
15220        let mut bucket2_parallel = vec![0u64; ALPHABET_SIZE * ALPHABET_SIZE];
15221        let mut fastbits_single = vec![0u16; 1 + (1 << UNBWT_FASTBITS)];
15222        let mut fastbits_parallel = vec![0u16; 1 + (1 << UNBWT_FASTBITS)];
15223        let i = vec![4u64];
15224        let mut scratch = vec![0u64; 2 * (ALPHABET_SIZE + ALPHABET_SIZE * ALPHABET_SIZE)];
15225
15226        unbwt_init_single(
15227            t,
15228            &mut p_single,
15229            t.len() as SaSint,
15230            None,
15231            &i,
15232            &mut bucket2_single,
15233            &mut fastbits_single,
15234        );
15235        unbwt_init_parallel(
15236            t,
15237            &mut p_parallel,
15238            t.len() as SaSint,
15239            None,
15240            &i,
15241            &mut bucket2_parallel,
15242            &mut fastbits_parallel,
15243            Some(&mut scratch),
15244            2,
15245        );
15246
15247        assert_eq!(p_parallel, p_single);
15248        assert_eq!(bucket2_parallel, bucket2_single);
15249        assert_eq!(fastbits_parallel, fastbits_single);
15250    }
15251
15252    #[test]
15253    fn libsais64_unbwt_decode_1_writes_big_endian_symbol_words() {
15254        let mut u = vec![0u8; 4];
15255        let p = vec![1u64, 0u64];
15256        let mut bucket2 = vec![0u64; ALPHABET_SIZE * ALPHABET_SIZE];
15257        bucket2[0x1234] = 0;
15258        bucket2[0x1235] = 2;
15259        let mut fastbits = vec![0u16; 1 + (1 << UNBWT_FASTBITS)];
15260        fastbits[0] = 0x1234;
15261        let mut i0 = 0usize;
15262
15263        unbwt_decode_1(&mut u, &p, &bucket2, &fastbits, 0, &mut i0, 2);
15264
15265        assert_eq!(u, vec![0x12, 0x35, 0x12, 0x35]);
15266        assert_eq!(i0, 0);
15267    }
15268
15269    #[test]
15270    fn libsais64_unbwt_decode_dispatches_two_block_tail_shape() {
15271        let mut u = vec![0u8; 8];
15272        let p = vec![1u64, 0u64];
15273        let mut bucket2 = vec![0u64; ALPHABET_SIZE * ALPHABET_SIZE];
15274        bucket2[0x1234] = 0;
15275        bucket2[0x1235] = 2;
15276        let mut fastbits = vec![0u16; 1 + (1 << UNBWT_FASTBITS)];
15277        fastbits[0] = 0x1234;
15278        let i = vec![0u64, 0u64];
15279
15280        unbwt_decode(&mut u, &p, 4, 2, &i, &bucket2, &fastbits, 2, 2);
15281
15282        assert_eq!(u, vec![0x12, 0x35, 0x12, 0x35, 0x00, 0x00, 0x00, 0x00]);
15283    }
15284
15285    fn brute_force_suffix_array_u8(t: &[u8]) -> Vec<SaSint> {
15286        let mut sa: Vec<SaSint> = (0..t.len())
15287            .map(|index| SaSint::try_from(index).expect("index must fit SaSint"))
15288            .collect();
15289        sa.sort_by(|&lhs, &rhs| {
15290            t[usize::try_from(lhs).expect("non-negative")..]
15291                .cmp(&t[usize::try_from(rhs).expect("non-negative")..])
15292        });
15293        sa
15294    }
15295
15296    fn brute_force_plcp_u8(t: &[u8], sa: &[SaSint]) -> Vec<SaSint> {
15297        let mut rank = vec![0usize; t.len()];
15298        for (i, &suffix) in sa.iter().enumerate() {
15299            rank[usize::try_from(suffix).expect("suffix index must be non-negative")] = i;
15300        }
15301
15302        let mut plcp = vec![0; t.len()];
15303        for i in 0..t.len() {
15304            let r = rank[i];
15305            let prev = if r == 0 {
15306                t.len()
15307            } else {
15308                usize::try_from(sa[r - 1]).expect("suffix index must be non-negative")
15309            };
15310            if prev == t.len() {
15311                plcp[i] = 0;
15312                continue;
15313            }
15314
15315            let mut l = 0usize;
15316            while i + l < t.len() && prev + l < t.len() && t[i + l] == t[prev + l] {
15317                l += 1;
15318            }
15319            plcp[i] = l as SaSint;
15320        }
15321        plcp
15322    }
15323
15324    fn brute_force_lcp_from_sa_u8(t: &[u8], sa: &[SaSint]) -> Vec<SaSint> {
15325        let mut lcp = vec![0; sa.len()];
15326        for i in 0..sa.len() {
15327            let lhs = usize::try_from(sa[i]).expect("suffix index must be non-negative");
15328            let rhs = if i == 0 {
15329                sa.len()
15330            } else {
15331                usize::try_from(sa[i - 1]).expect("suffix index must be non-negative")
15332            };
15333            if rhs == sa.len() {
15334                lcp[i] = 0;
15335                continue;
15336            }
15337
15338            let mut l = 0usize;
15339            while lhs + l < t.len() && rhs + l < t.len() && t[lhs + l] == t[rhs + l] {
15340                l += 1;
15341            }
15342            lcp[i] = l as SaSint;
15343        }
15344        lcp
15345    }
15346
15347    fn make_libsais64_recursive_main_32s_text(repeats: usize) -> Vec<SaSint> {
15348        let motif = [9, 4, 9, 2, 9, 4, 9, 1];
15349        let mut t = Vec::with_capacity(repeats * motif.len() + 1);
15350        for _ in 0..repeats {
15351            t.extend_from_slice(&motif);
15352        }
15353        t.push(0);
15354        t
15355    }
15356
15357    fn make_libsais64_large_main_32s_stress_text(len: usize, alphabet: SaSint) -> Vec<SaSint> {
15358        let mut state: u32 = 0x1357_9bdf;
15359        let mut t = Vec::with_capacity(len + 1);
15360
15361        for i in 0..len {
15362            state = state.wrapping_mul(1_664_525).wrapping_add(1_013_904_223);
15363            let mut value = ((state >> 16) % (alphabet as u32 - 1)) as SaSint + 1;
15364
15365            if i % 17 < 8 {
15366                value = ((i / 17) as SaSint % 11) + 1;
15367            }
15368            if i % 29 < 10 {
15369                value = (((i / 29) as SaSint * 3) % 19) + 1;
15370            }
15371            if i % 64 >= 48 {
15372                value = t[i - 48];
15373            }
15374
15375            t.push(value);
15376        }
15377
15378        t.push(0);
15379        t
15380    }
15381
15382    fn assert_libsais64_main_32s_entry_matches_public_c_long(
15383        t: Vec<SaSint>,
15384        k: SaSint,
15385        fs: SaSint,
15386        compare_full_sa: bool,
15387    ) {
15388        let n = t.len() as SaSint;
15389        let n_usize = t.len();
15390        let threads = 1;
15391        let extra = usize::try_from(fs).expect("fs must be non-negative");
15392
15393        let mut c_t = t.clone();
15394        let mut c_sa = vec![0; t.len() + extra];
15395        let c_result =
15396            unsafe { probe_public_libsais64_long(c_t.as_mut_ptr(), c_sa.as_mut_ptr(), n, k, fs) };
15397
15398        let mut rust_t = t;
15399        let mut rust_sa = vec![0; rust_t.len() + extra];
15400        let mut thread_state = alloc_thread_state(threads).expect("thread state");
15401        let rust_result = libsais64_main_32s_entry(
15402            &mut rust_t,
15403            &mut rust_sa,
15404            n,
15405            k,
15406            fs,
15407            threads,
15408            &mut thread_state,
15409        );
15410
15411        assert_eq!(rust_result, c_result);
15412        assert_eq!(rust_t, c_t);
15413        if compare_full_sa {
15414            assert_eq!(rust_sa, c_sa);
15415        } else {
15416            assert_eq!(&rust_sa[..n_usize], &c_sa[..n_usize]);
15417        }
15418    }
15419
15420    fn assert_libsais64_main_32s_entry_matches_public_c_long_for_branch(k: SaSint) {
15421        assert_libsais64_main_32s_entry_matches_public_c_long(
15422            vec![17, 3, 17, 9, 5, 9, 2, 11, 2, 7, 1, 7, 0],
15423            k,
15424            0,
15425            true,
15426        );
15427    }
15428
15429    #[test]
15430    fn libsais64_matches_bruteforce_suffix_array_for_small_text() {
15431        let t = b"banana";
15432        let mut sa = vec![0; t.len()];
15433        let mut freq = vec![0; ALPHABET_SIZE];
15434
15435        let result = libsais64(t, &mut sa, 0, Some(&mut freq));
15436
15437        assert_eq!(result, 0);
15438        assert_eq!(sa, brute_force_suffix_array_u8(t));
15439        assert_eq!(freq[b'a' as usize], 3);
15440        assert_eq!(freq[b'b' as usize], 1);
15441        assert_eq!(freq[b'n' as usize], 2);
15442    }
15443
15444    #[test]
15445    fn libsais64_int_matches_bruteforce_suffix_array_for_small_integer_text() {
15446        let mut t = vec![2, 1, 3, 1, 0];
15447        let expected = {
15448            let mut sa: Vec<SaSint> = (0..t.len())
15449                .map(|index| SaSint::try_from(index).expect("index must fit SaSint"))
15450                .collect();
15451            sa.sort_by(|&lhs, &rhs| {
15452                t[usize::try_from(lhs).expect("non-negative")..]
15453                    .cmp(&t[usize::try_from(rhs).expect("non-negative")..])
15454            });
15455            sa
15456        };
15457        let mut sa = vec![0; t.len()];
15458
15459        let result = libsais64_int(&mut t, &mut sa, 4, 0);
15460
15461        assert_eq!(result, 0);
15462        assert_eq!(sa, expected);
15463    }
15464
15465    #[test]
15466    fn libsais64_plcp_matches_bruteforce_for_small_text() {
15467        let t = b"banana";
15468        let sa = brute_force_suffix_array_u8(t);
15469        let expected = brute_force_plcp_u8(t, &sa);
15470        let mut plcp = vec![0; t.len()];
15471
15472        let result = libsais64_plcp(t, &sa, &mut plcp);
15473
15474        assert_eq!(result, 0);
15475        assert_eq!(plcp, expected);
15476    }
15477
15478    #[test]
15479    fn libsais64_plcp_gsa_stops_at_separator() {
15480        let t = b"ab\0b\0";
15481        let sa = brute_force_suffix_array_u8(t);
15482        let mut plcp = vec![0; t.len()];
15483
15484        let result = libsais64_plcp_gsa(t, &sa, &mut plcp);
15485
15486        assert_eq!(result, 0);
15487        assert_eq!(plcp[2], 0);
15488        assert_eq!(plcp[4], 0);
15489    }
15490
15491    #[test]
15492    fn libsais64_lcp_matches_bruteforce_for_small_text() {
15493        let t = b"banana";
15494        let sa = brute_force_suffix_array_u8(t);
15495        let plcp = brute_force_plcp_u8(t, &sa);
15496        let expected = brute_force_lcp_from_sa_u8(t, &sa);
15497        let mut lcp = vec![0; t.len()];
15498
15499        let result = libsais64_lcp(&plcp, &sa, &mut lcp);
15500
15501        assert_eq!(result, 0);
15502        assert_eq!(lcp, expected);
15503    }
15504
15505    #[test]
15506    fn libsais64_unbwt_init_parallel_uses_block_partition_for_large_inputs() {
15507        let n = 70_003usize;
15508        let t: Vec<u8> = (0..n)
15509            .map(|i| i.wrapping_mul(37).wrapping_add(i >> 3) as u8)
15510            .collect();
15511        let i = [12_345u64];
15512
15513        let mut single_p = vec![0u64; n + 1];
15514        let mut threaded_p = vec![0u64; n + 1];
15515        let mut single_bucket2 = vec![0u64; ALPHABET_SIZE * ALPHABET_SIZE];
15516        let mut threaded_bucket2 = vec![0u64; ALPHABET_SIZE * ALPHABET_SIZE];
15517        let mut single_fastbits = vec![0u16; 1 + (1 << UNBWT_FASTBITS)];
15518        let mut threaded_fastbits = vec![0u16; 1 + (1 << UNBWT_FASTBITS)];
15519        let mut buckets = vec![0u64; 4 * (ALPHABET_SIZE + ALPHABET_SIZE * ALPHABET_SIZE)];
15520
15521        unbwt_init_single(
15522            &t,
15523            &mut single_p,
15524            n as SaSint,
15525            None,
15526            &i,
15527            &mut single_bucket2,
15528            &mut single_fastbits,
15529        );
15530        unbwt_init_parallel(
15531            &t,
15532            &mut threaded_p,
15533            n as SaSint,
15534            None,
15535            &i,
15536            &mut threaded_bucket2,
15537            &mut threaded_fastbits,
15538            Some(&mut buckets),
15539            4,
15540        );
15541
15542        assert_eq!(threaded_p, single_p);
15543        assert_eq!(threaded_bucket2, single_bucket2);
15544        assert_eq!(threaded_fastbits, single_fastbits);
15545    }
15546
15547    #[test]
15548    fn libsais64_radix_sort_lms_suffixes_8u_places_suffixes_by_bucket() {
15549        let t = vec![1_u8, 0, 1, 0];
15550        let mut sa = vec![9, 9, 9, 9, 0, 1, 2, 3];
15551        let mut induction_bucket = vec![0; 2 * ALPHABET_SIZE];
15552        induction_bucket[buckets_index2(0, 0)] = 2;
15553        induction_bucket[buckets_index2(1, 0)] = 4;
15554        radix_sort_lms_suffixes_8u(&t, &mut sa, &mut induction_bucket, 4, 4);
15555        assert_eq!(&sa[..4], &[1, 3, 0, 2]);
15556    }
15557
15558    #[test]
15559    fn libsais64_radix_sort_lms_suffixes_8u_omp_wraps_sequential_version() {
15560        let t = vec![9_u8, 1, 0, 1, 0];
15561        let mut sa = vec![9, 9, 9, 9, 9, 1, 2, 3, 4];
15562        let mut buckets = vec![0; 6 * ALPHABET_SIZE];
15563        buckets[4 * ALPHABET_SIZE + buckets_index2(0, 0)] = 2;
15564        buckets[4 * ALPHABET_SIZE + buckets_index2(1, 0)] = 4;
15565        let mut thread_state = alloc_thread_state(2).unwrap();
15566        radix_sort_lms_suffixes_8u_omp(&t, &mut sa, 9, 5, 0, &mut buckets, 2, &mut thread_state);
15567        assert_eq!(&sa[..4], &[2, 4, 1, 3]);
15568    }
15569
15570    #[test]
15571    fn libsais64_radix_sort_lms_suffixes_32s_6k_places_suffixes_by_bucket() {
15572        let t = vec![1, 0, 1, 0];
15573        let mut sa = vec![9, 9, 9, 9, 0, 1, 2, 3];
15574        let mut induction_bucket = vec![2, 4];
15575        radix_sort_lms_suffixes_32s_6k(&t, &mut sa, &mut induction_bucket, 4, 4);
15576        assert_eq!(&sa[..4], &[1, 3, 0, 2]);
15577    }
15578
15579    #[test]
15580    fn libsais64_radix_sort_lms_suffixes_32s_2k_places_suffixes_by_bucket() {
15581        let t = vec![1, 0, 1, 0];
15582        let mut sa = vec![9, 9, 9, 9, 0, 1, 2, 3];
15583        let mut induction_bucket = vec![2, 0, 4, 0];
15584        radix_sort_lms_suffixes_32s_2k(&t, &mut sa, &mut induction_bucket, 4, 4);
15585        assert_eq!(&sa[..4], &[1, 3, 0, 2]);
15586    }
15587
15588    #[test]
15589    fn libsais64_radix_sort_lms_suffixes_32s_6k_omp_wraps_sequential_version() {
15590        let t = vec![9, 1, 0, 1, 0];
15591        let mut sa = vec![9, 9, 9, 9, 9, 1, 2, 3, 4];
15592        let mut induction_bucket = vec![2, 4];
15593        let mut thread_state = alloc_thread_state(2).unwrap();
15594        radix_sort_lms_suffixes_32s_6k_omp(
15595            &t,
15596            &mut sa,
15597            9,
15598            5,
15599            &mut induction_bucket,
15600            2,
15601            &mut thread_state,
15602        );
15603        assert_eq!(&sa[..4], &[2, 4, 1, 3]);
15604    }
15605
15606    #[test]
15607    fn libsais64_radix_sort_lms_suffixes_32s_2k_omp_wraps_sequential_version() {
15608        let t = vec![9, 1, 0, 1, 0];
15609        let mut sa = vec![9, 9, 9, 9, 9, 1, 2, 3, 4];
15610        let mut induction_bucket = vec![2, 0, 4, 0];
15611        let mut thread_state = alloc_thread_state(2).unwrap();
15612        radix_sort_lms_suffixes_32s_2k_omp(
15613            &t,
15614            &mut sa,
15615            9,
15616            5,
15617            &mut induction_bucket,
15618            2,
15619            &mut thread_state,
15620        );
15621        assert_eq!(&sa[..4], &[2, 4, 1, 3]);
15622    }
15623
15624    #[test]
15625    fn libsais64_radix_sort_lms_suffixes_32s_block_omp_runs_cache_pipeline() {
15626        let t = vec![9, 1, 0, 1, 0];
15627        let mut sa_6k = vec![9, 9, 9, 9, 9, 1, 2, 3, 4];
15628        let mut bucket_6k = vec![2, 4];
15629        let mut cache = vec![ThreadCache::default(); 9];
15630        radix_sort_lms_suffixes_32s_6k_block_omp(
15631            &t,
15632            &mut sa_6k,
15633            &mut bucket_6k,
15634            &mut cache,
15635            5,
15636            4,
15637            2,
15638        );
15639        assert_eq!(&sa_6k[..4], &[2, 4, 1, 3]);
15640
15641        let mut sa_2k = vec![9, 9, 9, 9, 9, 1, 2, 3, 4];
15642        let mut bucket_2k = vec![2, 0, 4, 0];
15643        cache.fill(ThreadCache::default());
15644        radix_sort_lms_suffixes_32s_2k_block_omp(
15645            &t,
15646            &mut sa_2k,
15647            &mut bucket_2k,
15648            &mut cache,
15649            5,
15650            4,
15651            2,
15652        );
15653        assert_eq!(&sa_2k[..4], &[2, 4, 1, 3]);
15654    }
15655
15656    #[test]
15657    fn libsais64_radix_sort_lms_suffixes_8u_omp_uses_thread_state_for_large_inputs() {
15658        let m = 65_600usize;
15659        let n = 2 * m + 16;
15660        let start = n - m + 1;
15661        let t: Vec<u8> = (0..n).map(|i| (i % 4) as u8).collect();
15662        let suffixes: Vec<SaSint> = (0..m - 1).map(|i| i as SaSint).collect();
15663
15664        let mut buckets = vec![0; 6 * ALPHABET_SIZE];
15665        for &suffix in &suffixes {
15666            buckets[4 * ALPHABET_SIZE + buckets_index2(t[suffix as usize] as usize, 0)] += 1;
15667        }
15668        let mut sum = 0;
15669        for symbol in 0..ALPHABET_SIZE {
15670            let bucket = 4 * ALPHABET_SIZE + buckets_index2(symbol, 0);
15671            sum += buckets[bucket];
15672            buckets[bucket] = sum;
15673        }
15674
15675        let mut sa_single = vec![0; n];
15676        sa_single[start..start + suffixes.len()].copy_from_slice(&suffixes);
15677        let mut sa_threaded = sa_single.clone();
15678        let mut buckets_single = buckets.clone();
15679        let mut buckets_threaded = buckets;
15680        let mut thread_state = alloc_thread_state(4).unwrap();
15681        thread_state[3].m = m as FastSint;
15682
15683        radix_sort_lms_suffixes_8u_omp(
15684            &t,
15685            &mut sa_single,
15686            n as SaSint,
15687            m as SaSint,
15688            0,
15689            &mut buckets_single,
15690            1,
15691            &mut [],
15692        );
15693        radix_sort_lms_suffixes_8u_omp(
15694            &t,
15695            &mut sa_threaded,
15696            n as SaSint,
15697            m as SaSint,
15698            0,
15699            &mut buckets_threaded,
15700            4,
15701            &mut thread_state,
15702        );
15703
15704        assert_eq!(sa_threaded, sa_single);
15705    }
15706
15707    #[test]
15708    fn libsais64_radix_sort_lms_suffixes_32s_omp_uses_block_pipeline_for_large_inputs() {
15709        let m = 65_600usize;
15710        let n = 2 * m + 16;
15711        let start = n - m + 1;
15712        let t: Vec<SaSint> = (0..n).map(|i| (i % 4) as SaSint).collect();
15713        let suffixes: Vec<SaSint> = (0..m - 1).map(|i| i as SaSint).collect();
15714
15715        let mut bucket_ends = vec![0; 4];
15716        for &suffix in &suffixes {
15717            bucket_ends[t[suffix as usize] as usize] += 1;
15718        }
15719        let mut sum = 0;
15720        for bucket in &mut bucket_ends {
15721            sum += *bucket;
15722            *bucket = sum;
15723        }
15724
15725        let mut sa_single = vec![0; n];
15726        sa_single[start..start + suffixes.len()].copy_from_slice(&suffixes);
15727        let mut sa_threaded = sa_single.clone();
15728        let mut bucket_single = bucket_ends.clone();
15729        let mut bucket_threaded = bucket_ends.clone();
15730        let mut thread_state = alloc_thread_state(4).unwrap();
15731
15732        radix_sort_lms_suffixes_32s_6k_omp(
15733            &t,
15734            &mut sa_single,
15735            n as SaSint,
15736            m as SaSint,
15737            &mut bucket_single,
15738            1,
15739            &mut [],
15740        );
15741        radix_sort_lms_suffixes_32s_6k_omp(
15742            &t,
15743            &mut sa_threaded,
15744            n as SaSint,
15745            m as SaSint,
15746            &mut bucket_threaded,
15747            4,
15748            &mut thread_state,
15749        );
15750        assert_eq!(sa_threaded, sa_single);
15751        assert_eq!(bucket_threaded, bucket_single);
15752
15753        let mut bucket_2k = vec![0; 8];
15754        for (symbol, &end) in bucket_ends.iter().enumerate() {
15755            bucket_2k[buckets_index2(symbol, 0)] = end;
15756        }
15757        let mut sa_single = vec![0; n];
15758        sa_single[start..start + suffixes.len()].copy_from_slice(&suffixes);
15759        let mut sa_threaded = sa_single.clone();
15760        let mut bucket_single = bucket_2k.clone();
15761        let mut bucket_threaded = bucket_2k;
15762
15763        radix_sort_lms_suffixes_32s_2k_omp(
15764            &t,
15765            &mut sa_single,
15766            n as SaSint,
15767            m as SaSint,
15768            &mut bucket_single,
15769            1,
15770            &mut [],
15771        );
15772        radix_sort_lms_suffixes_32s_2k_omp(
15773            &t,
15774            &mut sa_threaded,
15775            n as SaSint,
15776            m as SaSint,
15777            &mut bucket_threaded,
15778            4,
15779            &mut thread_state,
15780        );
15781        assert_eq!(sa_threaded, sa_single);
15782        assert_eq!(bucket_threaded, bucket_single);
15783    }
15784
15785    #[test]
15786    fn libsais64_radix_sort_lms_suffixes_32s_1k_collects_lms_suffixes() {
15787        let t = vec![2, 1, 3, 1, 0];
15788        let mut sa = vec![0; t.len()];
15789        let mut buckets = vec![0, 2, 4, 5];
15790        let m = radix_sort_lms_suffixes_32s_1k(&t, &mut sa, t.len() as SaSint, &mut buckets);
15791        assert!(m >= 0);
15792    }
15793
15794    #[test]
15795    fn libsais64_radix_sort_set_markers_32s_6k_marks_target_suffixes() {
15796        let mut sa = vec![0; 6];
15797        let induction_bucket = vec![1, 3, 5];
15798        radix_sort_set_markers_32s_6k(&mut sa, &induction_bucket, 0, 3);
15799        assert_eq!(sa[1], SAINT_MIN);
15800        assert_eq!(sa[3], SAINT_MIN);
15801        assert_eq!(sa[5], SAINT_MIN);
15802    }
15803
15804    #[test]
15805    fn libsais64_radix_sort_set_markers_32s_4k_marks_target_suffixes() {
15806        let mut sa = vec![0; 6];
15807        let induction_bucket = vec![1, 0, 3, 0, 5, 0];
15808        radix_sort_set_markers_32s_4k(&mut sa, &induction_bucket, 0, 3);
15809        assert_eq!(sa[1], SUFFIX_GROUP_MARKER);
15810        assert_eq!(sa[3], SUFFIX_GROUP_MARKER);
15811        assert_eq!(sa[5], SUFFIX_GROUP_MARKER);
15812    }
15813
15814    #[test]
15815    fn libsais64_radix_sort_set_markers_32s_6k_omp_wraps_sequential_version() {
15816        let mut sa = vec![0; 6];
15817        let induction_bucket = vec![1, 3, 5];
15818        radix_sort_set_markers_32s_6k_omp(&mut sa, 4, &induction_bucket, 2);
15819        assert_eq!(sa[1], SAINT_MIN);
15820        assert_eq!(sa[3], SAINT_MIN);
15821        assert_eq!(sa[5], SAINT_MIN);
15822    }
15823
15824    #[test]
15825    fn libsais64_radix_sort_set_markers_32s_4k_omp_wraps_sequential_version() {
15826        let mut sa = vec![0; 6];
15827        let induction_bucket = vec![1, 0, 3, 0, 5, 0];
15828        radix_sort_set_markers_32s_4k_omp(&mut sa, 4, &induction_bucket, 2);
15829        assert_eq!(sa[1], SUFFIX_GROUP_MARKER);
15830        assert_eq!(sa[3], SUFFIX_GROUP_MARKER);
15831        assert_eq!(sa[5], SUFFIX_GROUP_MARKER);
15832    }
15833
15834    #[test]
15835    fn libsais64_radix_sort_set_markers_32s_omp_partitions_large_inputs() {
15836        let k = 65_600usize;
15837        let induction_bucket_6k: Vec<SaSint> = (0..k).map(|i| i as SaSint).collect();
15838        let mut sa_single = vec![0; k];
15839        let mut sa_threaded = vec![0; k];
15840        radix_sort_set_markers_32s_6k_omp(&mut sa_single, k as SaSint, &induction_bucket_6k, 1);
15841        radix_sort_set_markers_32s_6k_omp(&mut sa_threaded, k as SaSint, &induction_bucket_6k, 4);
15842        assert_eq!(sa_threaded, sa_single);
15843
15844        let mut induction_bucket_4k = vec![0; 2 * k];
15845        for i in 0..k {
15846            induction_bucket_4k[buckets_index2(i, 0)] = i as SaSint;
15847        }
15848        let mut sa_single = vec![0; k];
15849        let mut sa_threaded = vec![0; k];
15850        radix_sort_set_markers_32s_4k_omp(&mut sa_single, k as SaSint, &induction_bucket_4k, 1);
15851        radix_sort_set_markers_32s_4k_omp(&mut sa_threaded, k as SaSint, &induction_bucket_4k, 4);
15852        assert_eq!(sa_threaded, sa_single);
15853    }
15854
15855    #[test]
15856    fn libsais64_partial_sorting_scan_left_to_right_8u_emits_induced_suffixes() {
15857        let t = vec![2_u8, 1, 3, 1, 0];
15858        let mut sa = vec![2 | SAINT_MIN, 4, 0, 0, 0, 0];
15859        let mut buckets = vec![0; 6 * ALPHABET_SIZE];
15860        buckets[4 * ALPHABET_SIZE + buckets_index2(1, 0)] = 2;
15861        let d = partial_sorting_scan_left_to_right_8u(&t, &mut sa, &mut buckets, 0, 0, 2);
15862        assert!(d >= 0);
15863        assert!(sa.iter().any(|&v| v != 0));
15864    }
15865
15866    #[test]
15867    fn libsais64_partial_sorting_scan_left_to_right_8u_omp_wraps_sequential_version() {
15868        let t = vec![2_u8, 1, 3, 1, 0];
15869        let mut sa = vec![0; 8];
15870        let mut buckets = vec![0; 6 * ALPHABET_SIZE];
15871        buckets[4 * ALPHABET_SIZE + buckets_index2(0, 0)] = 1;
15872        let mut thread_state = alloc_thread_state(2).unwrap();
15873        let d = partial_sorting_scan_left_to_right_8u_omp(
15874            &t,
15875            &mut sa,
15876            5,
15877            4,
15878            &mut buckets,
15879            0,
15880            0,
15881            2,
15882            &mut thread_state,
15883        );
15884        assert!(d >= 1);
15885    }
15886
15887    #[test]
15888    fn libsais64_partial_sorting_scan_left_to_right_32s_6k_emits_induced_suffixes() {
15889        let t = vec![2, 1, 3, 1, 0];
15890        let mut sa = vec![2 | SAINT_MIN, 4, 0, 0, 0, 0];
15891        let mut buckets = vec![0; 4 * 4];
15892        buckets[buckets_index4(1, 0)] = 2;
15893        let d = partial_sorting_scan_left_to_right_32s_6k(&t, &mut sa, &mut buckets, 0, 0, 2);
15894        assert!(d >= 0);
15895        assert!(sa.iter().any(|&v| v != 0));
15896    }
15897
15898    #[test]
15899    fn libsais64_partial_sorting_scan_left_to_right_32s_4k_emits_induced_suffixes() {
15900        let t = vec![2, 1, 3, 1, 0];
15901        let k = 4usize;
15902        let mut sa = vec![2 | SUFFIX_GROUP_MARKER, 4, 0, 0, 0, 0];
15903        let mut buckets = vec![0; 4 * k];
15904        buckets[2 * k + 1] = 2;
15905        let d = partial_sorting_scan_left_to_right_32s_4k(
15906            &t,
15907            &mut sa,
15908            k as SaSint,
15909            &mut buckets,
15910            0,
15911            0,
15912            2,
15913        );
15914        assert!(d >= 0);
15915        assert!(sa.iter().any(|&v| v != 0));
15916    }
15917
15918    #[test]
15919    fn libsais64_partial_sorting_scan_left_to_right_32s_1k_emits_induced_suffixes() {
15920        let t = vec![2, 1, 3, 1, 0];
15921        let mut sa = vec![2, 4, 0, 0, 0, 0];
15922        let mut buckets = vec![0; 4];
15923        buckets[1] = 2;
15924        partial_sorting_scan_left_to_right_32s_1k(&t, &mut sa, &mut buckets, 0, 2);
15925        assert!(sa.iter().any(|&v| v != 0));
15926    }
15927
15928    #[test]
15929    fn libsais64_partial_sorting_scan_left_to_right_32s_6k_omp_wraps_sequential_version() {
15930        let t = vec![2, 1, 3, 1, 0];
15931        let mut sa = vec![0; 8];
15932        let mut buckets = vec![0; 4 * 4];
15933        let mut thread_state = alloc_thread_state(2).unwrap();
15934        let d = partial_sorting_scan_left_to_right_32s_6k_omp(
15935            &t,
15936            &mut sa,
15937            5,
15938            &mut buckets,
15939            0,
15940            0,
15941            2,
15942            &mut thread_state,
15943        );
15944        assert!(d >= 1);
15945    }
15946
15947    #[test]
15948    fn libsais64_partial_sorting_scan_left_to_right_32s_4k_omp_wraps_sequential_version() {
15949        let t = vec![2, 1, 3, 1, 0];
15950        let k = 4usize;
15951        let mut sa = vec![0; 8];
15952        let mut buckets = vec![0; 4 * k];
15953        let mut thread_state = alloc_thread_state(2).unwrap();
15954        let d = partial_sorting_scan_left_to_right_32s_4k_omp(
15955            &t,
15956            &mut sa,
15957            5,
15958            k as SaSint,
15959            &mut buckets,
15960            0,
15961            2,
15962            &mut thread_state,
15963        );
15964        assert!(d >= 1);
15965    }
15966
15967    #[test]
15968    fn libsais64_partial_sorting_scan_left_to_right_32s_1k_omp_wraps_sequential_version() {
15969        let t = vec![2, 1, 3, 1, 0];
15970        let mut sa = vec![0; 8];
15971        let mut buckets = vec![0; 4];
15972        let mut thread_state = alloc_thread_state(2).unwrap();
15973        partial_sorting_scan_left_to_right_32s_1k_omp(
15974            &t,
15975            &mut sa,
15976            5,
15977            &mut buckets,
15978            2,
15979            &mut thread_state,
15980        );
15981        assert!(sa.iter().any(|&v| v != 0));
15982    }
15983
15984    #[test]
15985    fn libsais64_partial_sorting_scan_left_to_right_32s_6k_block_gather_records_bucket_symbols() {
15986        let t = vec![3, 1, 2, 0];
15987        let mut sa = vec![2 | SAINT_MIN, 0, 0, 0];
15988        let mut cache = vec![ThreadCache::default(); 1];
15989
15990        partial_sorting_scan_left_to_right_32s_6k_block_gather(&t, &mut sa, &mut cache, 0, 1);
15991
15992        assert_eq!(cache[0].index, 2 | SAINT_MIN);
15993        assert_eq!(cache[0].symbol, buckets_index4(1, 1) as SaSint);
15994    }
15995
15996    #[test]
15997    fn libsais64_partial_sorting_scan_left_to_right_32s_1k_block_gather_zeroes_positive_entries() {
15998        let t = vec![3, 1, 2, 0];
15999        let mut sa = vec![2, 0, 0, 0];
16000        let mut cache = vec![ThreadCache::default(); 1];
16001
16002        partial_sorting_scan_left_to_right_32s_1k_block_gather(&t, &mut sa, &mut cache, 0, 1);
16003
16004        assert_eq!(cache[0].symbol, 1);
16005        assert_eq!(cache[0].index, 1);
16006        assert_eq!(sa[0], 0);
16007    }
16008
16009    #[test]
16010    fn libsais64_partial_sorting_scan_left_to_right_32s_1k_block_omp_uses_relative_cache() {
16011        let block_start = 20_000usize;
16012        let block_size = 16_384usize;
16013        let n = block_start + block_size + 8;
16014        let t = vec![1; n];
16015        let suffixes: Vec<SaSint> = (2..2 + block_size).map(|i| i as SaSint).collect();
16016
16017        let mut sa_single = vec![0; n];
16018        sa_single[block_start..block_start + block_size].copy_from_slice(&suffixes);
16019        let mut sa_threaded = sa_single.clone();
16020        let mut bucket_single = vec![0, 0];
16021        let mut bucket_threaded = bucket_single.clone();
16022        let mut cache = vec![ThreadCache::default(); 4 * LIBSAIS_PER_THREAD_CACHE_SIZE];
16023
16024        partial_sorting_scan_left_to_right_32s_1k(
16025            &t,
16026            &mut sa_single,
16027            &mut bucket_single,
16028            block_start as FastSint,
16029            block_size as FastSint,
16030        );
16031        partial_sorting_scan_left_to_right_32s_1k_block_omp(
16032            &t,
16033            &mut sa_threaded,
16034            &mut bucket_threaded,
16035            &mut cache,
16036            block_start as FastSint,
16037            block_size as FastSint,
16038            4,
16039        );
16040
16041        assert_eq!(sa_threaded, sa_single);
16042        assert_eq!(bucket_threaded, bucket_single);
16043    }
16044
16045    #[test]
16046    fn libsais64_partial_sorting_scan_left_to_right_8u_block_prepare_records_cache_and_counts() {
16047        let t = vec![2_u8, 1, 3, 1, 0];
16048        let sa = vec![2 | SAINT_MIN, 4, 0, 0, 0, 0];
16049        let k = 4;
16050        let mut buckets = vec![0; 4 * k];
16051        let mut cache = vec![ThreadCache::default(); 8];
16052        let mut state = ThreadState::new();
16053        let (position, count) = partial_sorting_scan_left_to_right_8u_block_prepare(
16054            &t,
16055            &sa,
16056            k as SaSint,
16057            &mut buckets,
16058            &mut cache,
16059            0,
16060            2,
16061        );
16062        state.position = position;
16063        state.count = count;
16064        assert!(state.count >= 1);
16065        assert!(cache
16066            .iter()
16067            .take(state.count as usize)
16068            .any(|entry| entry.symbol >= 0));
16069    }
16070
16071    #[test]
16072    fn libsais64_partial_sorting_scan_left_to_right_8u_block_place_writes_induced_values() {
16073        let mut sa = vec![0; 8];
16074        let mut buckets = vec![0; 8];
16075        buckets[0] = 0;
16076        buckets[1] = 1;
16077        let cache = vec![
16078            ThreadCache {
16079                index: 3 | SAINT_MIN,
16080                symbol: 0,
16081            },
16082            ThreadCache {
16083                index: 5,
16084                symbol: 1,
16085            },
16086        ];
16087        partial_sorting_scan_left_to_right_8u_block_place(&mut sa, &mut buckets, 2, &cache, 2, 0);
16088        assert!(sa[0] != 0 || sa[1] != 0);
16089    }
16090
16091    #[test]
16092    fn libsais64_partial_sorting_scan_left_to_right_8u_block_omp_wraps_sequential_version() {
16093        let t = vec![2_u8, 1, 3, 1, 0];
16094        let mut sa = vec![2 | SAINT_MIN, 4, 0, 0, 0, 0];
16095        let mut buckets = vec![0; 6 * ALPHABET_SIZE];
16096        let mut thread_state = alloc_thread_state(2).unwrap();
16097        let d = partial_sorting_scan_left_to_right_8u_block_omp(
16098            &t,
16099            &mut sa,
16100            4,
16101            &mut buckets,
16102            0,
16103            0,
16104            2,
16105            2,
16106            &mut thread_state,
16107        );
16108        assert!(d >= 0);
16109    }
16110
16111    #[test]
16112    fn libsais64_partial_sorting_shift_buckets_32s_6k_moves_temp_bucket_view_into_main_slots() {
16113        let k = 3usize;
16114        let mut buckets = vec![0; 6 * k];
16115        buckets[4 * k] = 10;
16116        buckets[4 * k + 1] = 11;
16117        buckets[4 * k + 2] = 12;
16118        buckets[4 * k + 3] = 13;
16119        partial_sorting_shift_buckets_32s_6k(k as SaSint, &mut buckets);
16120        assert_eq!(buckets[0], 10);
16121        assert_eq!(buckets[1], 11);
16122        assert_eq!(buckets[4], 12);
16123        assert_eq!(buckets[5], 13);
16124    }
16125
16126    #[test]
16127    fn libsais64_partial_sorting_scan_right_to_left_8u_emits_induced_suffixes() {
16128        let t = vec![0_u8, 1, 2, 1, 0];
16129        let mut sa = vec![0, 0, 4 | SAINT_MIN];
16130        let mut buckets = vec![0; 4 * ALPHABET_SIZE];
16131        buckets[buckets_index2(1, 1)] = 2;
16132
16133        let d = partial_sorting_scan_right_to_left_8u(&t, &mut sa, &mut buckets, 0, 2, 1);
16134
16135        assert_eq!(d, 1);
16136        assert_eq!(sa[1], 3 | SAINT_MIN);
16137        assert_eq!(buckets[buckets_index2(1, 1)], 1);
16138        assert_eq!(buckets[2 * ALPHABET_SIZE + buckets_index2(1, 1)], 1);
16139    }
16140
16141    #[test]
16142    fn libsais64_partial_gsa_scan_right_to_left_8u_skips_separator_bucket() {
16143        let t = vec![1_u8, 0, 0];
16144        let mut sa = vec![0, 2 | SAINT_MIN];
16145        let mut buckets = vec![0; 4 * ALPHABET_SIZE];
16146        buckets[buckets_index2(0, 1)] = 2;
16147
16148        let d = partial_gsa_scan_right_to_left_8u(&t, &mut sa, &mut buckets, 0, 1, 1);
16149
16150        assert_eq!(d, 1);
16151        assert_eq!(sa, vec![0, 2 | SAINT_MIN]);
16152        assert_eq!(buckets[buckets_index2(0, 1)], 2);
16153    }
16154
16155    #[test]
16156    fn libsais64_partial_sorting_scan_right_to_left_32s_6k_emits_induced_suffixes() {
16157        let t = vec![0, 1, 2, 1, 0];
16158        let mut sa = vec![0, 0, 4 | SAINT_MIN];
16159        let mut buckets = vec![0; 4 * 3];
16160        buckets[buckets_index4(1, 1)] = 2;
16161
16162        let d = partial_sorting_scan_right_to_left_32s_6k(&t, &mut sa, &mut buckets, 0, 2, 1);
16163
16164        assert_eq!(d, 1);
16165        assert_eq!(sa[1], 3 | SAINT_MIN);
16166        assert_eq!(buckets[buckets_index4(1, 1)], 1);
16167        assert_eq!(buckets[buckets_index4(1, 1) + 2], 1);
16168    }
16169
16170    #[test]
16171    fn libsais64_partial_sorting_scan_right_to_left_32s_1k_omp_wraps_sequential_version() {
16172        let t = vec![0, 1, 2, 1, 0];
16173        let mut sa = vec![0, 0, 4];
16174        let mut buckets = vec![0; 3];
16175        buckets[1] = 2;
16176        let mut thread_state = alloc_thread_state(2).unwrap();
16177
16178        partial_sorting_scan_right_to_left_32s_1k_omp(
16179            &t,
16180            &mut sa,
16181            3,
16182            &mut buckets,
16183            2,
16184            &mut thread_state,
16185        );
16186
16187        assert_eq!(sa[1], 3 | SAINT_MIN);
16188        assert_eq!(buckets[1], 1);
16189    }
16190
16191    #[test]
16192    fn libsais64_partial_sorting_scan_right_to_left_32s_6k_block_gather_records_symbols() {
16193        let t = vec![0, 1, 2, 1, 0];
16194        let sa = vec![0, 4 | SAINT_MIN, 0];
16195        let mut cache = vec![ThreadCache::default(); sa.len()];
16196
16197        partial_sorting_scan_right_to_left_32s_6k_block_gather(&t, &sa, &mut cache, 1, 1);
16198
16199        assert_eq!(cache[0].index, 4 | SAINT_MIN);
16200        assert_eq!(cache[0].symbol, buckets_index4(1, 1) as SaSint);
16201    }
16202
16203    #[test]
16204    fn libsais64_partial_sorting_scan_right_to_left_32s_4k_block_gather_zeroes_positive_entries() {
16205        let t = vec![0, 1, 2, 1, 0];
16206        let mut sa = vec![0, 4 | SUFFIX_GROUP_MARKER, 0];
16207        let mut cache = vec![ThreadCache::default(); sa.len()];
16208
16209        partial_sorting_scan_right_to_left_32s_4k_block_gather(&t, &mut sa, &mut cache, 1, 1);
16210
16211        assert_eq!(sa[1], 0);
16212        assert_eq!(cache[0].index, 4 | SUFFIX_GROUP_MARKER);
16213        assert_eq!(cache[0].symbol, buckets_index2(1, 1) as SaSint);
16214    }
16215
16216    #[test]
16217    fn libsais64_partial_sorting_scan_right_to_left_32s_1k_block_gather_stores_preinduced_entries()
16218    {
16219        let t = vec![0, 1, 2, 1, 0];
16220        let mut sa = vec![0, 4, 0];
16221        let mut cache = vec![ThreadCache::default(); sa.len()];
16222
16223        partial_sorting_scan_right_to_left_32s_1k_block_gather(&t, &mut sa, &mut cache, 1, 1);
16224
16225        assert_eq!(sa[1], 0);
16226        assert_eq!(cache[0].index, 3 | SAINT_MIN);
16227        assert_eq!(cache[0].symbol, 1);
16228    }
16229
16230    #[test]
16231    fn libsais64_partial_sorting_scan_right_to_left_32s_6k_block_sort_updates_bucket_and_marker_state(
16232    ) {
16233        let t = vec![0, 1, 2, 1, 0];
16234        let mut cache = vec![ThreadCache::default(); 3];
16235        cache[0].index = 4 | SAINT_MIN;
16236        cache[0].symbol = buckets_index4(1, 1) as SaSint;
16237        let mut buckets = vec![0; 4 * 3];
16238        buckets[buckets_index4(1, 1)] = 2;
16239
16240        let d = partial_sorting_scan_right_to_left_32s_6k_block_sort(
16241            &t,
16242            &mut buckets,
16243            0,
16244            &mut cache,
16245            1,
16246            1,
16247        );
16248
16249        assert_eq!(d, 1);
16250        assert_eq!(cache[0].index, 3 | SAINT_MIN);
16251        assert_eq!(buckets[buckets_index4(1, 1)], 1);
16252        assert_eq!(buckets[buckets_index4(1, 1) + 2], 1);
16253    }
16254
16255    #[test]
16256    fn libsais64_partial_sorting_scan_right_to_left_32s_1k_block_omp_places_cached_suffixes() {
16257        let t = vec![0, 1, 2, 1, 0];
16258        let mut sa = vec![0, 4, 0];
16259        let mut buckets = vec![0; 3];
16260        buckets[1] = 2;
16261        let mut cache = vec![ThreadCache::default(); sa.len()];
16262
16263        partial_sorting_scan_right_to_left_32s_1k_block_omp(
16264            &t,
16265            &mut sa,
16266            &mut buckets,
16267            &mut cache,
16268            1,
16269            1,
16270            2,
16271        );
16272
16273        assert_eq!(sa[1], 3 | SAINT_MIN);
16274        assert_eq!(buckets[1], 1);
16275    }
16276
16277    #[test]
16278    fn libsais64_partial_sorting_scan_right_to_left_32s_1k_block_omp_uses_relative_cache() {
16279        let block_start = 20_000usize;
16280        let block_size = 16_384usize;
16281        let n = block_start + block_size + 8;
16282        let t = vec![1; n];
16283        let suffixes: Vec<SaSint> = (2..2 + block_size).map(|i| i as SaSint).collect();
16284
16285        let mut sa_single = vec![0; n];
16286        sa_single[block_start..block_start + block_size].copy_from_slice(&suffixes);
16287        let mut sa_threaded = sa_single.clone();
16288        let mut bucket_single = vec![0, block_size as SaSint];
16289        let mut bucket_threaded = bucket_single.clone();
16290        let mut cache = vec![ThreadCache::default(); 4 * LIBSAIS_PER_THREAD_CACHE_SIZE];
16291
16292        partial_sorting_scan_right_to_left_32s_1k(
16293            &t,
16294            &mut sa_single,
16295            &mut bucket_single,
16296            block_start as FastSint,
16297            block_size as FastSint,
16298        );
16299        partial_sorting_scan_right_to_left_32s_1k_block_omp(
16300            &t,
16301            &mut sa_threaded,
16302            &mut bucket_threaded,
16303            &mut cache,
16304            block_start as FastSint,
16305            block_size as FastSint,
16306            4,
16307        );
16308
16309        assert_eq!(sa_threaded, sa_single);
16310        assert_eq!(bucket_threaded, bucket_single);
16311    }
16312
16313    #[test]
16314    fn libsais64_partial_sorting_gather_lms_suffixes_32s_4k_compacts_negative_marked_entries() {
16315        let mut sa = vec![1 | SUFFIX_GROUP_MARKER, -3, 5 | SUFFIX_GROUP_MARKER, -7];
16316        let n = sa.len() as FastSint;
16317
16318        let l = partial_sorting_gather_lms_suffixes_32s_4k(&mut sa, 0, n);
16319
16320        assert_eq!(l, 2);
16321        assert_eq!(sa[0], (SAINT_MIN | SUFFIX_GROUP_MARKER) - 3);
16322        assert_eq!(sa[1], (SAINT_MIN | SUFFIX_GROUP_MARKER) - 7);
16323    }
16324
16325    #[test]
16326    fn libsais64_partial_sorting_gather_lms_suffixes_32s_1k_compacts_negative_marked_entries() {
16327        let mut sa = vec![1, -3, 5, -7];
16328        let n = sa.len() as FastSint;
16329
16330        let l = partial_sorting_gather_lms_suffixes_32s_1k(&mut sa, 0, n);
16331
16332        assert_eq!(l, 2);
16333        assert_eq!(sa[0], SAINT_MAX - 2);
16334        assert_eq!(sa[1], SAINT_MAX - 6);
16335    }
16336
16337    #[test]
16338    fn libsais64_partial_sorting_gather_lms_suffixes_32s_4k_omp_wraps_sequential_version() {
16339        let mut sa = vec![1 | SUFFIX_GROUP_MARKER, -3, 5 | SUFFIX_GROUP_MARKER, -7];
16340        let mut thread_state = alloc_thread_state(2).unwrap();
16341
16342        partial_sorting_gather_lms_suffixes_32s_4k_omp(&mut sa, 4, 2, &mut thread_state);
16343
16344        assert_eq!(sa[0], (SAINT_MIN | SUFFIX_GROUP_MARKER) - 3);
16345        assert_eq!(sa[1], (SAINT_MIN | SUFFIX_GROUP_MARKER) - 7);
16346    }
16347
16348    #[test]
16349    fn libsais64_partial_sorting_gather_lms_suffixes_32s_1k_omp_wraps_sequential_version() {
16350        let mut sa = vec![1, -3, 5, -7];
16351        let mut thread_state = alloc_thread_state(2).unwrap();
16352
16353        partial_sorting_gather_lms_suffixes_32s_1k_omp(&mut sa, 4, 2, &mut thread_state);
16354
16355        assert_eq!(sa[0], SAINT_MAX - 2);
16356        assert_eq!(sa[1], SAINT_MAX - 6);
16357    }
16358
16359    #[test]
16360    fn libsais64_partial_sorting_gather_lms_suffixes_32s_omp_uses_block_partition() {
16361        let n = 65_600usize;
16362        let input_4k: Vec<SaSint> = (0..n)
16363            .map(|i| {
16364                let value = (i as SaSint) | SUFFIX_GROUP_MARKER;
16365                if i % 5 == 0 {
16366                    value | SAINT_MIN
16367                } else {
16368                    value
16369                }
16370            })
16371            .collect();
16372        let count_4k = input_4k.iter().filter(|&&value| value < 0).count();
16373
16374        let mut single = input_4k.clone();
16375        let mut threaded = input_4k;
16376        let mut thread_state = alloc_thread_state(4).unwrap();
16377        partial_sorting_gather_lms_suffixes_32s_4k_omp(&mut single, n as SaSint, 1, &mut []);
16378        partial_sorting_gather_lms_suffixes_32s_4k_omp(
16379            &mut threaded,
16380            n as SaSint,
16381            4,
16382            &mut thread_state,
16383        );
16384        assert_eq!(&threaded[..count_4k], &single[..count_4k]);
16385
16386        let input_1k: Vec<SaSint> = (0..n)
16387            .map(|i| {
16388                let value = i as SaSint;
16389                if i % 7 == 0 {
16390                    value | SAINT_MIN
16391                } else {
16392                    value
16393                }
16394            })
16395            .collect();
16396        let count_1k = input_1k.iter().filter(|&&value| value < 0).count();
16397
16398        let mut single = input_1k.clone();
16399        let mut threaded = input_1k;
16400        partial_sorting_gather_lms_suffixes_32s_1k_omp(&mut single, n as SaSint, 1, &mut []);
16401        partial_sorting_gather_lms_suffixes_32s_1k_omp(
16402            &mut threaded,
16403            n as SaSint,
16404            4,
16405            &mut thread_state,
16406        );
16407        assert_eq!(&threaded[..count_1k], &single[..count_1k]);
16408    }
16409
16410    #[test]
16411    fn libsais64_partial_sorting_shift_markers_8u_omp_toggles_segment_markers() {
16412        let mut sa = vec![1 | SAINT_MIN, 2 | SAINT_MIN, 3, 4 | SAINT_MIN, 5];
16413        let mut buckets = vec![0; 6 * ALPHABET_SIZE];
16414        buckets[4 * ALPHABET_SIZE + buckets_index2(1, 0)] = 5;
16415        buckets[buckets_index2(0, 0)] = 0;
16416        let len = sa.len() as SaSint;
16417        partial_sorting_shift_markers_8u_omp(&mut sa, len, &buckets, 1);
16418        assert!(sa.iter().any(|&v| (v & SAINT_MIN) == 0));
16419    }
16420
16421    #[test]
16422    fn libsais64_partial_sorting_shift_markers_32s_6k_omp_toggles_segment_markers() {
16423        let mut sa = vec![1 | SAINT_MIN, 2 | SAINT_MIN, 3, 4 | SAINT_MIN, 5];
16424        let k = 3usize;
16425        let mut buckets = vec![0; 6 * k];
16426        buckets[buckets_index4(1, 0)] = 5;
16427        buckets[4 * k + buckets_index2(0, 0)] = 0;
16428        partial_sorting_shift_markers_32s_6k_omp(&mut sa, k as SaSint, &buckets, 1);
16429        assert!(sa.iter().any(|&v| (v & SAINT_MIN) == 0));
16430    }
16431
16432    #[test]
16433    fn libsais64_partial_sorting_shift_markers_32s_4k_toggles_group_markers() {
16434        let mut sa = vec![
16435            1 | SUFFIX_GROUP_MARKER,
16436            2 | SUFFIX_GROUP_MARKER,
16437            3,
16438            4 | SUFFIX_GROUP_MARKER,
16439        ];
16440        let len = sa.len() as SaSint;
16441        partial_sorting_shift_markers_32s_4k(&mut sa, len);
16442        assert!(sa.iter().any(|&v| (v & SUFFIX_GROUP_MARKER) == 0));
16443    }
16444
16445    #[test]
16446    fn libsais64_clear_lms_suffixes_omp_zeroes_requested_bucket_ranges() {
16447        let mut sa = vec![5, 4, 3, 2, 1, 9];
16448        let n = sa.len() as SaSint;
16449        let bucket_start = vec![1, 4, 5];
16450        let bucket_end = vec![3, 5, 5];
16451
16452        clear_lms_suffixes_omp(&mut sa, n, 3, &bucket_start, &bucket_end, 2);
16453
16454        assert_eq!(sa, vec![5, 0, 0, 2, 0, 9]);
16455    }
16456
16457    #[test]
16458    fn libsais64_final_bwt_scan_left_to_right_8u_rewrites_sa_and_induces_suffixes() {
16459        let t = vec![0_u8, 1, 2, 1, 0];
16460        let mut sa = vec![1, 0, 0];
16461        let mut induction_bucket = vec![0, 1, 3];
16462
16463        final_bwt_scan_left_to_right_8u(&t, &mut sa, &mut induction_bucket, 0, 1);
16464
16465        assert_eq!(sa[0], 0);
16466        assert_eq!(induction_bucket[0], 1);
16467    }
16468
16469    #[test]
16470    fn libsais64_final_bwt_aux_scan_left_to_right_8u_updates_sampling_array() {
16471        let t = vec![0_u8, 1, 2, 1, 0];
16472        let mut sa = vec![1, 0, 0];
16473        let mut induction_bucket = vec![0, 1, 3];
16474        let mut i_out = vec![0; 2];
16475
16476        final_bwt_aux_scan_left_to_right_8u(
16477            &t,
16478            &mut sa,
16479            0,
16480            &mut i_out,
16481            &mut induction_bucket,
16482            0,
16483            1,
16484        );
16485
16486        assert_eq!(i_out[0], 1);
16487    }
16488
16489    #[test]
16490    fn libsais64_final_sorting_scan_left_to_right_8u_clears_marker_and_places_suffix() {
16491        let t = vec![0_u8, 1, 2, 1, 0];
16492        let mut sa = vec![1, 0, 0];
16493        let mut induction_bucket = vec![0, 1, 3];
16494
16495        final_sorting_scan_left_to_right_8u(&t, &mut sa, &mut induction_bucket, 0, 1);
16496
16497        assert_eq!(sa[0], 0);
16498        assert_eq!(induction_bucket[0], 1);
16499    }
16500
16501    #[test]
16502    fn libsais64_final_sorting_scan_left_to_right_32s_clears_marker_and_places_suffix() {
16503        let t = vec![0, 1, 2, 1, 0];
16504        let mut sa = vec![1, 0, 0];
16505        let mut induction_bucket = vec![0, 1, 3];
16506
16507        final_sorting_scan_left_to_right_32s(&t, &mut sa, &mut induction_bucket, 0, 1);
16508
16509        assert_eq!(sa[0], 0);
16510        assert_eq!(induction_bucket[0], 1);
16511    }
16512
16513    #[test]
16514    fn libsais64_final_bwt_scan_left_to_right_8u_block_prepare_records_cache_and_counts() {
16515        let t = vec![0_u8, 1, 2, 1, 0];
16516        let mut sa = vec![1, 2, 0];
16517        let mut buckets = vec![99; ALPHABET_SIZE];
16518        let mut cache = vec![ThreadCache::default(); 4];
16519
16520        let count = final_bwt_scan_left_to_right_8u_block_prepare(
16521            &t,
16522            &mut sa,
16523            ALPHABET_SIZE as SaSint,
16524            &mut buckets,
16525            &mut cache,
16526            0,
16527            2,
16528        );
16529
16530        assert_eq!(count, 2);
16531        assert_eq!(sa[0] & SAINT_MAX, 0);
16532        assert_eq!(sa[1], 1 | SAINT_MIN);
16533        assert_eq!(buckets[0], 1);
16534        assert_eq!(buckets[1], 1);
16535        assert_eq!(cache[0].symbol, 0);
16536        assert_eq!(cache[0].index & SAINT_MAX, 0);
16537        assert_eq!(cache[1].symbol, 1);
16538        assert_eq!(cache[1].index & SAINT_MAX, 1);
16539    }
16540
16541    #[test]
16542    fn libsais64_final_sorting_scan_left_to_right_8u_omp_wraps_sequential_behavior() {
16543        let t = vec![0_u8, 1, 2, 1, 0];
16544        let mut sa = vec![0; t.len()];
16545        let mut induction_bucket = vec![0, 1, 3];
16546        let mut expected_sa = sa.clone();
16547        let mut expected_bucket = induction_bucket.clone();
16548
16549        final_sorting_scan_left_to_right_8u_omp(
16550            &t,
16551            &mut expected_sa,
16552            t.len() as FastSint,
16553            ALPHABET_SIZE as SaSint,
16554            &mut expected_bucket,
16555            1,
16556            &mut [],
16557        );
16558
16559        let mut thread_state = alloc_thread_state(2).unwrap();
16560
16561        final_sorting_scan_left_to_right_8u_omp(
16562            &t,
16563            &mut sa,
16564            t.len() as FastSint,
16565            ALPHABET_SIZE as SaSint,
16566            &mut induction_bucket,
16567            2,
16568            &mut thread_state,
16569        );
16570
16571        assert_eq!(sa, expected_sa);
16572        assert_eq!(induction_bucket, expected_bucket);
16573    }
16574
16575    #[test]
16576    fn libsais64_final_bwt_scan_right_to_left_8u_returns_zero_index_and_induces_suffixes() {
16577        let t = vec![0_u8, 1, 2, 1, 0];
16578        let mut sa = vec![0, 2, 0];
16579        let mut induction_bucket = vec![1, 2, 3];
16580
16581        let index = final_bwt_scan_right_to_left_8u(&t, &mut sa, &mut induction_bucket, 0, 2);
16582
16583        assert_eq!(index, 0);
16584        assert_eq!(sa[1], 1);
16585        assert_eq!(induction_bucket[1], 1);
16586    }
16587
16588    #[test]
16589    fn libsais64_final_sorting_scan_right_to_left_8u_omp_matches_sequential_path() {
16590        let t = vec![0_u8, 1, 2, 1, 0];
16591        let mut sa = vec![0, 2, 0, 0];
16592        let mut induction_bucket = vec![1, 2, 3];
16593        let mut expected_sa = sa.clone();
16594        let mut expected_bucket = induction_bucket.clone();
16595
16596        final_sorting_scan_right_to_left_8u_omp(
16597            &t,
16598            &mut expected_sa,
16599            0,
16600            2,
16601            ALPHABET_SIZE as SaSint,
16602            &mut expected_bucket,
16603            1,
16604            &mut [],
16605        );
16606
16607        let mut thread_state = alloc_thread_state(2).unwrap();
16608        final_sorting_scan_right_to_left_8u_omp(
16609            &t,
16610            &mut sa,
16611            0,
16612            2,
16613            ALPHABET_SIZE as SaSint,
16614            &mut induction_bucket,
16615            2,
16616            &mut thread_state,
16617        );
16618
16619        assert_eq!(sa, expected_sa);
16620        assert_eq!(induction_bucket, expected_bucket);
16621    }
16622
16623    #[test]
16624    fn libsais64_induce_final_order_8u_omp_non_bwt_matches_direct_final_scans() {
16625        let t = vec![0_u8, 1, 2, 1, 0];
16626        let mut sa = vec![0, 2, 0, 0, 0];
16627        let mut buckets = vec![0; 8 * ALPHABET_SIZE];
16628        buckets[6 * ALPHABET_SIZE..6 * ALPHABET_SIZE + 3].copy_from_slice(&[0, 1, 3]);
16629        buckets[7 * ALPHABET_SIZE..7 * ALPHABET_SIZE + 3].copy_from_slice(&[2, 4, 5]);
16630
16631        let mut expected_sa = sa.clone();
16632        let mut expected_left = vec![0, 1, 3];
16633        let mut expected_right = vec![2, 4, 5];
16634        final_sorting_scan_left_to_right_8u_omp(
16635            &t,
16636            &mut expected_sa,
16637            t.len() as FastSint,
16638            ALPHABET_SIZE as SaSint,
16639            &mut expected_left,
16640            1,
16641            &mut [],
16642        );
16643        final_sorting_scan_right_to_left_8u_omp(
16644            &t,
16645            &mut expected_sa,
16646            0,
16647            t.len() as FastSint,
16648            ALPHABET_SIZE as SaSint,
16649            &mut expected_right,
16650            1,
16651            &mut [],
16652        );
16653
16654        let mut thread_state = alloc_thread_state(2).unwrap();
16655        let result = induce_final_order_8u_omp(
16656            &t,
16657            &mut sa,
16658            t.len() as SaSint,
16659            ALPHABET_SIZE as SaSint,
16660            LIBSAIS_FLAGS_NONE,
16661            0,
16662            None,
16663            &mut buckets,
16664            2,
16665            &mut thread_state,
16666        );
16667
16668        assert_eq!(result, 0);
16669        assert_eq!(sa, expected_sa);
16670        assert_eq!(
16671            &buckets[6 * ALPHABET_SIZE..6 * ALPHABET_SIZE + 3],
16672            expected_left.as_slice()
16673        );
16674        assert_eq!(
16675            &buckets[7 * ALPHABET_SIZE..7 * ALPHABET_SIZE + 3],
16676            expected_right.as_slice()
16677        );
16678    }
16679
16680    #[test]
16681    fn libsais64_count_helpers_match_c_predicates() {
16682        let sa = [1, -1, 0, -3, 4, 0, -9];
16683        assert_eq!(
16684            count_negative_marked_suffixes(&sa, 0, sa.len() as FastSint),
16685            3
16686        );
16687        assert_eq!(count_zero_marked_suffixes(&sa, 0, sa.len() as FastSint), 2);
16688        assert_eq!(count_negative_marked_suffixes(&sa, 2, 3), 1);
16689        assert_eq!(count_zero_marked_suffixes(&sa, 2, 3), 1);
16690    }
16691
16692    #[test]
16693    fn libsais64_flip_suffix_markers_omp_toggles_saint_min_bits() {
16694        let mut sa = vec![1, -2, 3, -4];
16695        flip_suffix_markers_omp(&mut sa, 4, 1);
16696        assert_eq!(
16697            sa,
16698            vec![1 ^ SAINT_MIN, -2 ^ SAINT_MIN, 3 ^ SAINT_MIN, -4 ^ SAINT_MIN]
16699        );
16700    }
16701
16702    #[test]
16703    fn libsais64_place_cached_suffixes_writes_indices_to_symbol_slots() {
16704        let mut sa = vec![0; 8];
16705        let cache = vec![
16706            ThreadCache {
16707                symbol: 2,
16708                index: 10,
16709            },
16710            ThreadCache {
16711                symbol: 5,
16712                index: 20,
16713            },
16714            ThreadCache {
16715                symbol: 1,
16716                index: 30,
16717            },
16718        ];
16719
16720        place_cached_suffixes(&mut sa, &cache, 0, cache.len() as FastSint);
16721
16722        assert_eq!(sa[2], 10);
16723        assert_eq!(sa[5], 20);
16724        assert_eq!(sa[1], 30);
16725    }
16726
16727    #[test]
16728    fn libsais64_compact_and_place_cached_suffixes_discards_negative_symbols() {
16729        let mut sa = vec![0; 8];
16730        let mut cache = vec![
16731            ThreadCache {
16732                symbol: 2,
16733                index: 10,
16734            },
16735            ThreadCache {
16736                symbol: -1,
16737                index: 99,
16738            },
16739            ThreadCache {
16740                symbol: 5,
16741                index: 20,
16742            },
16743            ThreadCache {
16744                symbol: -4,
16745                index: 77,
16746            },
16747            ThreadCache {
16748                symbol: 1,
16749                index: 30,
16750            },
16751        ];
16752        let cache_len = cache.len() as FastSint;
16753
16754        compact_and_place_cached_suffixes(&mut sa, &mut cache, 0, cache_len);
16755
16756        assert_eq!(sa[2], 10);
16757        assert_eq!(sa[5], 20);
16758        assert_eq!(sa[1], 30);
16759        assert_eq!(
16760            cache[0],
16761            ThreadCache {
16762                symbol: 2,
16763                index: 10
16764            }
16765        );
16766        assert_eq!(
16767            cache[1],
16768            ThreadCache {
16769                symbol: 5,
16770                index: 20
16771            }
16772        );
16773        assert_eq!(
16774            cache[2],
16775            ThreadCache {
16776                symbol: 1,
16777                index: 30
16778            }
16779        );
16780    }
16781
16782    #[test]
16783    fn libsais64_gather_lms_suffixes_32s_collects_expected_suffix_starts() {
16784        let t = vec![2, 1, 3, 1, 0];
16785        let mut sa = vec![0; t.len()];
16786        let m = gather_lms_suffixes_32s(&t, &mut sa, t.len() as SaSint);
16787        assert!(m >= 0);
16788        assert!(sa
16789            .iter()
16790            .all(|&value| value >= 0 && value <= t.len() as SaSint));
16791        assert!(sa[t.len() - 1] >= 1 && sa[t.len() - 1] <= t.len() as SaSint - 1);
16792    }
16793
16794    #[test]
16795    fn libsais64_gather_compacted_lms_suffixes_32s_skips_negative_marked_symbols() {
16796        let t = vec![2, -1, 3, 1, 0];
16797        let mut sa = vec![0; t.len()];
16798        let m = gather_compacted_lms_suffixes_32s(&t, &mut sa, t.len() as SaSint);
16799        assert!(m >= 0);
16800        assert!(sa
16801            .iter()
16802            .all(|&value| value >= 0 && value <= t.len() as SaSint));
16803    }
16804
16805    #[test]
16806    fn libsais64_count_lms_suffixes_32s_2k_counts_two_bucket_categories() {
16807        let t = vec![2, 1, 3, 1, 0];
16808        let mut buckets = vec![0; 2 * 4];
16809        count_lms_suffixes_32s_2k(&t, t.len() as SaSint, 4, &mut buckets);
16810        assert_eq!(buckets.iter().sum::<SaSint>(), t.len() as SaSint);
16811    }
16812
16813    #[test]
16814    fn libsais64_count_lms_suffixes_32s_4k_counts_four_bucket_categories() {
16815        let t = vec![2, 1, 3, 1, 0];
16816        let mut buckets = vec![0; 4 * 4];
16817        count_lms_suffixes_32s_4k(&t, t.len() as SaSint, 4, &mut buckets);
16818        assert_eq!(buckets.iter().sum::<SaSint>(), t.len() as SaSint);
16819    }
16820
16821    #[test]
16822    fn libsais64_count_compacted_lms_suffixes_32s_2k_masks_saint_bits() {
16823        let t = vec![2, SAINT_MIN | 1, 3, 1, 0];
16824        let mut buckets = vec![0; 2 * 4];
16825        count_compacted_lms_suffixes_32s_2k(&t, t.len() as SaSint, 4, &mut buckets);
16826        assert_eq!(buckets.iter().sum::<SaSint>(), t.len() as SaSint);
16827    }
16828
16829    #[test]
16830    fn libsais64_count_and_gather_lms_suffixes_8u_updates_sa_and_buckets() {
16831        let t = vec![2_u8, 1, 3, 1, 0];
16832        let mut sa = vec![0; t.len()];
16833        let mut buckets = vec![0; 4 * ALPHABET_SIZE];
16834        let m = count_and_gather_lms_suffixes_8u(
16835            &t,
16836            &mut sa,
16837            t.len() as SaSint,
16838            &mut buckets,
16839            0,
16840            t.len() as FastSint,
16841        );
16842        assert_eq!(m, 1);
16843        assert_eq!(sa[t.len() - 1], 1);
16844        assert_eq!(buckets.iter().sum::<SaSint>(), t.len() as SaSint);
16845    }
16846
16847    #[test]
16848    fn libsais64_count_and_gather_lms_suffixes_8u_omp_preserves_sequential_wrapper_behavior() {
16849        let t = vec![2_u8, 1, 3, 1, 0];
16850        let mut sa = vec![0; t.len()];
16851        let mut buckets = vec![0; 4 * ALPHABET_SIZE];
16852        let mut thread_state = alloc_thread_state(2).unwrap();
16853        let m = count_and_gather_lms_suffixes_8u_omp(
16854            &t,
16855            &mut sa,
16856            t.len() as SaSint,
16857            &mut buckets,
16858            2,
16859            &mut thread_state,
16860        );
16861        assert_eq!(m, 1);
16862        assert_eq!(buckets.iter().sum::<SaSint>(), t.len() as SaSint);
16863    }
16864
16865    #[test]
16866    fn libsais64_get_bucket_stride_prefers_aligned_sizes_when_space_allows() {
16867        assert_eq!(get_bucket_stride(8192, 1000, 2), 1024);
16868        assert_eq!(get_bucket_stride(256, 17, 2), 32);
16869        assert_eq!(get_bucket_stride(8, 17, 2), 17);
16870    }
16871
16872    #[test]
16873    fn libsais64_count_suffixes_32s_counts_symbol_histogram() {
16874        let t = vec![2, 1, 2, 3, 1, 0, 2];
16875        let mut buckets = vec![0; 4];
16876        count_suffixes_32s(&t, t.len() as SaSint, 4, &mut buckets);
16877        assert_eq!(buckets, vec![1, 2, 3, 1]);
16878    }
16879
16880    #[test]
16881    fn libsais64_initialize_buckets_start_and_end_8u_sets_ranges_and_freq() {
16882        let mut buckets = vec![0; 8 * ALPHABET_SIZE];
16883        buckets[buckets_index4(0, 0)] = 1;
16884        buckets[buckets_index4(1, 1)] = 2;
16885        buckets[buckets_index4(2, 3)] = 3;
16886        let mut freq = vec![0; ALPHABET_SIZE];
16887        let k = initialize_buckets_start_and_end_8u(&mut buckets, Some(&mut freq));
16888        assert_eq!(k, 3);
16889        assert_eq!(freq[0], 1);
16890        assert_eq!(freq[1], 2);
16891        assert_eq!(freq[2], 3);
16892        assert_eq!(buckets[6 * ALPHABET_SIZE], 0);
16893        assert_eq!(buckets[7 * ALPHABET_SIZE], 1);
16894        assert_eq!(buckets[6 * ALPHABET_SIZE + 1], 1);
16895        assert_eq!(buckets[7 * ALPHABET_SIZE + 1], 3);
16896    }
16897
16898    #[test]
16899    fn libsais64_initialize_buckets_start_and_end_32s_6k_sets_prefix_ranges() {
16900        let k = 3;
16901        let mut buckets = vec![0; 6 * k];
16902        buckets[buckets_index4(0, 0)] = 1;
16903        buckets[buckets_index4(0, 1)] = 2;
16904        buckets[buckets_index4(1, 2)] = 3;
16905        buckets[buckets_index4(2, 3)] = 4;
16906        initialize_buckets_start_and_end_32s_6k(k as SaSint, &mut buckets);
16907        assert_eq!(&buckets[4 * k..5 * k], &[0, 3, 6]);
16908        assert_eq!(&buckets[5 * k..6 * k], &[3, 6, 10]);
16909    }
16910
16911    #[test]
16912    fn libsais64_initialize_buckets_start_and_end_32s_4k_sets_prefix_ranges() {
16913        let k = 3;
16914        let mut buckets = vec![0; 4 * k];
16915        buckets[buckets_index2(0, 0)] = 1;
16916        buckets[buckets_index2(0, 1)] = 2;
16917        buckets[buckets_index2(1, 0)] = 3;
16918        buckets[buckets_index2(2, 1)] = 4;
16919        initialize_buckets_start_and_end_32s_4k(k as SaSint, &mut buckets);
16920        assert_eq!(&buckets[2 * k..3 * k], &[0, 3, 6]);
16921        assert_eq!(&buckets[3 * k..4 * k], &[3, 6, 10]);
16922    }
16923
16924    #[test]
16925    fn libsais64_initialize_buckets_end_32s_2k_rewrites_first_lanes_to_end_positions() {
16926        let k = 3;
16927        let mut buckets = vec![1, 2, 3, 4, 5, 6];
16928        initialize_buckets_end_32s_2k(k as SaSint, &mut buckets);
16929        assert_eq!(buckets[0], 3);
16930        assert_eq!(buckets[2], 10);
16931        assert_eq!(buckets[4], 21);
16932    }
16933
16934    #[test]
16935    fn libsais64_initialize_buckets_start_and_end_32s_2k_copies_start_positions() {
16936        let k = 3;
16937        let mut buckets = vec![3, 2, 10, 4, 21, 6];
16938        initialize_buckets_start_and_end_32s_2k(k as SaSint, &mut buckets);
16939        assert_eq!(&buckets[..k], &[3, 10, 21]);
16940        assert_eq!(&buckets[k..2 * k], &[0, 3, 10]);
16941    }
16942
16943    #[test]
16944    fn libsais64_initialize_buckets_start_32s_1k_builds_prefix_starts() {
16945        let mut buckets = vec![1, 2, 3];
16946        initialize_buckets_start_32s_1k(3, &mut buckets);
16947        assert_eq!(buckets, vec![0, 1, 3]);
16948    }
16949
16950    #[test]
16951    fn libsais64_initialize_buckets_end_32s_1k_builds_prefix_ends() {
16952        let mut buckets = vec![1, 2, 3];
16953        initialize_buckets_end_32s_1k(3, &mut buckets);
16954        assert_eq!(buckets, vec![1, 3, 6]);
16955    }
16956
16957    #[test]
16958    fn libsais64_initialize_buckets_for_lms_suffixes_radix_sort_8u_returns_total_lms_slots() {
16959        let t = vec![2_u8, 1, 3, 1, 0];
16960        let mut buckets = vec![0; 6 * ALPHABET_SIZE];
16961        buckets[buckets_index4(0, 1)] = 1;
16962        buckets[buckets_index4(1, 3)] = 2;
16963        let sum = initialize_buckets_for_lms_suffixes_radix_sort_8u(&t, &mut buckets, 4);
16964        assert!(sum >= 0);
16965    }
16966
16967    #[test]
16968    fn libsais64_initialize_buckets_for_lms_suffixes_radix_sort_32s_2k_rewrites_two_lane_prefixes()
16969    {
16970        let t = vec![2, 1, 3, 1, 0];
16971        let mut buckets = vec![0; 2 * 4];
16972        initialize_buckets_for_lms_suffixes_radix_sort_32s_2k(&t, 4, &mut buckets, 4);
16973        assert!(buckets.iter().any(|&v| v != 0));
16974    }
16975
16976    #[test]
16977    fn libsais64_initialize_buckets_for_lms_suffixes_radix_sort_32s_6k_returns_total_lms_slots() {
16978        let t = vec![2, 1, 3, 1, 0];
16979        let mut buckets = vec![0; 6 * 4];
16980        buckets[buckets_index4(0, 1)] = 1;
16981        buckets[buckets_index4(1, 3)] = 2;
16982        let sum = initialize_buckets_for_lms_suffixes_radix_sort_32s_6k(&t, 4, &mut buckets, 4);
16983        assert!(sum >= 0);
16984    }
16985
16986    #[test]
16987    fn libsais64_initialize_buckets_for_radix_and_partial_sorting_32s_4k_sets_start_end_views() {
16988        let t = vec![2, 1, 3, 1, 0];
16989        let k = 4usize;
16990        let mut buckets = vec![0; 4 * k];
16991        buckets[buckets_index2(0, 0)] = 1;
16992        buckets[buckets_index2(0, 1)] = 2;
16993        buckets[buckets_index2(1, 0)] = 3;
16994        initialize_buckets_for_radix_and_partial_sorting_32s_4k(&t, k as SaSint, &mut buckets, 4);
16995        assert_eq!(buckets[2 * k], 0);
16996        assert!(buckets[3 * k] >= buckets[2 * k]);
16997    }
16998
16999    #[test]
17000    fn libsais64_initialize_buckets_for_partial_sorting_8u_sets_start_and_distinct_views() {
17001        let t = vec![2_u8, 1, 3, 1, 0];
17002        let mut buckets = vec![0; 6 * ALPHABET_SIZE];
17003        buckets[buckets_index4(0, 0)] = 1;
17004        buckets[buckets_index4(0, 2)] = 2;
17005        initialize_buckets_for_partial_sorting_8u(&t, &mut buckets, 4, 3);
17006        assert!(buckets[0] >= 4);
17007        assert!(buckets[1] >= 0);
17008        assert!(buckets[4 * ALPHABET_SIZE] >= 4);
17009    }
17010
17011    #[test]
17012    fn libsais64_initialize_buckets_for_partial_sorting_32s_6k_rewrites_bucket_views() {
17013        let t = vec![2, 1, 3, 1, 0];
17014        let k = 4usize;
17015        let mut buckets = vec![0; 6 * k];
17016        buckets[buckets_index4(0, 0)] = 1;
17017        buckets[buckets_index4(0, 1)] = 2;
17018        buckets[buckets_index4(1, 2)] = 3;
17019        initialize_buckets_for_partial_sorting_32s_6k(&t, k as SaSint, &mut buckets, 4, 3);
17020        assert!(buckets[0] >= 4);
17021        assert!(buckets[4 * k] >= 4);
17022    }
17023
17024    #[test]
17025    fn libsais64_place_lms_suffixes_interval_32s_4k_moves_suffixes_into_bucket_intervals() {
17026        let mut sa = vec![10, 11, 12, 13, 14];
17027        let k = 3usize;
17028        let mut buckets = vec![0; 4 * k];
17029        buckets[buckets_index2(0, 1)] = 0;
17030        buckets[buckets_index2(1, 1)] = 2;
17031        buckets[buckets_index2(2, 1)] = 3;
17032        buckets[3 * k] = 2;
17033        buckets[3 * k + 1] = 5;
17034
17035        place_lms_suffixes_interval_32s_4k(&mut sa, 5, k as SaSint, 5, &buckets);
17036
17037        assert_eq!(sa, vec![0, 0, 0, 0, 14]);
17038    }
17039
17040    #[test]
17041    fn libsais64_place_lms_suffixes_interval_32s_2k_moves_suffixes_into_bucket_intervals() {
17042        let mut sa = vec![10, 11, 12, 13, 14];
17043        let mut buckets = vec![0; 2 * 3];
17044        buckets[buckets_index2(0, 0)] = 2;
17045        buckets[buckets_index2(0, 1)] = 0;
17046        buckets[buckets_index2(1, 0)] = 5;
17047        buckets[buckets_index2(1, 1)] = 2;
17048        buckets[buckets_index2(2, 0)] = 5;
17049        buckets[buckets_index2(2, 1)] = 3;
17050
17051        place_lms_suffixes_interval_32s_2k(&mut sa, 5, 3, 5, &buckets);
17052
17053        assert_eq!(sa, vec![0, 0, 0, 0, 14]);
17054    }
17055
17056    #[test]
17057    fn libsais64_place_lms_suffixes_interval_32s_1k_places_suffixes_by_symbol_bucket() {
17058        let t = vec![0, 1, 1, 2, 2];
17059        let mut sa = vec![1, 2, 3, 4, 99];
17060        let buckets = vec![0, 2, 5];
17061
17062        place_lms_suffixes_interval_32s_1k(&t, &mut sa, 3, 4, &buckets);
17063
17064        assert_eq!(sa, vec![1, 2, 0, 3, 4]);
17065    }
17066
17067    #[test]
17068    fn libsais64_accumulate_counts_helpers_match_prefix_bucket_addition() {
17069        let mut bucket00 = vec![4, 5, 6];
17070        let bucket01 = vec![1, 2, 3];
17071        let bucket02 = vec![7, 8, 9];
17072        let bucket03 = vec![10, 11, 12];
17073        let bucket04 = vec![13, 14, 15];
17074        let bucket05 = vec![16, 17, 18];
17075        let bucket06 = vec![19, 20, 21];
17076        let bucket07 = vec![22, 23, 24];
17077        let bucket08 = vec![25, 26, 27];
17078
17079        accumulate_counts_s32_2(&mut bucket00, &bucket01);
17080        assert_eq!(bucket00, vec![5, 7, 9]);
17081
17082        accumulate_counts_s32_3(&mut bucket00, &bucket01, &bucket02);
17083        assert_eq!(bucket00, vec![13, 17, 21]);
17084
17085        accumulate_counts_s32_4(&mut bucket00, &bucket01, &bucket02, &bucket03);
17086        assert_eq!(bucket00, vec![31, 38, 45]);
17087
17088        accumulate_counts_s32_5(&mut bucket00, &bucket01, &bucket02, &bucket03, &bucket04);
17089        assert_eq!(bucket00, vec![62, 73, 84]);
17090
17091        accumulate_counts_s32_6(
17092            &mut bucket00,
17093            &bucket01,
17094            &bucket02,
17095            &bucket03,
17096            &bucket04,
17097            &bucket05,
17098        );
17099        assert_eq!(bucket00, vec![109, 125, 141]);
17100
17101        accumulate_counts_s32_7(
17102            &mut bucket00,
17103            &bucket01,
17104            &bucket02,
17105            &bucket03,
17106            &bucket04,
17107            &bucket05,
17108            &bucket06,
17109        );
17110        assert_eq!(bucket00, vec![175, 197, 219]);
17111
17112        accumulate_counts_s32_8(
17113            &mut bucket00,
17114            &bucket01,
17115            &bucket02,
17116            &bucket03,
17117            &bucket04,
17118            &bucket05,
17119            &bucket06,
17120            &bucket07,
17121        );
17122        assert_eq!(bucket00, vec![263, 292, 321]);
17123
17124        accumulate_counts_s32_9(
17125            &mut bucket00,
17126            &bucket01,
17127            &bucket02,
17128            &bucket03,
17129            &bucket04,
17130            &bucket05,
17131            &bucket06,
17132            &bucket07,
17133            &bucket08,
17134        );
17135        assert_eq!(bucket00, vec![376, 413, 450]);
17136    }
17137
17138    #[test]
17139    fn libsais64_accumulate_counts_s32_matches_dispatch_for_small_bucket_counts() {
17140        let mut buckets = vec![1, 2, 3, 4, 5, 6, 7, 8];
17141        accumulate_counts_s32(&mut buckets, 2, 2, 4);
17142        assert_eq!(buckets, vec![1, 2, 3, 4, 5, 6, 16, 20]);
17143    }
17144
17145    #[test]
17146    fn libsais64_accumulate_counts_s32_matches_dispatch_for_nine_buckets() {
17147        let mut buckets = vec![
17148            1, 10, 2, 20, 3, 30, 4, 40, 5, 50, 6, 60, 7, 70, 8, 80, 9, 90,
17149        ];
17150        accumulate_counts_s32(&mut buckets, 2, 2, 9);
17151        assert_eq!(
17152            buckets,
17153            vec![1, 10, 2, 20, 3, 30, 4, 40, 5, 50, 6, 60, 7, 70, 8, 80, 45, 450]
17154        );
17155    }
17156
17157    #[test]
17158    fn libsais64_accumulate_counts_s32_matches_chunked_nine_then_tail_behavior() {
17159        let mut buckets = (1..=11).collect::<Vec<SaSint>>();
17160        accumulate_counts_s32(&mut buckets, 1, 1, 11);
17161        assert_eq!(buckets, vec![1, 2, 3, 4, 5, 6, 7, 8, 45, 10, 66]);
17162    }
17163
17164    #[test]
17165    fn libsais64_final_sorting_scan_left_to_right_32s_block_omp_places_cached_suffixes() {
17166        let t = vec![0, 1, 2, 1, 0];
17167        let mut sa = vec![1, 2, 0, 0];
17168        let mut induction_bucket = vec![0, 1, 3];
17169        let mut cache = vec![ThreadCache::default(); LIBSAIS_PER_THREAD_CACHE_SIZE];
17170
17171        final_sorting_scan_left_to_right_32s_block_omp(
17172            &t,
17173            &mut sa,
17174            &mut induction_bucket,
17175            &mut cache,
17176            0,
17177            2,
17178            2,
17179        );
17180
17181        assert_eq!(sa[0] & SAINT_MAX, 0);
17182        assert_eq!(sa[1] & SAINT_MAX, 1);
17183        assert_eq!(induction_bucket[0], 1);
17184        assert_eq!(induction_bucket[1], 2);
17185    }
17186
17187    #[test]
17188    fn libsais64_final_sorting_scan_left_to_right_8u_block_omp_uses_thread_buckets() {
17189        let block_start = 20_000usize;
17190        let block_size = 16_384usize;
17191        let n = block_start + block_size + 8;
17192        let t = vec![1_u8; n];
17193        let suffixes: Vec<SaSint> = (2..2 + block_size).map(|i| i as SaSint).collect();
17194
17195        let mut expected_sa = vec![0; n];
17196        expected_sa[block_start..block_start + block_size].copy_from_slice(&suffixes);
17197        let mut threaded_sa = expected_sa.clone();
17198        let mut expected_bucket = vec![0; ALPHABET_SIZE];
17199        let mut threaded_bucket = expected_bucket.clone();
17200        let mut thread_state = alloc_thread_state(4).unwrap();
17201
17202        final_sorting_scan_left_to_right_8u(
17203            &t,
17204            &mut expected_sa,
17205            &mut expected_bucket,
17206            block_start as FastSint,
17207            block_size as FastSint,
17208        );
17209        final_sorting_scan_left_to_right_8u_block_omp(
17210            &t,
17211            &mut threaded_sa,
17212            ALPHABET_SIZE as SaSint,
17213            &mut threaded_bucket,
17214            block_start as FastSint,
17215            block_size as FastSint,
17216            4,
17217            &mut thread_state,
17218        );
17219
17220        assert_eq!(threaded_sa, expected_sa);
17221        assert_eq!(threaded_bucket, expected_bucket);
17222    }
17223
17224    #[test]
17225    fn libsais64_final_sorting_scan_right_to_left_32s_block_omp_runs_block_pipeline() {
17226        let t = vec![0, 1, 2, 1, 0];
17227        let mut sa = vec![0, 2, 0, 0];
17228        let mut induction_bucket = vec![1, 2, 3];
17229        let mut expected_sa = sa.clone();
17230        let mut expected_bucket = induction_bucket.clone();
17231        let mut cache = vec![ThreadCache::default(); LIBSAIS_PER_THREAD_CACHE_SIZE];
17232
17233        final_sorting_scan_right_to_left_32s(&t, &mut expected_sa, &mut expected_bucket, 0, 2);
17234        final_sorting_scan_right_to_left_32s_block_omp(
17235            &t,
17236            &mut sa,
17237            &mut induction_bucket,
17238            &mut cache,
17239            0,
17240            2,
17241            2,
17242        );
17243
17244        assert_eq!(sa, expected_sa);
17245        assert_eq!(induction_bucket, expected_bucket);
17246    }
17247
17248    #[test]
17249    fn libsais64_bwt_copy_8u_copies_low_bytes_from_suffix_array_storage() {
17250        let a = vec![65, 255, 256, -1];
17251        let mut u = vec![0_u8; 4];
17252
17253        bwt_copy_8u(&mut u, &a, 4);
17254
17255        assert_eq!(u, vec![65, 255, 0, 255]);
17256    }
17257
17258    #[test]
17259    fn libsais64_bwt_copy_8u_omp_matches_sequential_copy() {
17260        let a = vec![1, 2, 3, 4, 5];
17261        let mut u = vec![0_u8; 5];
17262
17263        bwt_copy_8u_omp(&mut u, &a, 5, 4);
17264
17265        assert_eq!(u, vec![1, 2, 3, 4, 5]);
17266    }
17267
17268    #[test]
17269    fn libsais64_conversion_helpers_use_little_endian_word_layout() {
17270        let s = vec![11_u32, 22, 33, 44];
17271        let mut d = vec![0_u64; 4];
17272        convert_32u_to_64u(&s, &mut d, 1, 2);
17273        assert_eq!(d, vec![0, 22, 33, 0]);
17274
17275        let mut words = vec![5_u32, 6, 7, 8, 0, 0, 0, 0];
17276        convert_inplace_32u_to_64u(&mut words, 0, 4);
17277        assert_eq!(words, vec![5, 0, 6, 0, 7, 0, 8, 0]);
17278        convert_inplace_64u_to_32u(&mut words, 0, 4);
17279        assert_eq!(&words[..4], &[5, 6, 7, 8]);
17280
17281        let mut words = vec![9_u32, 10, 11, 12, 0, 0, 0, 0];
17282        convert_inplace_32u_to_64u_omp(&mut words, 4, 2);
17283        assert_eq!(words, vec![9, 0, 10, 0, 11, 0, 12, 0]);
17284    }
17285
17286    #[test]
17287    fn libsais64_32bit_workspace_sizing_matches_upstream_capacity_rules() {
17288        assert_eq!(libsais64_new_32bit_fs(10, 4), Some(18));
17289        assert_eq!(libsais64_new_32bit_fs(i32::MAX as usize - 4, 100), Some(4));
17290        assert_eq!(libsais64_new_32bit_fs(i32::MAX as usize + 1, 0), None);
17291    }
17292
17293    #[test]
17294    fn libsais64_32bit_suffix_adapter_widens_suffix_array_and_frequency() {
17295        let text = b"banana";
17296        let fs = 4;
17297        let new_fs = libsais64_new_32bit_fs(text.len(), fs).expect("small workspace");
17298        let mut sa64 = vec![-1; text.len() + fs as usize];
17299        let mut freq64 = vec![-1; ALPHABET_SIZE];
17300        let rc64 = libsais64_run_32bit_omp(text, &mut sa64, fs, Some(&mut freq64), 2, false)
17301            .expect("small input uses 32-bit adapter");
17302
17303        let mut sa32 = vec![-1; text.len() + new_fs as usize];
17304        let mut freq32 = vec![-1; ALPHABET_SIZE];
17305        let rc32 = crate::libsais_omp(text, &mut sa32, new_fs, Some(&mut freq32), 2);
17306
17307        assert_eq!(rc64, SaSint::from(rc32));
17308        assert_eq!(
17309            &sa64[..text.len()],
17310            &sa32[..text.len()]
17311                .iter()
17312                .map(|&value| SaSint::from(value as u32))
17313                .collect::<Vec<_>>()
17314        );
17315        assert_eq!(freq64[b'a' as usize], 3);
17316        assert_eq!(freq64[b'b' as usize], 1);
17317        assert_eq!(freq64[b'n' as usize], 2);
17318        assert_eq!(
17319            freq64[..ALPHABET_SIZE],
17320            freq32
17321                .iter()
17322                .map(|&value| SaSint::from(value))
17323                .collect::<Vec<_>>()
17324        );
17325    }
17326
17327    #[test]
17328    fn libsais64_32bit_gsa_adapter_widens_suffix_array_and_frequency() {
17329        let text = b"ban\0ana\0";
17330        let fs = 2;
17331        let mut sa64 = vec![-1; text.len() + fs as usize];
17332        let mut freq64 = vec![-1; ALPHABET_SIZE];
17333
17334        let rc = libsais64_run_32bit_omp(text, &mut sa64, fs, Some(&mut freq64), 2, true)
17335            .expect("small GSA input uses 32-bit adapter");
17336
17337        let mut direct_sa = vec![0; text.len()];
17338        let mut direct_freq = vec![0; ALPHABET_SIZE];
17339        assert_eq!(
17340            crate::libsais_gsa(text, &mut direct_sa, 0, Some(&mut direct_freq)),
17341            0
17342        );
17343        assert_eq!(rc, 0);
17344        assert_eq!(
17345            sa64[..text.len()],
17346            direct_sa
17347                .iter()
17348                .map(|&value| SaSint::from(value as u32))
17349                .collect::<Vec<_>>()
17350        );
17351        assert_eq!(
17352            freq64[..ALPHABET_SIZE],
17353            direct_freq
17354                .iter()
17355                .map(|&value| SaSint::from(value))
17356                .collect::<Vec<_>>()
17357        );
17358    }
17359
17360    #[test]
17361    fn libsais64_32bit_bwt_adapters_widen_frequency_and_aux_samples() {
17362        let text = b"mississippi";
17363        let fs = 6;
17364        let r = 4;
17365
17366        let mut bwt64 = vec![0; text.len()];
17367        let mut freq64 = vec![-1; ALPHABET_SIZE];
17368        let primary64 = libsais64_bwt_run_32bit_omp(text, &mut bwt64, fs, Some(&mut freq64), 2)
17369            .expect("small input uses 32-bit BWT adapter");
17370
17371        let mut bwt32 = vec![0; text.len()];
17372        let mut work32 = vec![0; text.len() + fs as usize * 2 + text.len()];
17373        let mut freq32 = vec![-1; ALPHABET_SIZE];
17374        let primary32 =
17375            crate::libsais_bwt_omp(text, &mut bwt32, &mut work32, 23, Some(&mut freq32), 2);
17376
17377        assert_eq!(primary64, SaSint::from(primary32));
17378        assert_eq!(bwt64, bwt32);
17379        assert_eq!(
17380            freq64[..ALPHABET_SIZE],
17381            freq32
17382                .iter()
17383                .map(|&value| SaSint::from(value))
17384                .collect::<Vec<_>>()
17385        );
17386
17387        let mut aux_bwt64 = vec![0; text.len()];
17388        let mut aux64 = vec![-1; (text.len() - 1) / r as usize + 1];
17389        let mut aux_freq64 = vec![-1; ALPHABET_SIZE];
17390        let rc64 = libsais64_bwt_aux_run_32bit_omp(
17391            text,
17392            &mut aux_bwt64,
17393            fs,
17394            Some(&mut aux_freq64),
17395            r,
17396            &mut aux64,
17397            2,
17398        )
17399        .expect("small input uses 32-bit aux BWT adapter");
17400
17401        let mut aux_bwt32 = vec![0; text.len()];
17402        let mut aux_work32 = vec![0; text.len() + fs as usize * 2 + text.len()];
17403        let mut aux32 = vec![-1; aux64.len()];
17404        let mut aux_freq32 = vec![-1; ALPHABET_SIZE];
17405        let rc32 = crate::libsais_bwt_aux_omp(
17406            text,
17407            &mut aux_bwt32,
17408            &mut aux_work32,
17409            23,
17410            Some(&mut aux_freq32),
17411            r as i32,
17412            &mut aux32,
17413            2,
17414        );
17415
17416        assert_eq!(rc64, SaSint::from(rc32));
17417        assert_eq!(aux_bwt64, aux_bwt32);
17418        assert_eq!(
17419            aux64,
17420            aux32
17421                .iter()
17422                .map(|&value| SaSint::from(value))
17423                .collect::<Vec<_>>()
17424        );
17425        assert_eq!(
17426            aux_freq64[..ALPHABET_SIZE],
17427            aux_freq32
17428                .iter()
17429                .map(|&value| SaSint::from(value))
17430                .collect::<Vec<_>>()
17431        );
17432    }
17433
17434    #[test]
17435    fn libsais64_bwt_copy_8u_omp_uses_block_partition_for_large_inputs() {
17436        let n = 65_600usize;
17437        let a: Vec<SaSint> = (0..n).map(|i| (i * 17) as SaSint).collect();
17438        let mut threaded = vec![0; n];
17439        let mut sequential = vec![0; n];
17440
17441        bwt_copy_8u_omp(&mut threaded, &a, n as SaSint, 4);
17442        bwt_copy_8u(&mut sequential, &a, n as SaSint);
17443
17444        assert_eq!(threaded, sequential);
17445    }
17446
17447    #[test]
17448    fn libsais64_flip_suffix_markers_omp_uses_block_partition_for_large_inputs() {
17449        let n = 65_600usize;
17450        let mut single: Vec<SaSint> = (0..n).map(|i| (i as SaSint) ^ SAINT_MIN).collect();
17451        let mut threaded = single.clone();
17452
17453        flip_suffix_markers_omp(&mut single, n as SaSint, 1);
17454        flip_suffix_markers_omp(&mut threaded, n as SaSint, 4);
17455
17456        assert_eq!(threaded, single);
17457    }
17458
17459    #[test]
17460    fn libsais64_renumber_lms_suffixes_8u_writes_names_into_second_half() {
17461        let mut sa = vec![1 | SAINT_MIN, 3, 0, 0];
17462
17463        let name = renumber_lms_suffixes_8u(&mut sa, 2, 0, 0, 2);
17464
17465        assert_eq!(name, 1);
17466        assert_eq!(sa[2], SAINT_MIN);
17467        assert_eq!(sa[3], SAINT_MIN | 1);
17468    }
17469
17470    #[test]
17471    fn libsais64_renumber_lms_suffixes_8u_matches_upstream_c_helper() {
17472        let mut sa_rust = vec![1 | SAINT_MIN, 3, 0, 0];
17473        let mut sa_c = sa_rust.clone();
17474
17475        let rust_name = renumber_lms_suffixes_8u(&mut sa_rust, 2, 0, 0, 2);
17476        let c_name =
17477            unsafe { probe_libsais64_renumber_lms_suffixes_8u(sa_c.as_mut_ptr(), 2, 0, 0, 2) };
17478
17479        assert_eq!(rust_name, c_name);
17480        assert_eq!(sa_rust, sa_c);
17481    }
17482
17483    #[test]
17484    fn libsais64_gather_marked_lms_suffixes_moves_negative_marked_entries_to_tail() {
17485        let mut sa = vec![0, 0, 1 | SAINT_MIN, 3];
17486
17487        let l = gather_marked_lms_suffixes(&mut sa, 2, 4, 0, 2);
17488
17489        assert_eq!(l, 3);
17490        assert_eq!(sa[3], 1);
17491    }
17492
17493    #[test]
17494    fn libsais64_gather_marked_lms_suffixes_matches_upstream_c_helper() {
17495        let mut sa_rust = vec![0, 0, 1 | SAINT_MIN, 3];
17496        let mut sa_c = sa_rust.clone();
17497
17498        let rust_l = gather_marked_lms_suffixes(&mut sa_rust, 2, 4, 0, 2);
17499        let c_l =
17500            unsafe { probe_libsais64_gather_marked_lms_suffixes(sa_c.as_mut_ptr(), 2, 4, 0, 2) };
17501
17502        assert_eq!(rust_l, c_l);
17503        assert_eq!(sa_rust, sa_c);
17504    }
17505
17506    #[test]
17507    fn libsais64_renumber_lms_suffixes_8u_omp_wraps_sequential_version() {
17508        let mut sa = vec![1 | SAINT_MIN, 3, 0, 0];
17509        let mut thread_state = alloc_thread_state(2).unwrap();
17510
17511        let name = renumber_lms_suffixes_8u_omp(&mut sa, 2, 2, &mut thread_state);
17512
17513        assert_eq!(name, 1);
17514        assert_eq!(sa[2], SAINT_MIN);
17515    }
17516
17517    #[test]
17518    fn libsais64_renumber_lms_suffixes_8u_omp_uses_block_partition_for_large_inputs() {
17519        let m = 65_600usize;
17520        let mut input = vec![0; 2 * m];
17521        for (i, slot) in input[..m].iter_mut().enumerate() {
17522            let suffix = (2 * i + 1) as SaSint;
17523            *slot = if i % 5 == 0 {
17524                suffix | SAINT_MIN
17525            } else {
17526                suffix
17527            };
17528        }
17529
17530        let mut single = input.clone();
17531        let mut threaded = input;
17532        let mut thread_state = alloc_thread_state(4).unwrap();
17533        let single_name = renumber_lms_suffixes_8u(&mut single, m as SaSint, 0, 0, m as FastSint);
17534        let threaded_name =
17535            renumber_lms_suffixes_8u_omp(&mut threaded, m as SaSint, 4, &mut thread_state);
17536
17537        assert_eq!(threaded_name, single_name);
17538        assert_eq!(threaded, single);
17539    }
17540
17541    #[test]
17542    fn libsais64_gather_marked_lms_suffixes_omp_uses_block_partition_for_large_inputs() {
17543        let n = 131_200usize;
17544        let half_n = n >> 1;
17545        let mut input = vec![-77; n];
17546        for (i, slot) in input[..half_n].iter_mut().enumerate() {
17547            let suffix = (3 * i + 1) as SaSint;
17548            *slot = if i % 7 == 0 {
17549                suffix | SAINT_MIN
17550            } else {
17551                suffix
17552            };
17553        }
17554        let marked_count = input[..half_n].iter().filter(|&&value| value < 0).count();
17555
17556        let mut single = input.clone();
17557        let mut threaded = input;
17558        let mut thread_state = alloc_thread_state(4).unwrap();
17559        let _ = gather_marked_lms_suffixes(&mut single, 0, n as FastSint, 0, half_n as FastSint);
17560        gather_marked_lms_suffixes_omp(&mut threaded, n as SaSint, 0, 0, 4, &mut thread_state);
17561
17562        assert_eq!(&threaded[n - marked_count..], &single[n - marked_count..]);
17563    }
17564
17565    #[test]
17566    fn libsais64_renumber_and_gather_lms_suffixes_omp_uses_large_input_paths() {
17567        let m = 65_600usize;
17568        let n = 2 * m;
17569        let mut input = vec![0; n];
17570        for (i, slot) in input[..m].iter_mut().enumerate() {
17571            let suffix = (2 * i + 1) as SaSint;
17572            *slot = if i % 5 == 0 {
17573                suffix | SAINT_MIN
17574            } else {
17575                suffix
17576            };
17577        }
17578
17579        let mut single = input.clone();
17580        let mut threaded = input;
17581        let mut single_state = alloc_thread_state(1).unwrap();
17582        let mut threaded_state = alloc_thread_state(4).unwrap();
17583        let single_name = renumber_and_gather_lms_suffixes_omp(
17584            &mut single,
17585            n as SaSint,
17586            m as SaSint,
17587            0,
17588            1,
17589            &mut single_state,
17590        );
17591        let threaded_name = renumber_and_gather_lms_suffixes_omp(
17592            &mut threaded,
17593            n as SaSint,
17594            m as SaSint,
17595            0,
17596            4,
17597            &mut threaded_state,
17598        );
17599
17600        assert_eq!(threaded_name, single_name);
17601        assert_eq!(threaded, single);
17602    }
17603
17604    #[test]
17605    fn libsais64_renumber_and_gather_lms_suffixes_omp_gathers_when_names_are_not_distinct() {
17606        let mut sa = vec![1 | SAINT_MIN, 3, 0, 0];
17607        let mut thread_state = alloc_thread_state(2).unwrap();
17608
17609        let name = renumber_and_gather_lms_suffixes_omp(&mut sa, 4, 2, 0, 2, &mut thread_state);
17610
17611        assert_eq!(name, 1);
17612        assert_eq!(sa[3], 1);
17613    }
17614
17615    #[test]
17616    fn libsais64_renumber_and_gather_lms_suffixes_omp_matches_upstream_c_helper() {
17617        let mut sa_rust = vec![1 | SAINT_MIN, 3, 0, 0];
17618        let mut sa_c = sa_rust.clone();
17619        let mut thread_state = alloc_thread_state(2).unwrap();
17620
17621        let rust_name =
17622            renumber_and_gather_lms_suffixes_omp(&mut sa_rust, 4, 2, 0, 2, &mut thread_state);
17623        let c_name = unsafe {
17624            probe_libsais64_renumber_and_gather_lms_suffixes_omp(sa_c.as_mut_ptr(), 4, 2, 0, 2)
17625        };
17626
17627        assert_eq!(rust_name, c_name);
17628        assert_eq!(sa_rust, sa_c);
17629    }
17630
17631    #[test]
17632    fn libsais64_renumber_distinct_lms_suffixes_32s_4k_masks_sources_and_writes_second_half() {
17633        let mut sa = vec![1 | SAINT_MIN, 3 | SAINT_MIN, 0, 0];
17634
17635        let name = renumber_distinct_lms_suffixes_32s_4k(&mut sa, 2, 1, 0, 2);
17636
17637        assert_eq!(name, 3);
17638        assert_eq!(sa[0], 1);
17639        assert_eq!(sa[1], 3);
17640        assert_eq!(sa[2], 1);
17641        assert_eq!(sa[3], 2 | SAINT_MIN);
17642    }
17643
17644    #[test]
17645    fn libsais64_renumber_distinct_lms_suffixes_32s_4k_matches_upstream_c_helper() {
17646        let mut sa_rust = vec![1 | SAINT_MIN, 3 | SAINT_MIN, 0, 0];
17647        let mut sa_c = sa_rust.clone();
17648
17649        let rust_name = renumber_distinct_lms_suffixes_32s_4k(&mut sa_rust, 2, 1, 0, 2);
17650        let c_name = unsafe {
17651            probe_libsais64_renumber_distinct_lms_suffixes_32s_4k(sa_c.as_mut_ptr(), 2, 1, 0, 2)
17652        };
17653
17654        assert_eq!(rust_name, c_name);
17655        assert_eq!(sa_rust, sa_c);
17656    }
17657
17658    #[test]
17659    fn libsais64_mark_distinct_lms_suffixes_32s_propagates_previous_nonzero_marker() {
17660        let mut sa = vec![0, 0, SAINT_MIN | 5, 0, SAINT_MIN | 7];
17661
17662        mark_distinct_lms_suffixes_32s(&mut sa, 2, 0, 3);
17663
17664        assert_eq!(sa[2], 5);
17665        assert_eq!(sa[3], 0);
17666        assert_eq!(sa[4], SAINT_MIN | 7);
17667    }
17668
17669    #[test]
17670    fn libsais64_clamp_lms_suffixes_length_32s_keeps_only_negative_lengths() {
17671        let mut sa = vec![0, 0, SAINT_MIN | 5, 7, SAINT_MIN | 3];
17672
17673        clamp_lms_suffixes_length_32s(&mut sa, 2, 0, 3);
17674
17675        assert_eq!(sa[2], 5);
17676        assert_eq!(sa[3], 0);
17677        assert_eq!(sa[4], 3);
17678    }
17679
17680    #[test]
17681    fn libsais64_renumber_and_mark_distinct_lms_suffixes_32s_4k_omp_marks_second_half_when_names_repeat(
17682    ) {
17683        let mut sa = vec![1 | SAINT_MIN, 3 | SAINT_MIN, 0, 0];
17684        let mut thread_state = alloc_thread_state(2).unwrap();
17685
17686        let name =
17687            renumber_and_mark_distinct_lms_suffixes_32s_4k_omp(&mut sa, 4, 2, 2, &mut thread_state);
17688
17689        assert_eq!(name, 2);
17690        assert_eq!(sa[2], 1);
17691        assert_eq!(sa[3], SAINT_MIN | 2);
17692    }
17693
17694    #[test]
17695    fn libsais64_renumber_and_mark_distinct_lms_suffixes_32s_4k_omp_matches_upstream_c_helper() {
17696        let mut sa_rust = vec![1 | SAINT_MIN, 3 | SAINT_MIN, 0, 0];
17697        let mut sa_c = sa_rust.clone();
17698        let mut thread_state = alloc_thread_state(2).unwrap();
17699
17700        let rust_name = renumber_and_mark_distinct_lms_suffixes_32s_4k_omp(
17701            &mut sa_rust,
17702            4,
17703            2,
17704            2,
17705            &mut thread_state,
17706        );
17707        let c_name = unsafe {
17708            probe_libsais64_renumber_and_mark_distinct_lms_suffixes_32s_4k_omp(
17709                sa_c.as_mut_ptr(),
17710                4,
17711                2,
17712                2,
17713            )
17714        };
17715
17716        assert_eq!(rust_name, c_name);
17717        assert_eq!(sa_rust, sa_c);
17718    }
17719
17720    #[test]
17721    fn libsais64_renumber_and_mark_distinct_lms_suffixes_32s_1k_omp_handles_single_lms_suffix() {
17722        let t = vec![2, 1, 0];
17723        let mut sa = vec![0; t.len()];
17724
17725        let name = renumber_and_mark_distinct_lms_suffixes_32s_1k_omp(&t, &mut sa, 3, 1, 1);
17726
17727        assert_eq!(name, 1);
17728        assert_eq!(sa[1], SAINT_MIN | 1);
17729    }
17730
17731    #[test]
17732    fn libsais64_main_32s_entry_matches_public_c_long_on_6k_branch() {
17733        assert_libsais64_main_32s_entry_matches_public_c_long_for_branch(300);
17734    }
17735
17736    #[test]
17737    fn libsais64_main_32s_entry_matches_public_c_long_on_4k_branch() {
17738        assert_libsais64_main_32s_entry_matches_public_c_long_for_branch(400);
17739    }
17740
17741    #[test]
17742    fn libsais64_main_32s_entry_matches_public_c_long_on_2k_branch() {
17743        assert_libsais64_main_32s_entry_matches_public_c_long_for_branch(700);
17744    }
17745
17746    #[test]
17747    fn libsais64_main_32s_entry_matches_public_c_long_on_1k_branch() {
17748        assert_libsais64_main_32s_entry_matches_public_c_long_for_branch(1501);
17749    }
17750
17751    #[test]
17752    fn libsais64_main_32s_entry_matches_public_c_long_on_recursive_repetitive_6k_case() {
17753        assert_libsais64_main_32s_entry_matches_public_c_long(
17754            make_libsais64_recursive_main_32s_text(24),
17755            300,
17756            0,
17757            true,
17758        );
17759    }
17760
17761    #[test]
17762    fn libsais64_main_32s_entry_matches_public_c_long_on_recursive_repetitive_1k_case() {
17763        assert_libsais64_main_32s_entry_matches_public_c_long(
17764            make_libsais64_recursive_main_32s_text(24),
17765            1501,
17766            0,
17767            true,
17768        );
17769    }
17770
17771    #[test]
17772    fn libsais64_main_32s_entry_matches_public_c_long_on_large_generated_6k_case() {
17773        assert_libsais64_main_32s_entry_matches_public_c_long(
17774            make_libsais64_large_main_32s_stress_text(1024, 300),
17775            300,
17776            0,
17777            true,
17778        );
17779    }
17780
17781    #[test]
17782    fn libsais64_main_32s_entry_matches_public_c_long_on_large_generated_6k_case_with_fs() {
17783        assert_libsais64_main_32s_entry_matches_public_c_long(
17784            make_libsais64_large_main_32s_stress_text(1024, 300),
17785            300,
17786            2048,
17787            false,
17788        );
17789    }
17790
17791    #[test]
17792    fn libsais64_main_32s_entry_matches_public_c_long_on_large_generated_4k_case() {
17793        assert_libsais64_main_32s_entry_matches_public_c_long(
17794            make_libsais64_large_main_32s_stress_text(1024, 400),
17795            400,
17796            0,
17797            true,
17798        );
17799    }
17800
17801    #[test]
17802    fn libsais64_main_32s_entry_matches_public_c_long_on_large_generated_4k_case_with_fs() {
17803        assert_libsais64_main_32s_entry_matches_public_c_long(
17804            make_libsais64_large_main_32s_stress_text(1024, 400),
17805            400,
17806            2048,
17807            false,
17808        );
17809    }
17810
17811    #[test]
17812    fn libsais64_main_32s_entry_matches_public_c_long_on_large_generated_2k_case() {
17813        assert_libsais64_main_32s_entry_matches_public_c_long(
17814            make_libsais64_large_main_32s_stress_text(1024, 700),
17815            700,
17816            0,
17817            true,
17818        );
17819    }
17820
17821    #[test]
17822    fn libsais64_main_32s_entry_matches_public_c_long_on_large_generated_2k_case_with_fs() {
17823        assert_libsais64_main_32s_entry_matches_public_c_long(
17824            make_libsais64_large_main_32s_stress_text(1024, 700),
17825            700,
17826            2048,
17827            false,
17828        );
17829    }
17830
17831    #[test]
17832    fn libsais64_main_32s_entry_matches_public_c_long_on_large_generated_1k_case_with_fs() {
17833        assert_libsais64_main_32s_entry_matches_public_c_long(
17834            make_libsais64_large_main_32s_stress_text(1024, 1501),
17835            1501,
17836            2048,
17837            false,
17838        );
17839    }
17840
17841    #[test]
17842    fn libsais64_reconstruct_lms_suffixes_maps_indices_from_tail_interval() {
17843        let mut sa = vec![0, 1, 2, 7, 11, 13];
17844
17845        reconstruct_lms_suffixes(&mut sa, 6, 3, 0, 3);
17846
17847        assert_eq!(&sa[..3], &[7, 11, 13]);
17848    }
17849
17850    #[test]
17851    fn libsais64_reconstruct_lms_suffixes_omp_wraps_sequential_version() {
17852        let mut sa = vec![0, 1, 2, 7, 11, 13];
17853
17854        reconstruct_lms_suffixes_omp(&mut sa, 6, 3, 2);
17855
17856        assert_eq!(&sa[..3], &[7, 11, 13]);
17857    }
17858
17859    #[test]
17860    fn libsais64_lms_late_omp_wrappers_use_block_partitions_for_large_inputs() {
17861        let m = 65_600usize;
17862        let n = 2 * m;
17863        let mut input = vec![0; n];
17864        for (i, slot) in input[..m].iter_mut().enumerate() {
17865            let suffix = (2 * i + 1) as SaSint;
17866            *slot = if i % 5 == 0 {
17867                suffix | SAINT_MIN
17868            } else {
17869                suffix
17870            };
17871        }
17872
17873        let mut single = input.clone();
17874        let mut threaded = input.clone();
17875        let mut thread_state = alloc_thread_state(4).unwrap();
17876        let single_name = renumber_lms_suffixes_8u(&mut single, m as SaSint, 0, 0, m as FastSint);
17877        let threaded_name =
17878            renumber_lms_suffixes_8u_omp(&mut threaded, m as SaSint, 4, &mut thread_state);
17879        assert_eq!(threaded_name, single_name);
17880        assert_eq!(threaded, single);
17881
17882        let mut single = input.clone();
17883        let mut threaded = input.clone();
17884        let mut single_state = alloc_thread_state(1).unwrap();
17885        let mut threaded_state = alloc_thread_state(4).unwrap();
17886        let single_name = renumber_and_gather_lms_suffixes_omp(
17887            &mut single,
17888            n as SaSint,
17889            m as SaSint,
17890            0,
17891            1,
17892            &mut single_state,
17893        );
17894        let threaded_name = renumber_and_gather_lms_suffixes_omp(
17895            &mut threaded,
17896            n as SaSint,
17897            m as SaSint,
17898            0,
17899            4,
17900            &mut threaded_state,
17901        );
17902        assert_eq!(threaded_name, single_name);
17903        assert_eq!(threaded, single);
17904
17905        let mut single = input.clone();
17906        let mut threaded = input;
17907        let marked_count = single[..m].iter().filter(|&&value| value < 0).count();
17908        let _ = gather_marked_lms_suffixes(&mut single, 0, n as FastSint, 0, m as FastSint);
17909        gather_marked_lms_suffixes_omp(&mut threaded, n as SaSint, 0, 0, 4, &mut thread_state);
17910        assert_eq!(&threaded[n - marked_count..], &single[n - marked_count..]);
17911    }
17912
17913    #[test]
17914    fn libsais64_reconstruct_lms_suffixes_omp_uses_block_partition_for_large_inputs() {
17915        let m = 65_600usize;
17916        let n = 2 * m;
17917        let mut input = vec![0; n];
17918        for (i, slot) in input[..m].iter_mut().enumerate() {
17919            *slot = (m - 1 - i) as SaSint;
17920        }
17921        for (i, slot) in input[m..].iter_mut().enumerate() {
17922            *slot = (i * 17 + 3) as SaSint;
17923        }
17924
17925        let mut single = input.clone();
17926        let mut threaded = input;
17927        reconstruct_lms_suffixes(&mut single, n as SaSint, m as SaSint, 0, m as FastSint);
17928        reconstruct_lms_suffixes_omp(&mut threaded, n as SaSint, m as SaSint, 4);
17929
17930        assert_eq!(threaded, single);
17931    }
17932
17933    #[test]
17934    fn libsais64_renumber_unique_and_nonunique_lms_suffixes_32s_marks_new_unique_names() {
17935        let mut t = vec![0, 0, 0, 0];
17936        let mut sa = vec![0, 2, -1, 5];
17937
17938        let f = renumber_unique_and_nonunique_lms_suffixes_32s(&mut t, &mut sa, 2, 0, 0, 2);
17939
17940        assert_eq!(f, 1);
17941        assert_eq!(t[0], SAINT_MIN);
17942        assert_eq!(sa[2], SAINT_MIN);
17943        assert_eq!(sa[3], 4);
17944    }
17945
17946    #[test]
17947    fn libsais64_renumber_unique_and_nonunique_lms_suffixes_32s_matches_upstream_c_helper() {
17948        let mut t_rust = vec![0, 0, 0, 0];
17949        let mut sa_rust = vec![0, 2, -1, 5];
17950        let mut t_c = t_rust.clone();
17951        let mut sa_c = sa_rust.clone();
17952
17953        let rust_f =
17954            renumber_unique_and_nonunique_lms_suffixes_32s(&mut t_rust, &mut sa_rust, 2, 0, 0, 2);
17955        let c_f = unsafe {
17956            probe_libsais64_renumber_unique_and_nonunique_lms_suffixes_32s(
17957                t_c.as_mut_ptr(),
17958                sa_c.as_mut_ptr(),
17959                2,
17960                0,
17961                0,
17962                2,
17963            )
17964        };
17965
17966        assert_eq!(rust_f, c_f);
17967        assert_eq!(t_rust, t_c);
17968        assert_eq!(sa_rust, sa_c);
17969    }
17970
17971    #[test]
17972    fn libsais64_renumber_unique_and_nonunique_lms_suffixes_32s_omp_matches_upstream_c_helper() {
17973        let mut t_rust = vec![0, 0, 0, 0];
17974        let mut sa_rust = vec![0, 2, -1, 5];
17975        let mut t_c = t_rust.clone();
17976        let mut sa_c = sa_rust.clone();
17977        let mut thread_state = alloc_thread_state(1).unwrap();
17978
17979        let rust_f = renumber_unique_and_nonunique_lms_suffixes_32s_omp(
17980            &mut t_rust,
17981            &mut sa_rust,
17982            2,
17983            1,
17984            &mut thread_state,
17985        );
17986        let c_f = unsafe {
17987            probe_libsais64_renumber_unique_and_nonunique_lms_suffixes_32s_omp(
17988                t_c.as_mut_ptr(),
17989                sa_c.as_mut_ptr(),
17990                2,
17991                1,
17992            )
17993        };
17994
17995        assert_eq!(rust_f, c_f);
17996        assert_eq!(t_rust, t_c);
17997        assert_eq!(sa_rust, sa_c);
17998    }
17999
18000    #[test]
18001    fn libsais64_renumber_unique_and_nonunique_lms_suffixes_32s_omp_uses_block_partition() {
18002        let m = 65_600usize;
18003        let n = 2 * m;
18004        let t = vec![0; n];
18005        let mut sa = vec![0; n];
18006        for i in 0..m {
18007            sa[i] = (2 * i) as SaSint;
18008            sa[m + i] = if i % 5 == 0 {
18009                -((i as SaSint) + 1)
18010            } else {
18011                i as SaSint + 7
18012            };
18013        }
18014
18015        let mut single_t = t.clone();
18016        let mut single_sa = sa.clone();
18017        let mut threaded_t = t;
18018        let mut threaded_sa = sa;
18019        let mut thread_state = alloc_thread_state(4).unwrap();
18020        let single_f = renumber_unique_and_nonunique_lms_suffixes_32s(
18021            &mut single_t,
18022            &mut single_sa,
18023            m as SaSint,
18024            0,
18025            0,
18026            m as FastSint,
18027        );
18028        let threaded_f = renumber_unique_and_nonunique_lms_suffixes_32s_omp(
18029            &mut threaded_t,
18030            &mut threaded_sa,
18031            m as SaSint,
18032            4,
18033            &mut thread_state,
18034        );
18035
18036        assert_eq!(threaded_f, single_f);
18037        assert_eq!(threaded_t, single_t);
18038        assert_eq!(threaded_sa, single_sa);
18039    }
18040
18041    #[test]
18042    fn libsais64_compact_unique_and_nonunique_lms_suffixes_32s_splits_unique_and_nonunique_ranges()
18043    {
18044        let mut sa = vec![0, 0, 0, 0, SAINT_MIN, 4];
18045        let mut l = 2;
18046        let mut r = 6;
18047
18048        compact_unique_and_nonunique_lms_suffixes_32s(&mut sa, 2, &mut l, &mut r, 0, 2);
18049
18050        assert_eq!(l, 2);
18051        assert_eq!(r, 6);
18052        assert_eq!(sa[2], 0);
18053        assert_eq!(sa[3] & SAINT_MAX, 0);
18054    }
18055
18056    #[test]
18057    fn libsais64_compact_lms_suffixes_32s_omp_runs_renumber_then_compaction() {
18058        let mut t = vec![0, 0, 0, 0];
18059        let mut sa = vec![0, 2, -1, 5, 77, 88];
18060        let mut thread_state = alloc_thread_state(2).unwrap();
18061
18062        let f = compact_lms_suffixes_32s_omp(&mut t, &mut sa, 4, 2, 2, 2, &mut thread_state);
18063
18064        assert_eq!(f, 1);
18065        assert_eq!(sa[2] & SAINT_MAX, 0);
18066        assert_eq!(sa[5], 3);
18067    }
18068
18069    #[test]
18070    fn libsais64_compact_unique_and_nonunique_lms_suffixes_32s_omp_uses_block_partition() {
18071        let n = 131_200usize;
18072        let m = 65_600usize;
18073        let fs = m + 32;
18074        let half_n = n >> 1;
18075        let f = m / 5;
18076        let mut sa = vec![0; n + fs];
18077        for i in 0..half_n {
18078            sa[m + i] = if i % 5 == 0 {
18079                SAINT_MIN | i as SaSint
18080            } else {
18081                i as SaSint + 1
18082            };
18083        }
18084        for i in 0..f {
18085            sa[m - f + i] = (10_000 + i) as SaSint;
18086        }
18087
18088        let mut single = sa.clone();
18089        let mut threaded = sa;
18090        let mut single_state = alloc_thread_state(1).unwrap();
18091        let mut threaded_state = alloc_thread_state(4).unwrap();
18092        compact_unique_and_nonunique_lms_suffixes_32s_omp(
18093            &mut single,
18094            n as SaSint,
18095            m as SaSint,
18096            fs as SaSint,
18097            f as SaSint,
18098            1,
18099            &mut single_state,
18100        );
18101        compact_unique_and_nonunique_lms_suffixes_32s_omp(
18102            &mut threaded,
18103            n as SaSint,
18104            m as SaSint,
18105            fs as SaSint,
18106            f as SaSint,
18107            4,
18108            &mut threaded_state,
18109        );
18110
18111        let unique_dst = n + fs - m;
18112        assert_eq!(
18113            &threaded[unique_dst..unique_dst + f],
18114            &single[unique_dst..unique_dst + f]
18115        );
18116    }
18117
18118    #[test]
18119    fn libsais64_compact_lms_suffixes_32s_omp_uses_large_input_paths() {
18120        let n = 131_200usize;
18121        let m = 65_600usize;
18122        let fs = m + 32;
18123        let t = vec![0; n];
18124        let mut sa = vec![0; n + fs];
18125        for i in 0..m {
18126            sa[i] = (2 * i) as SaSint;
18127            sa[m + i] = if i % 5 == 0 {
18128                -((i as SaSint) + 1)
18129            } else {
18130                i as SaSint + 7
18131            };
18132        }
18133
18134        let mut single_t = t.clone();
18135        let mut single_sa = sa.clone();
18136        let mut threaded_t = t;
18137        let mut threaded_sa = sa;
18138        let mut single_state = alloc_thread_state(1).unwrap();
18139        let mut threaded_state = alloc_thread_state(4).unwrap();
18140        let single_f = compact_lms_suffixes_32s_omp(
18141            &mut single_t,
18142            &mut single_sa,
18143            n as SaSint,
18144            m as SaSint,
18145            fs as SaSint,
18146            1,
18147            &mut single_state,
18148        );
18149        let threaded_f = compact_lms_suffixes_32s_omp(
18150            &mut threaded_t,
18151            &mut threaded_sa,
18152            n as SaSint,
18153            m as SaSint,
18154            fs as SaSint,
18155            4,
18156            &mut threaded_state,
18157        );
18158
18159        assert_eq!(threaded_f, single_f);
18160        assert_eq!(threaded_t, single_t);
18161        let unique_dst = n + fs - m;
18162        let unique_len = usize::try_from(threaded_f).expect("f must be non-negative");
18163        assert_eq!(
18164            &threaded_sa[unique_dst..unique_dst + unique_len],
18165            &single_sa[unique_dst..unique_dst + unique_len]
18166        );
18167    }
18168
18169    #[test]
18170    fn libsais64_merge_unique_lms_suffixes_32s_noops_for_empty_block() {
18171        let mut t = vec![1, SAINT_MIN, 2, SAINT_MIN];
18172        let mut sa = vec![0, 0, 1, 3];
18173        let before_t = t.clone();
18174        let before_sa = sa.clone();
18175
18176        merge_unique_lms_suffixes_32s(&mut t, &mut sa, 4, 1, 0, 0, 0);
18177
18178        assert_eq!(t, before_t);
18179        assert_eq!(sa, before_sa);
18180    }
18181
18182    #[test]
18183    fn libsais64_merge_unique_lms_suffixes_32s_omp_uses_block_partition_for_large_inputs() {
18184        let n = 65_600usize;
18185        let m = 1_024usize;
18186        let mut t = vec![1; n];
18187        for i in (0..n).step_by(257) {
18188            t[i] = SAINT_MIN | ((i % 251) as SaSint);
18189        }
18190        let f = t.iter().filter(|&&value| value < 0).count();
18191        let mut sa = vec![-1; n];
18192        let src = n - m - 1;
18193        for i in 0..f {
18194            sa[src + i] = i as SaSint;
18195        }
18196
18197        let mut single_t = t.clone();
18198        let mut single_sa = sa.clone();
18199        let mut threaded_t = t;
18200        let mut threaded_sa = sa;
18201        let mut thread_state = alloc_thread_state(4).unwrap();
18202        merge_unique_lms_suffixes_32s_omp(
18203            &mut single_t,
18204            &mut single_sa,
18205            n as SaSint,
18206            m as SaSint,
18207            1,
18208            &mut [],
18209        );
18210        merge_unique_lms_suffixes_32s_omp(
18211            &mut threaded_t,
18212            &mut threaded_sa,
18213            n as SaSint,
18214            m as SaSint,
18215            4,
18216            &mut thread_state,
18217        );
18218
18219        assert_eq!(threaded_t, single_t);
18220        assert_eq!(threaded_sa, single_sa);
18221    }
18222
18223    #[test]
18224    fn libsais64_merge_nonunique_lms_suffixes_32s_noops_for_empty_block() {
18225        let mut sa = vec![0, 7, 0, 13, 11];
18226        let before = sa.clone();
18227
18228        merge_nonunique_lms_suffixes_32s(&mut sa, 4, 1, 0, 0, 0);
18229
18230        assert_eq!(sa, before);
18231    }
18232
18233    #[test]
18234    fn libsais64_merge_compacted_lms_suffixes_32s_omp_preserves_input_text_and_fills_zero_slots() {
18235        let mut t = vec![1, 2, 3, 4];
18236        let mut sa = vec![0, 1, 2, 3, 4, 5];
18237        let before_t = t.clone();
18238        let mut thread_state = alloc_thread_state(2).unwrap();
18239
18240        merge_compacted_lms_suffixes_32s_omp(&mut t, &mut sa, 4, 1, 1, 2, &mut thread_state);
18241
18242        assert_eq!(t, before_t);
18243        assert_eq!(sa[0], 3);
18244        assert_eq!(sa[1], 1);
18245    }
18246
18247    #[test]
18248    fn libsais64_merge_nonunique_lms_suffixes_32s_omp_uses_block_partition_for_large_inputs() {
18249        let n = 131_200usize;
18250        let m = 65_600usize;
18251        let f = 7usize;
18252        let mut sa = vec![1; n];
18253        let zero_count = (0..m).filter(|i| i % 17 == 0).count();
18254        for i in (0..m).step_by(17) {
18255            sa[i] = 0;
18256        }
18257        let src = n - m - 1 + f;
18258        for i in 0..zero_count {
18259            sa[src + i] = 10_000 + i as SaSint;
18260        }
18261
18262        let mut single = sa.clone();
18263        let mut threaded = sa;
18264        let mut thread_state = alloc_thread_state(4).unwrap();
18265        merge_nonunique_lms_suffixes_32s_omp(
18266            &mut single,
18267            n as SaSint,
18268            m as SaSint,
18269            f as SaSint,
18270            1,
18271            &mut [],
18272        );
18273        merge_nonunique_lms_suffixes_32s_omp(
18274            &mut threaded,
18275            n as SaSint,
18276            m as SaSint,
18277            f as SaSint,
18278            4,
18279            &mut thread_state,
18280        );
18281
18282        assert_eq!(threaded, single);
18283    }
18284
18285    #[test]
18286    fn libsais64_merge_compacted_lms_suffixes_32s_omp_uses_block_partition_for_large_inputs() {
18287        let n = 131_200usize;
18288        let m = 65_600usize;
18289        let mut t = vec![1; n];
18290        for i in (0..n).step_by(257) {
18291            t[i] = SAINT_MIN | ((i % 251) as SaSint);
18292        }
18293        let f = t.iter().filter(|&&value| value < 0).count();
18294
18295        let mut sa = vec![1; n];
18296        let zero_count = (0..m).filter(|i| i % 17 == 0).count();
18297        for i in (0..m).step_by(17) {
18298            sa[i] = 0;
18299        }
18300        let unique_src = n - m - 1;
18301        for i in 0..f {
18302            sa[unique_src + i] = i as SaSint;
18303        }
18304        for i in 0..zero_count {
18305            sa[unique_src + f + i] = 10_000 + i as SaSint;
18306        }
18307
18308        let mut single_t = t.clone();
18309        let mut single_sa = sa.clone();
18310        let mut threaded_t = t;
18311        let mut threaded_sa = sa;
18312        let mut single_state = alloc_thread_state(1).unwrap();
18313        let mut threaded_state = alloc_thread_state(4).unwrap();
18314        merge_compacted_lms_suffixes_32s_omp(
18315            &mut single_t,
18316            &mut single_sa,
18317            n as SaSint,
18318            m as SaSint,
18319            f as SaSint,
18320            1,
18321            &mut single_state,
18322        );
18323        merge_compacted_lms_suffixes_32s_omp(
18324            &mut threaded_t,
18325            &mut threaded_sa,
18326            n as SaSint,
18327            m as SaSint,
18328            f as SaSint,
18329            4,
18330            &mut threaded_state,
18331        );
18332
18333        assert_eq!(threaded_t, single_t);
18334        assert_eq!(threaded_sa, single_sa);
18335    }
18336
18337    #[test]
18338    fn libsais64_final_bwt_left_to_right_8u_block_omp_uses_thread_buckets() {
18339        let block_start = 20_000usize;
18340        let block_size = 16_384usize;
18341        let n = block_start + block_size + 8;
18342        let t = vec![1_u8; n];
18343        let suffixes: Vec<SaSint> = (2..2 + block_size).map(|i| i as SaSint).collect();
18344
18345        let mut expected_sa = vec![0; n];
18346        expected_sa[block_start..block_start + block_size].copy_from_slice(&suffixes);
18347        let mut threaded_sa = expected_sa.clone();
18348        let mut expected_bucket = vec![0; ALPHABET_SIZE];
18349        let mut threaded_bucket = expected_bucket.clone();
18350        let mut thread_state = alloc_thread_state(4).unwrap();
18351
18352        final_bwt_scan_left_to_right_8u(
18353            &t,
18354            &mut expected_sa,
18355            &mut expected_bucket,
18356            block_start as FastSint,
18357            block_size as FastSint,
18358        );
18359        final_bwt_scan_left_to_right_8u_block_omp(
18360            &t,
18361            &mut threaded_sa,
18362            ALPHABET_SIZE as SaSint,
18363            &mut threaded_bucket,
18364            block_start as FastSint,
18365            block_size as FastSint,
18366            4,
18367            &mut thread_state,
18368        );
18369
18370        assert_eq!(threaded_sa, expected_sa);
18371        assert_eq!(threaded_bucket, expected_bucket);
18372    }
18373
18374    #[test]
18375    fn libsais64_final_bwt_aux_left_to_right_8u_block_omp_uses_thread_buckets() {
18376        let block_start = 20_000usize;
18377        let block_size = 16_384usize;
18378        let n = block_start + block_size + 8;
18379        let t = vec![1_u8; n];
18380        let suffixes: Vec<SaSint> = (2..2 + block_size).map(|i| i as SaSint).collect();
18381
18382        let mut expected_sa = vec![0; n];
18383        expected_sa[block_start..block_start + block_size].copy_from_slice(&suffixes);
18384        let mut threaded_sa = expected_sa.clone();
18385        let mut expected_i = vec![0; n];
18386        let mut threaded_i = vec![0; n];
18387        let mut expected_bucket = vec![0; ALPHABET_SIZE];
18388        let mut threaded_bucket = expected_bucket.clone();
18389        let mut thread_state = alloc_thread_state(4).unwrap();
18390
18391        final_bwt_aux_scan_left_to_right_8u(
18392            &t,
18393            &mut expected_sa,
18394            0,
18395            &mut expected_i,
18396            &mut expected_bucket,
18397            block_start as FastSint,
18398            block_size as FastSint,
18399        );
18400        final_bwt_aux_scan_left_to_right_8u_block_omp(
18401            &t,
18402            &mut threaded_sa,
18403            ALPHABET_SIZE as SaSint,
18404            0,
18405            &mut threaded_i,
18406            &mut threaded_bucket,
18407            block_start as FastSint,
18408            block_size as FastSint,
18409            4,
18410            &mut thread_state,
18411        );
18412
18413        assert_eq!(threaded_sa, expected_sa);
18414        assert_eq!(threaded_i, expected_i);
18415        assert_eq!(threaded_bucket, expected_bucket);
18416    }
18417
18418    #[test]
18419    fn libsais64_final_sorting_right_to_left_8u_block_omp_uses_thread_buckets() {
18420        let block_start = 20_000usize;
18421        let block_size = 16_384usize;
18422        let n = block_start + block_size + 8;
18423        let t = vec![1_u8; n];
18424        let suffixes: Vec<SaSint> = (2..2 + block_size).map(|i| i as SaSint).collect();
18425
18426        let mut expected_sa = vec![0; n];
18427        expected_sa[block_start..block_start + block_size].copy_from_slice(&suffixes);
18428        let mut threaded_sa = expected_sa.clone();
18429        let mut expected_bucket = vec![0; ALPHABET_SIZE];
18430        expected_bucket[1] = n as SaSint;
18431        let mut threaded_bucket = expected_bucket.clone();
18432        let mut thread_state = alloc_thread_state(4).unwrap();
18433
18434        final_sorting_scan_right_to_left_8u(
18435            &t,
18436            &mut expected_sa,
18437            &mut expected_bucket,
18438            block_start as FastSint,
18439            block_size as FastSint,
18440        );
18441        final_sorting_scan_right_to_left_8u_block_omp(
18442            &t,
18443            &mut threaded_sa,
18444            ALPHABET_SIZE as SaSint,
18445            &mut threaded_bucket,
18446            block_start as FastSint,
18447            block_size as FastSint,
18448            4,
18449            &mut thread_state,
18450        );
18451
18452        assert_eq!(threaded_sa, expected_sa);
18453        assert_eq!(threaded_bucket, expected_bucket);
18454    }
18455
/// Runs `final_bwt_scan_right_to_left_8u` and
/// `final_bwt_scan_right_to_left_8u_block_omp` (4 threads, freshly allocated
/// thread state) on identical inputs and requires the suffix array and the
/// bucket table to end up identical.
#[test]
fn libsais64_final_bwt_right_to_left_8u_block_omp_uses_thread_buckets() {
    let (block_start, block_size) = (20_000usize, 16_384usize);
    let block_end = block_start + block_size;
    let n = block_end + 8;
    let t = vec![1_u8; n];

    // The scanned window holds the consecutive values 2, 3, ...; every slot
    // outside the window stays zero.
    let mut expected_sa: Vec<SaSint> = vec![0; n];
    for (slot, value) in expected_sa[block_start..block_end].iter_mut().zip(2..) {
        *slot = value;
    }
    let mut threaded_sa = expected_sa.clone();

    let mut expected_bucket: Vec<SaSint> = vec![0; ALPHABET_SIZE];
    expected_bucket[1] = n as SaSint;
    let mut threaded_bucket = expected_bucket.clone();

    let mut thread_state = alloc_thread_state(4).unwrap();

    // Reference run.
    final_bwt_scan_right_to_left_8u(
        &t,
        &mut expected_sa,
        &mut expected_bucket,
        block_start as FastSint,
        block_size as FastSint,
    );
    // Block variant under test.
    final_bwt_scan_right_to_left_8u_block_omp(
        &t,
        &mut threaded_sa,
        ALPHABET_SIZE as SaSint,
        &mut threaded_bucket,
        block_start as FastSint,
        block_size as FastSint,
        4,
        &mut thread_state,
    );

    assert_eq!(threaded_sa, expected_sa);
    assert_eq!(threaded_bucket, expected_bucket);
}
18493
/// Runs `final_bwt_aux_scan_right_to_left_8u` and
/// `final_bwt_aux_scan_right_to_left_8u_block_omp` (4 threads) on identical
/// inputs and requires the suffix array, the auxiliary index array and the
/// bucket table to match afterwards.
#[test]
fn libsais64_final_bwt_aux_right_to_left_8u_block_omp_uses_thread_buckets() {
    let block_start = 20_000usize;
    let block_size = 16_384usize;
    let n = block_start + block_size + 8;
    let t = vec![1_u8; n];

    // Window contents: offset + 2 for every offset inside the block.
    let window: Vec<SaSint> = (0..block_size)
        .map(|offset| (offset + 2) as SaSint)
        .collect();

    let mut expected_sa = vec![0; n];
    expected_sa[block_start..][..block_size].copy_from_slice(&window);
    let mut threaded_sa = expected_sa.clone();

    // Both auxiliary index arrays start zeroed.
    let mut expected_i = vec![0; n];
    let mut threaded_i = vec![0; n];

    let mut expected_bucket: Vec<SaSint> = vec![0; ALPHABET_SIZE];
    expected_bucket[1] = n as SaSint;
    let mut threaded_bucket = expected_bucket.clone();

    let mut thread_state = alloc_thread_state(4).unwrap();

    // Reference run.
    final_bwt_aux_scan_right_to_left_8u(
        &t,
        &mut expected_sa,
        0,
        &mut expected_i,
        &mut expected_bucket,
        block_start as FastSint,
        block_size as FastSint,
    );
    // Block variant under test.
    final_bwt_aux_scan_right_to_left_8u_block_omp(
        &t,
        &mut threaded_sa,
        ALPHABET_SIZE as SaSint,
        0,
        &mut threaded_i,
        &mut threaded_bucket,
        block_start as FastSint,
        block_size as FastSint,
        4,
        &mut thread_state,
    );

    assert_eq!(threaded_sa, expected_sa);
    assert_eq!(threaded_i, expected_i);
    assert_eq!(threaded_bucket, expected_bucket);
}
18538
/// Runs `final_gsa_scan_right_to_left_8u` and
/// `final_gsa_scan_right_to_left_8u_block_omp` (4 threads) on identical
/// inputs and requires matching suffix-array and bucket contents.
#[test]
fn libsais64_final_gsa_right_to_left_8u_block_omp_uses_thread_buckets() {
    let block_start = 20_000usize;
    let block_size = 16_384usize;
    let n = block_start + block_size + 8;
    let t = vec![1_u8; n];

    // Fill only the scanned window, with offset + 2 at each position.
    let mut expected_sa: Vec<SaSint> = vec![0; n];
    let window = &mut expected_sa[block_start..block_start + block_size];
    for (offset, slot) in window.iter_mut().enumerate() {
        *slot = (offset + 2) as SaSint;
    }
    let mut threaded_sa = expected_sa.clone();

    let mut expected_bucket: Vec<SaSint> = vec![0; ALPHABET_SIZE];
    expected_bucket[1] = n as SaSint;
    let mut threaded_bucket = expected_bucket.clone();

    let mut thread_state = alloc_thread_state(4).unwrap();

    // Reference run.
    final_gsa_scan_right_to_left_8u(
        &t,
        &mut expected_sa,
        &mut expected_bucket,
        block_start as FastSint,
        block_size as FastSint,
    );
    // Block variant under test.
    final_gsa_scan_right_to_left_8u_block_omp(
        &t,
        &mut threaded_sa,
        ALPHABET_SIZE as SaSint,
        &mut threaded_bucket,
        block_start as FastSint,
        block_size as FastSint,
        4,
        &mut thread_state,
    );

    assert_eq!(threaded_sa, expected_sa);
    assert_eq!(threaded_bucket, expected_bucket);
}
18576
/// Compares `count_and_gather_lms_suffixes_8u_omp` (4 threads) against a
/// single `count_and_gather_lms_suffixes_8u` call over the whole text: the
/// returned counts, the trailing `m` suffix-array entries and the bucket
/// tables must all agree.
#[test]
fn libsais64_count_and_gather_lms_suffixes_8u_omp_uses_block_partition_for_large_inputs() {
    let n = 65_600usize;

    // Deterministic text over the symbols 1..=251.
    let mut text: Vec<u8> = Vec::with_capacity(n);
    for i in 0..n {
        let residue = (i * 37 + i / 17) % 251;
        text.push(1 + residue as u8);
    }

    let mut sa_threaded = vec![-99; n];
    let mut sa_scalar = vec![-99; n];
    let mut buckets_threaded = vec![0; 4 * ALPHABET_SIZE];
    let mut buckets_scalar = vec![0; 4 * ALPHABET_SIZE];
    let mut thread_state = alloc_thread_state(4).unwrap();

    let m_threaded = count_and_gather_lms_suffixes_8u_omp(
        &text,
        &mut sa_threaded,
        n as SaSint,
        &mut buckets_threaded,
        4,
        &mut thread_state,
    );
    let m_scalar = count_and_gather_lms_suffixes_8u(
        &text,
        &mut sa_scalar,
        n as SaSint,
        &mut buckets_scalar,
        0,
        n as FastSint,
    );

    assert_eq!(m_threaded, m_scalar);

    // Only the last `m` entries of each array are compared.
    let tail_threaded = n - m_threaded as usize;
    let tail_scalar = n - m_scalar as usize;
    assert_eq!(&sa_threaded[tail_threaded..], &sa_scalar[tail_scalar..]);
    assert_eq!(buckets_threaded, buckets_scalar);
}
18614
/// First runs `count_and_gather_lms_suffixes_8u_omp` to obtain `m` (and to
/// populate the shared thread state), then checks that
/// `gather_lms_suffixes_8u_omp` and `gather_lms_suffixes_8u` write the same
/// trailing `m` suffix-array entries.
#[test]
fn libsais64_gather_lms_suffixes_8u_omp_uses_thread_state_for_large_inputs() {
    let n = 65_600usize;

    // Deterministic text over the symbols 1..=251.
    let mut text: Vec<u8> = Vec::with_capacity(n);
    for i in 0..n {
        let residue = (i * 37 + i / 17) % 251;
        text.push(1 + residue as u8);
    }

    let mut thread_state = alloc_thread_state(4).unwrap();

    // Counting pass; its suffix array and buckets are scratch outputs here.
    let mut count_sa = vec![-99; n];
    let mut buckets = vec![0; 4 * ALPHABET_SIZE];
    let m = count_and_gather_lms_suffixes_8u_omp(
        &text,
        &mut count_sa,
        n as SaSint,
        &mut buckets,
        4,
        &mut thread_state,
    );

    let mut threaded = vec![-99; n];
    gather_lms_suffixes_8u_omp(&text, &mut threaded, n as SaSint, 4, &mut thread_state);

    let mut scalar = vec![-99; n];
    gather_lms_suffixes_8u(
        &text,
        &mut scalar,
        n as SaSint,
        n as FastSint - 1,
        0,
        n as FastSint,
    );

    let tail = n - m as usize;
    assert_eq!(&threaded[tail..], &scalar[tail..]);
}
18647
/// Smoke test for `count_and_gather_lms_suffixes_32s_4k` on a five-symbol
/// text: the returned count must be non-negative and the bucket entries must
/// sum to the text length.
#[test]
fn libsais64_count_and_gather_lms_suffixes_32s_4k_updates_counts_and_suffixes() {
    let k = 4usize;
    let t = vec![2, 1, 3, 1, 0];
    let n = t.len();
    let mut sa = vec![0; n];
    let mut buckets = vec![0; 4 * k];

    let m = count_and_gather_lms_suffixes_32s_4k(
        &t,
        &mut sa,
        n as SaSint,
        k as SaSint,
        &mut buckets,
        0,
        n as FastSint,
    );

    assert!(m >= 0);
    let counted: SaSint = buckets.iter().sum();
    assert_eq!(counted, n as SaSint);
}
18665
/// Smoke test for `count_and_gather_lms_suffixes_32s_2k` on a five-symbol
/// text: the returned count must be non-negative and the bucket entries must
/// sum to the text length.
#[test]
fn libsais64_count_and_gather_lms_suffixes_32s_2k_updates_counts_and_suffixes() {
    let k = 4usize;
    let t = vec![2, 1, 3, 1, 0];
    let n = t.len();
    let mut sa = vec![0; n];
    let mut buckets = vec![0; 2 * k];

    let m = count_and_gather_lms_suffixes_32s_2k(
        &t,
        &mut sa,
        n as SaSint,
        k as SaSint,
        &mut buckets,
        0,
        n as FastSint,
    );

    assert!(m >= 0);
    let counted: SaSint = buckets.iter().sum();
    assert_eq!(counted, n as SaSint);
}
18683
/// Smoke test for `count_and_gather_compacted_lms_suffixes_32s_2k` on a
/// five-symbol text whose second symbol has the `SAINT_MIN` bit set: the
/// returned count must be non-negative and the bucket entries must sum to the
/// text length.
#[test]
fn libsais64_count_and_gather_compacted_lms_suffixes_32s_2k_updates_counts_and_suffixes() {
    let k = 4usize;
    let t = vec![2, SAINT_MIN | 1, 3, 1, 0];
    let n = t.len();
    let mut sa = vec![0; n];
    let mut buckets = vec![0; 2 * k];

    let m = count_and_gather_compacted_lms_suffixes_32s_2k(
        &t,
        &mut sa,
        n as SaSint,
        k as SaSint,
        &mut buckets,
        0,
        n as FastSint,
    );

    assert!(m >= 0);
    let counted: SaSint = buckets.iter().sum();
    assert_eq!(counted, n as SaSint);
}
18701
/// Smoke test for `count_and_gather_lms_suffixes_32s_4k_nofs_omp` with two
/// threads on a five-symbol text: the returned count must be non-negative and
/// the bucket entries must sum to the text length.
#[test]
fn libsais64_count_and_gather_lms_suffixes_32s_4k_nofs_omp_wraps_sequential_version() {
    let k = 4usize;
    let t = vec![2, 1, 3, 1, 0];
    let n = t.len();
    let mut sa = vec![0; n];
    let mut buckets = vec![0; 4 * k];

    let m = count_and_gather_lms_suffixes_32s_4k_nofs_omp(
        &t,
        &mut sa,
        n as SaSint,
        k as SaSint,
        &mut buckets,
        2,
    );

    assert!(m >= 0);
    let counted: SaSint = buckets.iter().sum();
    assert_eq!(counted, n as SaSint);
}
18718
/// Smoke test for `count_and_gather_lms_suffixes_32s_2k_nofs_omp` with two
/// threads on a five-symbol text: the returned count must be non-negative and
/// the bucket entries must sum to the text length.
#[test]
fn libsais64_count_and_gather_lms_suffixes_32s_2k_nofs_omp_wraps_sequential_version() {
    let k = 4usize;
    let t = vec![2, 1, 3, 1, 0];
    let n = t.len();
    let mut sa = vec![0; n];
    let mut buckets = vec![0; 2 * k];

    let m = count_and_gather_lms_suffixes_32s_2k_nofs_omp(
        &t,
        &mut sa,
        n as SaSint,
        k as SaSint,
        &mut buckets,
        2,
    );

    assert!(m >= 0);
    let counted: SaSint = buckets.iter().sum();
    assert_eq!(counted, n as SaSint);
}
18735
/// Smoke test for `count_and_gather_compacted_lms_suffixes_32s_2k_nofs_omp`
/// with two threads on a five-symbol text whose second symbol has the
/// `SAINT_MIN` bit set: the returned count must be non-negative and the
/// bucket entries must sum to the text length.
#[test]
fn libsais64_count_and_gather_compacted_lms_suffixes_32s_2k_nofs_omp_wraps_sequential_version()
{
    let k = 4usize;
    let t = vec![2, SAINT_MIN | 1, 3, 1, 0];
    let n = t.len();
    let mut sa = vec![0; n];
    let mut buckets = vec![0; 2 * k];

    let m = count_and_gather_compacted_lms_suffixes_32s_2k_nofs_omp(
        &t,
        &mut sa,
        n as SaSint,
        k as SaSint,
        &mut buckets,
        2,
    );

    assert!(m >= 0);
    let counted: SaSint = buckets.iter().sum();
    assert_eq!(counted, n as SaSint);
}
18753
/// For a 65,600-symbol text, compares the 4-thread `*_nofs_omp` entry points
/// of the 4k and 2k count-and-gather routines with their plain counterparts:
/// counts, trailing `m` suffix-array entries and bucket tables must agree.
#[test]
fn libsais64_count_and_gather_lms_suffixes_32s_nofs_omp_uses_large_input_paths() {
    let n = 65_600usize;
    let k = 257usize;

    // Deterministic text over the symbols 1..=k-1.
    let mut text: Vec<SaSint> = Vec::with_capacity(n);
    for i in 0..n {
        let residue = (i * 37 + i / 17) % (k - 1);
        text.push(1 + residue as SaSint);
    }

    // 4k bucket layout.
    {
        let mut sa_threaded = vec![-99; n];
        let mut sa_scalar = vec![-99; n];
        let mut buckets_threaded = vec![0; 4 * k];
        let mut buckets_scalar = vec![0; 4 * k];

        let m_threaded = count_and_gather_lms_suffixes_32s_4k_nofs_omp(
            &text,
            &mut sa_threaded,
            n as SaSint,
            k as SaSint,
            &mut buckets_threaded,
            4,
        );
        let m_scalar = count_and_gather_lms_suffixes_32s_4k(
            &text,
            &mut sa_scalar,
            n as SaSint,
            k as SaSint,
            &mut buckets_scalar,
            0,
            n as FastSint,
        );

        assert_eq!(m_threaded, m_scalar);
        let tail_threaded = n - m_threaded as usize;
        let tail_scalar = n - m_scalar as usize;
        assert_eq!(&sa_threaded[tail_threaded..], &sa_scalar[tail_scalar..]);
        assert_eq!(buckets_threaded, buckets_scalar);
    }

    // 2k bucket layout.
    {
        let mut sa_threaded = vec![-99; n];
        let mut sa_scalar = vec![-99; n];
        let mut buckets_threaded = vec![0; 2 * k];
        let mut buckets_scalar = vec![0; 2 * k];

        let m_threaded = count_and_gather_lms_suffixes_32s_2k_nofs_omp(
            &text,
            &mut sa_threaded,
            n as SaSint,
            k as SaSint,
            &mut buckets_threaded,
            4,
        );
        let m_scalar = count_and_gather_lms_suffixes_32s_2k(
            &text,
            &mut sa_scalar,
            n as SaSint,
            k as SaSint,
            &mut buckets_scalar,
            0,
            n as FastSint,
        );

        assert_eq!(m_threaded, m_scalar);
        let tail_threaded = n - m_threaded as usize;
        let tail_scalar = n - m_scalar as usize;
        assert_eq!(&sa_threaded[tail_threaded..], &sa_scalar[tail_scalar..]);
        assert_eq!(buckets_threaded, buckets_scalar);
    }
}
18818
/// For a 65,600-symbol text, compares the 4-thread `*_fs_omp` entry points of
/// the 4k and 2k count-and-gather routines with their plain counterparts:
/// counts, trailing `m` suffix-array entries and bucket tables must agree.
/// One thread state is shared by both `*_fs_omp` calls, 4k first.
#[test]
fn libsais64_count_and_gather_lms_suffixes_32s_fs_omp_uses_large_input_paths() {
    let n = 65_600usize;
    let k = 257usize;

    // Deterministic text over the symbols 1..=k-1.
    let mut text: Vec<SaSint> = Vec::with_capacity(n);
    for i in 0..n {
        let residue = (i * 37 + i / 17) % (k - 1);
        text.push(1 + residue as SaSint);
    }

    let mut thread_state = alloc_thread_state(4).unwrap();

    // 4k bucket layout.
    {
        let mut sa_threaded = vec![-99; n];
        let mut sa_scalar = vec![-99; n];
        let mut buckets_threaded = vec![0; 4 * k];
        let mut buckets_scalar = vec![0; 4 * k];

        let m_threaded = count_and_gather_lms_suffixes_32s_4k_fs_omp(
            &text,
            &mut sa_threaded,
            n as SaSint,
            k as SaSint,
            &mut buckets_threaded,
            0,
            4,
            &mut thread_state,
        );
        let m_scalar = count_and_gather_lms_suffixes_32s_4k(
            &text,
            &mut sa_scalar,
            n as SaSint,
            k as SaSint,
            &mut buckets_scalar,
            0,
            n as FastSint,
        );

        assert_eq!(m_threaded, m_scalar);
        let tail_threaded = n - m_threaded as usize;
        let tail_scalar = n - m_scalar as usize;
        assert_eq!(&sa_threaded[tail_threaded..], &sa_scalar[tail_scalar..]);
        assert_eq!(buckets_threaded, buckets_scalar);
    }

    // 2k bucket layout, reusing the same thread state.
    {
        let mut sa_threaded = vec![-99; n];
        let mut sa_scalar = vec![-99; n];
        let mut buckets_threaded = vec![0; 2 * k];
        let mut buckets_scalar = vec![0; 2 * k];

        let m_threaded = count_and_gather_lms_suffixes_32s_2k_fs_omp(
            &text,
            &mut sa_threaded,
            n as SaSint,
            k as SaSint,
            &mut buckets_threaded,
            0,
            4,
            &mut thread_state,
        );
        let m_scalar = count_and_gather_lms_suffixes_32s_2k(
            &text,
            &mut sa_scalar,
            n as SaSint,
            k as SaSint,
            &mut buckets_scalar,
            0,
            n as FastSint,
        );

        assert_eq!(m_threaded, m_scalar);
        let tail_threaded = n - m_threaded as usize;
        let tail_scalar = n - m_scalar as usize;
        assert_eq!(&sa_threaded[tail_threaded..], &sa_scalar[tail_scalar..]);
        assert_eq!(buckets_threaded, buckets_scalar);
    }
}
18888
/// For a 65,600-symbol text in which every 19th symbol has the `SAINT_MIN`
/// bit set, compares the 4-thread
/// `count_and_gather_compacted_lms_suffixes_32s_2k_nofs_omp` with the separate
/// `count_compacted_lms_suffixes_32s_2k` + `gather_compacted_lms_suffixes_32s`
/// calls: counts, trailing `m` suffix-array entries and buckets must agree.
#[test]
fn libsais64_count_and_gather_compacted_lms_suffixes_32s_nofs_omp_uses_large_input_path() {
    let n = 65_600usize;
    let k = 257usize;
    let text: Vec<SaSint> = (0..n)
        .map(|i| {
            let symbol = 1 + ((i * 37 + i / 17) % (k - 1)) as SaSint;
            if i % 19 != 0 {
                symbol
            } else {
                symbol | SAINT_MIN
            }
        })
        .collect();

    // Combined, threaded entry point.
    let mut sa_threaded = vec![-99; n];
    let mut buckets_threaded = vec![0; 2 * k];
    let m_threaded = count_and_gather_compacted_lms_suffixes_32s_2k_nofs_omp(
        &text,
        &mut sa_threaded,
        n as SaSint,
        k as SaSint,
        &mut buckets_threaded,
        4,
    );

    // Separate count and gather calls as the reference.
    let mut sa_split = vec![-99; n];
    let mut buckets_split = vec![0; 2 * k];
    count_compacted_lms_suffixes_32s_2k(&text, n as SaSint, k as SaSint, &mut buckets_split);
    let m_split = gather_compacted_lms_suffixes_32s(&text, &mut sa_split, n as SaSint);

    assert_eq!(m_threaded, m_split);
    let tail_threaded = n - m_threaded as usize;
    let tail_split = n - m_split as usize;
    assert_eq!(&sa_threaded[tail_threaded..], &sa_split[tail_split..]);
    assert_eq!(buckets_threaded, buckets_split);
}
18926
/// For a 65,600-symbol text in which every 19th symbol has the `SAINT_MIN`
/// bit set, runs `count_and_gather_compacted_lms_suffixes_32s_2k_fs_omp`
/// (4 threads, suffix array of length `2 * n`) and the plain
/// `count_and_gather_compacted_lms_suffixes_32s_2k`, then compares the `m`
/// entries ending at index `n` and the bucket tables.
#[test]
fn libsais64_count_and_gather_compacted_lms_suffixes_32s_fs_omp_uses_large_input_path() {
    let n = 65_600usize;
    let k = 257usize;
    let text: Vec<SaSint> = (0..n)
        .map(|i| {
            let symbol = 1 + ((i * 37 + i / 17) % (k - 1)) as SaSint;
            if i % 19 != 0 {
                symbol
            } else {
                symbol | SAINT_MIN
            }
        })
        .collect();

    // The threaded call receives a suffix array twice as long as the text.
    let mut sa_threaded = vec![-99; 2 * n];
    let mut buckets_threaded = vec![0; 2 * k];
    let mut thread_state = alloc_thread_state(4).unwrap();
    count_and_gather_compacted_lms_suffixes_32s_2k_fs_omp(
        &text,
        &mut sa_threaded,
        n as SaSint,
        k as SaSint,
        &mut buckets_threaded,
        0,
        4,
        &mut thread_state,
    );

    let mut sa_scalar = vec![-99; n];
    let mut buckets_scalar = vec![0; 2 * k];
    let m_scalar = count_and_gather_compacted_lms_suffixes_32s_2k(
        &text,
        &mut sa_scalar,
        n as SaSint,
        k as SaSint,
        &mut buckets_scalar,
        0,
        n as FastSint,
    );

    let tail = n - m_scalar as usize;
    assert_eq!(&sa_threaded[tail..n], &sa_scalar[tail..]);
    assert_eq!(buckets_threaded, buckets_scalar);
}
18973
/// Calls `compute_phi_omp`, `compute_plcp_omp` and `compute_lcp_omp` once with
/// 1 thread and once with 4 threads on a 65,600-element input and requires
/// identical output buffers after every stage.
#[test]
fn libsais64_plcp_lcp_omp_wrappers_match_single_thread_on_large_inputs() {
    let n = 65_600usize;
    let len = n as SaSint;
    let text: Vec<u8> = (0..n).map(|i| (i % 251 + 1) as u8).collect();
    // The identity permutation serves as the suffix array input.
    let sa: Vec<SaSint> = (0..len).collect();

    let mut plcp_single = vec![0; n];
    let mut plcp_threaded = vec![0; n];

    // Stage 1: phi.
    compute_phi_omp(&sa, &mut plcp_single, len, 1);
    compute_phi_omp(&sa, &mut plcp_threaded, len, 4);
    assert_eq!(plcp_threaded, plcp_single);

    // Stage 2: PLCP, computed in place on the phi buffers.
    compute_plcp_omp(&text, &mut plcp_single, len, 1);
    compute_plcp_omp(&text, &mut plcp_threaded, len, 4);
    assert_eq!(plcp_threaded, plcp_single);

    // Stage 3: LCP.
    let mut lcp_single = vec![0; n];
    let mut lcp_threaded = vec![0; n];
    compute_lcp_omp(&plcp_single, &sa, &mut lcp_single, len, 1);
    compute_lcp_omp(&plcp_threaded, &sa, &mut lcp_threaded, len, 4);
    assert_eq!(lcp_threaded, lcp_single);
}
18996
18997    fn assert_libsais64_matches_c(text: &[u8]) {
18998        let mut rust_sa = vec![0; text.len()];
18999        let mut c_sa = vec![0; text.len()];
19000
19001        let rust_rc = libsais64(text, &mut rust_sa, 0, None);
19002        let c_rc = unsafe {
19003            probe_public_libsais64(text.as_ptr(), c_sa.as_mut_ptr(), text.len() as SaSint, 0)
19004        };
19005
19006        assert_eq!(rust_rc, c_rc);
19007        assert_eq!(rust_sa, c_sa);
19008    }
19009
19010    fn assert_libsais64_gsa_matches_c(text: &[u8]) {
19011        let mut rust_sa = vec![0; text.len()];
19012        let mut c_sa = vec![0; text.len()];
19013
19014        let rust_rc = libsais64_gsa(text, &mut rust_sa, 0, None);
19015        let c_rc = unsafe {
19016            probe_public_libsais64_gsa(text.as_ptr(), c_sa.as_mut_ptr(), text.len() as SaSint, 0)
19017        };
19018
19019        assert_eq!(rust_rc, c_rc);
19020        assert_eq!(rust_sa, c_sa);
19021    }
19022
19023    fn assert_libsais64_long_matches_c(text: &[SaSint], k: SaSint) {
19024        let mut rust_t = text.to_vec();
19025        let mut c_t = text.to_vec();
19026        let mut rust_sa = vec![0; text.len()];
19027        let mut c_sa = vec![0; text.len()];
19028
19029        let rust_rc = libsais64_long(&mut rust_t, &mut rust_sa, k, 0);
19030        let c_rc = unsafe {
19031            probe_public_libsais64_long(
19032                c_t.as_mut_ptr(),
19033                c_sa.as_mut_ptr(),
19034                c_t.len() as SaSint,
19035                k,
19036                0,
19037            )
19038        };
19039
19040        assert_eq!(rust_rc, c_rc);
19041        assert_eq!(rust_t, c_t);
19042        assert_eq!(rust_sa, c_sa);
19043    }
19044
19045    fn assert_libsais64_bwt_matches_c(text: &[u8]) {
19046        let mut rust_u = vec![0; text.len()];
19047        let mut rust_a = vec![0; text.len()];
19048        let mut c_u = vec![0; text.len()];
19049        let mut c_a = vec![0; text.len()];
19050
19051        let rust_rc = libsais64_bwt(text, &mut rust_u, &mut rust_a, 0, None);
19052        let c_rc = unsafe {
19053            probe_public_libsais64_bwt(
19054                text.as_ptr(),
19055                c_u.as_mut_ptr(),
19056                c_a.as_mut_ptr(),
19057                text.len() as SaSint,
19058                0,
19059            )
19060        };
19061
19062        assert_eq!(rust_rc, c_rc);
19063        assert_eq!(rust_u, c_u);
19064    }
19065
19066    fn assert_libsais64_bwt_aux_matches_c(text: &[u8], r: SaSint) {
19067        let aux_len = if text.is_empty() {
19068            0
19069        } else {
19070            (text.len() - 1) / r as usize + 1
19071        };
19072        let mut rust_u = vec![0; text.len()];
19073        let mut rust_a = vec![0; text.len()];
19074        let mut rust_i = vec![0; aux_len];
19075        let mut c_u = vec![0; text.len()];
19076        let mut c_a = vec![0; text.len()];
19077        let mut c_i = vec![0; aux_len];
19078
19079        let rust_rc = libsais64_bwt_aux(text, &mut rust_u, &mut rust_a, 0, None, r, &mut rust_i);
19080        let c_rc = unsafe {
19081            probe_public_libsais64_bwt_aux(
19082                text.as_ptr(),
19083                c_u.as_mut_ptr(),
19084                c_a.as_mut_ptr(),
19085                text.len() as SaSint,
19086                0,
19087                r,
19088                c_i.as_mut_ptr(),
19089            )
19090        };
19091
19092        assert_eq!(rust_rc, c_rc);
19093        assert_eq!(rust_u, c_u);
19094        assert_eq!(rust_i, c_i);
19095    }
19096
19097    fn assert_libsais64_freq_outputs_match_c(text: &[u8], gsa_text: &[u8]) {
19098        let mut rust_sa = vec![0; text.len()];
19099        let mut c_sa = vec![0; text.len()];
19100        let mut rust_freq = vec![-1; ALPHABET_SIZE];
19101        let mut c_freq = vec![-1; ALPHABET_SIZE];
19102
19103        let rust_rc = libsais64(text, &mut rust_sa, 0, Some(&mut rust_freq));
19104        let c_rc = unsafe {
19105            probe_public_libsais64_freq(
19106                text.as_ptr(),
19107                c_sa.as_mut_ptr(),
19108                text.len() as SaSint,
19109                0,
19110                c_freq.as_mut_ptr(),
19111            )
19112        };
19113        assert_eq!(rust_rc, c_rc);
19114        assert_eq!(rust_sa, c_sa);
19115        assert_eq!(rust_freq, c_freq);
19116
19117        let mut rust_gsa = vec![0; gsa_text.len()];
19118        let mut c_gsa = vec![0; gsa_text.len()];
19119        rust_freq.fill(-1);
19120        c_freq.fill(-1);
19121        let rust_rc = libsais64_gsa(gsa_text, &mut rust_gsa, 0, Some(&mut rust_freq));
19122        let c_rc = unsafe {
19123            probe_public_libsais64_gsa_freq(
19124                gsa_text.as_ptr(),
19125                c_gsa.as_mut_ptr(),
19126                gsa_text.len() as SaSint,
19127                0,
19128                c_freq.as_mut_ptr(),
19129            )
19130        };
19131        assert_eq!(rust_rc, c_rc);
19132        assert_eq!(rust_gsa, c_gsa);
19133        assert_eq!(rust_freq, c_freq);
19134
19135        let mut rust_u = vec![0; text.len()];
19136        let mut rust_a = vec![0; text.len()];
19137        let mut c_u = vec![0; text.len()];
19138        let mut c_a = vec![0; text.len()];
19139        rust_freq.fill(-1);
19140        c_freq.fill(-1);
19141        let rust_rc = libsais64_bwt(text, &mut rust_u, &mut rust_a, 0, Some(&mut rust_freq));
19142        let c_rc = unsafe {
19143            probe_public_libsais64_bwt_freq(
19144                text.as_ptr(),
19145                c_u.as_mut_ptr(),
19146                c_a.as_mut_ptr(),
19147                text.len() as SaSint,
19148                0,
19149                c_freq.as_mut_ptr(),
19150            )
19151        };
19152        assert_eq!(rust_rc, c_rc);
19153        assert_eq!(rust_u, c_u);
19154        assert_eq!(rust_freq, c_freq);
19155
19156        let r = 4;
19157        let aux_len = (text.len() - 1) / r as usize + 1;
19158        let mut rust_i = vec![0; aux_len];
19159        let mut c_i = vec![0; aux_len];
19160        rust_freq.fill(-1);
19161        c_freq.fill(-1);
19162        let rust_rc = libsais64_bwt_aux(
19163            text,
19164            &mut rust_u,
19165            &mut rust_a,
19166            0,
19167            Some(&mut rust_freq),
19168            r,
19169            &mut rust_i,
19170        );
19171        let c_rc = unsafe {
19172            probe_public_libsais64_bwt_aux_freq(
19173                text.as_ptr(),
19174                c_u.as_mut_ptr(),
19175                c_a.as_mut_ptr(),
19176                text.len() as SaSint,
19177                0,
19178                c_freq.as_mut_ptr(),
19179                r,
19180                c_i.as_mut_ptr(),
19181            )
19182        };
19183        assert_eq!(rust_rc, c_rc);
19184        assert_eq!(rust_u, c_u);
19185        assert_eq!(rust_i, c_i);
19186        assert_eq!(rust_freq, c_freq);
19187    }
19188
19189    fn assert_libsais64_unbwt_matches_c(text: &[u8]) {
19190        let mut bwt = vec![0; text.len()];
19191        let mut work = vec![0; text.len()];
19192        let primary = libsais64_bwt(text, &mut bwt, &mut work, 0, None);
19193        assert!(primary >= 0);
19194
19195        let mut rust_u = vec![0; text.len()];
19196        let mut rust_a = vec![0; text.len() + 1];
19197        let mut c_u = vec![0; text.len()];
19198        let mut c_a = vec![0; text.len() + 1];
19199
19200        let rust_rc = libsais64_unbwt(&bwt, &mut rust_u, &mut rust_a, None, primary);
19201        let c_rc = unsafe {
19202            probe_public_libsais64_unbwt(
19203                bwt.as_ptr(),
19204                c_u.as_mut_ptr(),
19205                c_a.as_mut_ptr(),
19206                bwt.len() as SaSint,
19207                primary,
19208            )
19209        };
19210
19211        assert_eq!(rust_rc, c_rc);
19212        assert_eq!(rust_u, c_u);
19213        assert_eq!(rust_u, text);
19214    }
19215
19216    fn assert_libsais64_unbwt_aux_matches_c(text: &[u8], r: SaSint) {
19217        let mut bwt = vec![0; text.len()];
19218        let mut work = vec![0; text.len()];
19219        let mut aux = vec![0; (text.len() - 1) / r as usize + 1];
19220        let bwt_rc = libsais64_bwt_aux(text, &mut bwt, &mut work, 0, None, r, &mut aux);
19221        assert_eq!(bwt_rc, 0);
19222
19223        let mut rust_u = vec![0; text.len()];
19224        let mut rust_a = vec![0; text.len() + 1];
19225        let mut c_u = vec![0; text.len()];
19226        let mut c_a = vec![0; text.len() + 1];
19227
19228        let rust_rc = libsais64_unbwt_aux(&bwt, &mut rust_u, &mut rust_a, None, r, &aux);
19229        let c_rc = unsafe {
19230            probe_public_libsais64_unbwt_aux(
19231                bwt.as_ptr(),
19232                c_u.as_mut_ptr(),
19233                c_a.as_mut_ptr(),
19234                bwt.len() as SaSint,
19235                r,
19236                aux.as_ptr(),
19237            )
19238        };
19239
19240        assert_eq!(rust_rc, c_rc);
19241        assert_eq!(rust_u, c_u);
19242        assert_eq!(rust_u, text);
19243    }
19244
19245    fn assert_libsais64_unbwt_freq_matches_c(text: &[u8]) {
19246        let mut freq = vec![0; ALPHABET_SIZE];
19247        let mut bwt = vec![0; text.len()];
19248        let mut work = vec![0; text.len()];
19249        let primary = libsais64_bwt(text, &mut bwt, &mut work, 0, Some(&mut freq));
19250        assert!(primary >= 0);
19251
19252        let mut rust_u = vec![0; text.len()];
19253        let mut rust_a = vec![0; text.len() + 1];
19254        let mut c_u = vec![0; text.len()];
19255        let mut c_a = vec![0; text.len() + 1];
19256
19257        let rust_rc = libsais64_unbwt(&bwt, &mut rust_u, &mut rust_a, Some(&freq), primary);
19258        let c_rc = unsafe {
19259            probe_public_libsais64_unbwt_freq(
19260                bwt.as_ptr(),
19261                c_u.as_mut_ptr(),
19262                c_a.as_mut_ptr(),
19263                bwt.len() as SaSint,
19264                freq.as_ptr(),
19265                primary,
19266            )
19267        };
19268        assert_eq!(rust_rc, c_rc);
19269        assert_eq!(rust_u, c_u);
19270        assert_eq!(rust_u, text);
19271
19272        let r = 4;
19273        let mut aux = vec![0; (text.len() - 1) / r as usize + 1];
19274        let bwt_rc = libsais64_bwt_aux(text, &mut bwt, &mut work, 0, Some(&mut freq), r, &mut aux);
19275        assert_eq!(bwt_rc, 0);
19276
19277        rust_u.fill(0);
19278        rust_a.fill(0);
19279        c_u.fill(0);
19280        c_a.fill(0);
19281        let rust_rc = libsais64_unbwt_aux(&bwt, &mut rust_u, &mut rust_a, Some(&freq), r, &aux);
19282        let c_rc = unsafe {
19283            probe_public_libsais64_unbwt_aux_freq(
19284                bwt.as_ptr(),
19285                c_u.as_mut_ptr(),
19286                c_a.as_mut_ptr(),
19287                bwt.len() as SaSint,
19288                freq.as_ptr(),
19289                r,
19290                aux.as_ptr(),
19291            )
19292        };
19293        assert_eq!(rust_rc, c_rc);
19294        assert_eq!(rust_u, c_u);
19295        assert_eq!(rust_u, text);
19296    }
19297
19298    fn assert_libsais64_plcp_lcp_matches_c(text: &[u8]) {
19299        let mut sa = vec![0; text.len()];
19300        let sa_rc = libsais64(text, &mut sa, 0, None);
19301        assert_eq!(sa_rc, 0);
19302
19303        let mut rust_plcp = vec![0; text.len()];
19304        let mut c_plcp = vec![0; text.len()];
19305        let rust_plcp_rc = libsais64_plcp(text, &sa, &mut rust_plcp);
19306        let c_plcp_rc = unsafe {
19307            probe_public_libsais64_plcp(
19308                text.as_ptr(),
19309                sa.as_ptr(),
19310                c_plcp.as_mut_ptr(),
19311                text.len() as SaSint,
19312            )
19313        };
19314        assert_eq!(rust_plcp_rc, c_plcp_rc);
19315        assert_eq!(rust_plcp, c_plcp);
19316
19317        let mut rust_lcp = vec![0; text.len()];
19318        let mut c_lcp = vec![0; text.len()];
19319        let rust_lcp_rc = libsais64_lcp(&rust_plcp, &sa, &mut rust_lcp);
19320        let c_lcp_rc = unsafe {
19321            probe_public_libsais64_lcp(
19322                c_plcp.as_ptr(),
19323                sa.as_ptr(),
19324                c_lcp.as_mut_ptr(),
19325                text.len() as SaSint,
19326            )
19327        };
19328        assert_eq!(rust_lcp_rc, c_lcp_rc);
19329        assert_eq!(rust_lcp, c_lcp);
19330    }
19331
19332    fn assert_libsais64_plcp_gsa_matches_c(text: &[u8]) {
19333        let mut sa = vec![0; text.len()];
19334        assert_eq!(libsais64_gsa(text, &mut sa, 0, None), 0);
19335
19336        let mut rust_plcp = vec![0; text.len()];
19337        let mut c_plcp = vec![0; text.len()];
19338        let rust_rc = libsais64_plcp_gsa(text, &sa, &mut rust_plcp);
19339        let c_rc = unsafe {
19340            probe_public_libsais64_plcp_gsa(
19341                text.as_ptr(),
19342                sa.as_ptr(),
19343                c_plcp.as_mut_ptr(),
19344                text.len() as SaSint,
19345            )
19346        };
19347
19348        assert_eq!(rust_rc, c_rc);
19349        assert_eq!(rust_plcp, c_plcp);
19350    }
19351
19352    fn assert_libsais64_bwt_aux_round_trips(text: &[u8], r: SaSint) {
19353        let mut bwt = vec![0; text.len()];
19354        let mut work = vec![0; text.len()];
19355        let mut restored = vec![0; text.len()];
19356        let mut aux = vec![0; (text.len() - 1) / r as usize + 1];
19357
19358        let bwt_rc = libsais64_bwt_aux(text, &mut bwt, &mut work, 0, None, r, &mut aux);
19359        assert_eq!(bwt_rc, 0);
19360
19361        let unbwt_rc = libsais64_unbwt_aux(&bwt, &mut restored, &mut work, None, r, &aux);
19362        assert_eq!(unbwt_rc, 0);
19363        assert_eq!(restored, text);
19364    }
19365
19366    #[test]
19367    fn public_libsais64_matches_upstream_c() {
19368        for text in [
19369            b"".as_slice(),
19370            b"a",
19371            b"banana",
19372            b"mississippi",
19373            b"abracadabra",
19374            b"AAAAAAAAAAAAAAAA",
19375            b"zyxwvutsrqponmlk",
19376        ] {
19377            assert_libsais64_matches_c(text);
19378        }
19379    }
19380
19381    #[test]
19382    fn public_libsais64_bwt_matches_upstream_c() {
19383        for text in [
19384            b"".as_slice(),
19385            b"a",
19386            b"banana",
19387            b"mississippi",
19388            b"abracadabra",
19389            b"AAAAAAAAAAAAAAAA",
19390            b"zyxwvutsrqponmlk",
19391        ] {
19392            assert_libsais64_bwt_matches_c(text);
19393        }
19394    }
19395
19396    #[test]
19397    fn public_libsais64_gsa_matches_upstream_c() {
19398        for text in [
19399            b"\0".as_slice(),
19400            b"banana\0",
19401            b"ban\0ana\0",
19402            b"miss\0issippi\0",
19403            b"a\0a\0a\0",
19404        ] {
19405            assert_libsais64_gsa_matches_c(text);
19406        }
19407    }
19408
19409    #[test]
19410    fn public_libsais64_long_matches_upstream_c() {
19411        for (text, k) in [
19412            (&[][..], 0),
19413            (&[0][..], 1),
19414            (&[1, 2, 1, 0][..], 3),
19415            (&[2, 1, 2, 1, 0][..], 3),
19416            (&[3, 3, 3, 2, 1, 0][..], 4),
19417        ] {
19418            assert_libsais64_long_matches_c(text, k);
19419        }
19420    }
19421
19422    #[test]
19423    fn public_libsais64_plcp_lcp_matches_upstream_c() {
19424        for text in [
19425            b"".as_slice(),
19426            b"a",
19427            b"banana",
19428            b"mississippi",
19429            b"abracadabra",
19430            b"AAAAAAAAAAAAAAAA",
19431            b"zyxwvutsrqponmlk",
19432        ] {
19433            assert_libsais64_plcp_lcp_matches_c(text);
19434        }
19435    }
19436
19437    #[test]
19438    fn public_libsais64_plcp_gsa_matches_upstream_c() {
19439        for text in [
19440            b"\0".as_slice(),
19441            b"banana\0",
19442            b"ban\0ana\0",
19443            b"miss\0issippi\0",
19444            b"a\0a\0a\0",
19445        ] {
19446            assert_libsais64_plcp_gsa_matches_c(text);
19447        }
19448    }
19449
19450    #[test]
19451    fn libsais64_bwt_and_unbwt_round_trip_small_text() {
19452        let t = b"banana";
19453        let mut bwt = vec![0u8; t.len()];
19454        let mut a = vec![0; t.len()];
19455
19456        let primary = libsais64_bwt(t, &mut bwt, &mut a, 0, None);
19457        assert!(primary > 0);
19458
19459        let mut restored = vec![0u8; t.len()];
19460        let result = libsais64_unbwt(&bwt, &mut restored, &mut a, None, primary);
19461
19462        assert_eq!(result, 0);
19463        assert_eq!(restored, t);
19464    }
19465
19466    #[test]
19467    fn libsais64_bwt_aux_and_unbwt_aux_round_trip_small_text() {
19468        let t = b"mississippi";
19469        let mut bwt = vec![0u8; t.len()];
19470        let mut a = vec![0; t.len()];
19471        let mut samples = vec![0; 4];
19472
19473        let result = libsais64_bwt_aux(t, &mut bwt, &mut a, 0, None, 4, &mut samples);
19474        assert_eq!(result, 0);
19475
19476        let mut restored = vec![0u8; t.len()];
19477        let result = libsais64_unbwt_aux(&bwt, &mut restored, &mut a, None, 4, &samples);
19478
19479        assert_eq!(result, 0);
19480        assert_eq!(restored, t);
19481    }
19482
19483    #[test]
19484    fn libsais64_bwt_aux_and_unbwt_aux_omp_round_trip_small_text() {
19485        let t = b"mississippi";
19486        let mut bwt = vec![0u8; t.len()];
19487        let mut a = vec![0; t.len()];
19488        let mut samples = vec![0; 4];
19489
19490        let result = libsais64_bwt_aux(t, &mut bwt, &mut a, 0, None, 4, &mut samples);
19491        assert_eq!(result, 0);
19492
19493        let mut restored = vec![0u8; t.len()];
19494        let result = libsais64_unbwt_aux_omp(&bwt, &mut restored, &mut a, None, 4, &samples, 2);
19495
19496        assert_eq!(result, 0);
19497        assert_eq!(restored, t);
19498    }
19499
19500    #[test]
19501    fn libsais64_real_world_round_trip_on_upstream_readme() {
19502        let t = include_bytes!("../libsais/README.md");
19503        let mut bwt = vec![0u8; t.len()];
19504        let mut a = vec![0; t.len()];
19505
19506        let primary = libsais64_bwt(t, &mut bwt, &mut a, 0, None);
19507        assert!(primary > 0);
19508
19509        let mut restored = vec![0u8; t.len()];
19510        let result = libsais64_unbwt(&bwt, &mut restored, &mut a, None, primary);
19511
19512        assert_eq!(result, 0);
19513        assert_eq!(restored, t);
19514    }
19515
19516    #[test]
19517    fn libsais64_real_world_aux_omp_round_trip_on_upstream_c_source() {
19518        let t = include_bytes!("../libsais/src/libsais.c");
19519        let mut bwt = vec![0u8; t.len()];
19520        let mut a = vec![0; t.len()];
19521        let r = 128;
19522        let mut samples = vec![0; (t.len() - 1) / usize::try_from(r).expect("fits") + 1];
19523
19524        let result = libsais64_bwt_aux(t, &mut bwt, &mut a, 0, None, r, &mut samples);
19525        assert_eq!(result, 0);
19526
19527        let mut restored = vec![0u8; t.len()];
19528        let result = libsais64_unbwt_aux_omp(&bwt, &mut restored, &mut a, None, r, &samples, 2);
19529
19530        assert_eq!(result, 0);
19531        assert_eq!(restored, t);
19532    }
19533
19534    #[test]
19535    fn libsais64_bwt_aux_rejects_undersized_sampling_array() {
19536        let t = b"upstream source text";
19537        let mut bwt = vec![0u8; t.len()];
19538        let mut a = vec![0; t.len()];
19539        let mut samples = vec![0; 1];
19540
19541        let result = libsais64_bwt_aux(t, &mut bwt, &mut a, 0, None, 2, &mut samples);
19542
19543        assert_eq!(result, -1);
19544
19545        let result = libsais64_bwt_aux(t, &mut bwt, &mut a, 0, None, 0, &mut samples);
19546
19547        assert_eq!(result, -1);
19548    }
19549
19550    #[test]
19551    fn libsais64_bwt_aux_omp_rejects_invalid_sampling_rate_without_panicking() {
19552        let t = b"upstream source text";
19553        let mut bwt = vec![0u8; t.len()];
19554        let mut a = vec![0; t.len()];
19555        let mut samples = vec![0; 4];
19556
19557        let result = libsais64_bwt_aux_omp(t, &mut bwt, &mut a, 0, None, 0, &mut samples, 2);
19558
19559        assert_eq!(result, -1);
19560    }
19561
19562    #[test]
19563    fn public_libsais64_empty_and_singleton_inputs_follow_public_contract() {
19564        let mut empty_sa = Vec::new();
19565        let mut empty_freq = vec![-1; ALPHABET_SIZE];
19566        assert_eq!(libsais64(b"", &mut empty_sa, 0, Some(&mut empty_freq)), 0);
19567        assert!(empty_freq.iter().all(|&value| value == 0));
19568
19569        empty_freq.fill(-1);
19570        assert_eq!(
19571            libsais64_omp(b"", &mut empty_sa, 0, Some(&mut empty_freq), 2),
19572            0
19573        );
19574        assert!(empty_freq.iter().all(|&value| value == 0));
19575
19576        empty_freq.fill(-1);
19577        assert_eq!(
19578            libsais64_gsa(b"", &mut empty_sa, 0, Some(&mut empty_freq)),
19579            0
19580        );
19581        assert!(empty_freq.iter().all(|&value| value == 0));
19582
19583        let mut empty_bwt = Vec::new();
19584        let mut empty_work = Vec::new();
19585        empty_freq.fill(-1);
19586        assert_eq!(
19587            libsais64_bwt(
19588                b"",
19589                &mut empty_bwt,
19590                &mut empty_work,
19591                0,
19592                Some(&mut empty_freq)
19593            ),
19594            0
19595        );
19596        assert!(empty_freq.iter().all(|&value| value == 0));
19597
19598        let mut empty_aux = vec![-1];
19599        empty_freq.fill(-1);
19600        assert_eq!(
19601            libsais64_bwt_aux(
19602                b"",
19603                &mut empty_bwt,
19604                &mut empty_work,
19605                0,
19606                Some(&mut empty_freq),
19607                2,
19608                &mut empty_aux
19609            ),
19610            0
19611        );
19612        assert_eq!(empty_aux[0], 0);
19613        assert!(empty_freq.iter().all(|&value| value == 0));
19614
19615        let text = b"z";
19616        let mut sa = vec![-1; 1];
19617        let mut freq = vec![-1; ALPHABET_SIZE];
19618        assert_eq!(libsais64(text, &mut sa, 0, Some(&mut freq)), 0);
19619        assert_eq!(sa, vec![0]);
19620        assert_eq!(freq[b'z' as usize], 1);
19621        assert_eq!(freq.iter().sum::<SaSint>(), 1);
19622
19623        sa.fill(-1);
19624        freq.fill(-1);
19625        let mut ctx = create_ctx().expect("context");
19626        assert_eq!(
19627            libsais64_ctx(&mut ctx, text, &mut sa, 0, Some(&mut freq)),
19628            0
19629        );
19630        assert_eq!(sa, vec![0]);
19631        assert_eq!(freq[b'z' as usize], 1);
19632        assert_eq!(freq.iter().sum::<SaSint>(), 1);
19633
19634        sa.fill(-1);
19635        freq.fill(-1);
19636        assert_eq!(libsais64_omp(text, &mut sa, 0, Some(&mut freq), 2), 0);
19637        assert_eq!(sa, vec![0]);
19638        assert_eq!(freq[b'z' as usize], 1);
19639        assert_eq!(freq.iter().sum::<SaSint>(), 1);
19640
19641        let mut gsa_sa = vec![-1; 1];
19642        let mut gsa_freq = vec![-1; ALPHABET_SIZE];
19643        assert_eq!(libsais64_gsa(b"\0", &mut gsa_sa, 0, Some(&mut gsa_freq)), 0);
19644        assert_eq!(gsa_sa, vec![0]);
19645        assert_eq!(gsa_freq[0], 1);
19646        assert_eq!(gsa_freq.iter().sum::<SaSint>(), 1);
19647
19648        let mut bwt = vec![0; 1];
19649        let mut work = vec![0; 1];
19650        freq.fill(-1);
19651        assert_eq!(
19652            libsais64_bwt(text, &mut bwt, &mut work, 0, Some(&mut freq)),
19653            1
19654        );
19655        assert_eq!(bwt, text);
19656        assert_eq!(freq[b'z' as usize], 1);
19657        assert_eq!(freq.iter().sum::<SaSint>(), 1);
19658
19659        let mut aux = vec![-1];
19660        bwt.fill(0);
19661        work.fill(0);
19662        freq.fill(-1);
19663        assert_eq!(
19664            libsais64_bwt_aux(text, &mut bwt, &mut work, 0, Some(&mut freq), 2, &mut aux),
19665            0
19666        );
19667        assert_eq!(bwt, text);
19668        assert_eq!(aux[0], 1);
19669        assert_eq!(freq[b'z' as usize], 1);
19670        assert_eq!(freq.iter().sum::<SaSint>(), 1);
19671    }
19672
19673    #[test]
19674    fn public_libsais64_rejects_invalid_aux_sampling_without_panicking() {
19675        let text = b"banana";
19676        let mut u = vec![0; text.len()];
19677        let mut a = vec![0; text.len() + 1];
19678        let mut aux = vec![0; 2];
19679
19680        assert_eq!(
19681            libsais64_bwt_aux(text, &mut u, &mut a, 0, None, 0, &mut aux),
19682            -1
19683        );
19684        assert_eq!(
19685            libsais64_bwt_aux(text, &mut u, &mut a, 0, None, 3, &mut aux),
19686            -1
19687        );
19688        assert_eq!(libsais64_unbwt_aux(text, &mut u, &mut a, None, 0, &aux), -1);
19689        assert_eq!(
19690            libsais64_unbwt_aux_omp(text, &mut u, &mut a, None, 0, &aux, 1),
19691            -1
19692        );
19693    }
19694
19695    #[test]
19696    fn libsais64_unbwt_aux_rejects_invalid_sampling_range() {
19697        let t = b"abc";
19698        let mut u = vec![0u8; t.len()];
19699        let mut a = vec![0; t.len()];
19700
19701        let result = libsais64_unbwt_aux(t, &mut u, &mut a, None, 2, &[0, 4]);
19702
19703        assert_eq!(result, -1);
19704
19705        assert_eq!(libsais64_unbwt_aux(t, &mut u, &mut a, None, 0, &[1]), -1);
19706
19707        let mut ctx = unbwt_create_ctx().expect("context");
19708        assert_eq!(
19709            libsais64_unbwt_aux_ctx(&mut ctx, t, &mut u, &mut a, None, 0, &[1]),
19710            -1
19711        );
19712        assert_eq!(
19713            libsais64_unbwt_aux_omp(t, &mut u, &mut a, None, 0, &[1], 2),
19714            -1
19715        );
19716    }
19717
19718    #[test]
19719    fn public_libsais64_omp_rejects_undersized_suffix_arrays() {
19720        let text = b"banana";
19721        let mut short_sa = vec![0; text.len() - 1];
19722        let mut int_text = vec![1, 2, 1, 0];
19723        let mut short_int_sa = vec![0; int_text.len() - 1];
19724
19725        assert_eq!(libsais64_omp(text, &mut short_sa, 0, None, 1), -1);
19726        assert_eq!(
19727            libsais64_gsa_omp(b"banana\0", &mut short_sa, 0, None, 1),
19728            -1
19729        );
19730        assert_eq!(
19731            libsais64_int_omp(&mut int_text, &mut short_int_sa, 3, 0, 1),
19732            -1
19733        );
19734    }
19735
19736    #[test]
19737    #[ignore = "large real-data regression; requires local yeast FASTA fixture"]
19738    fn public_libsais64_omp_handles_minibwa_yeast_two_strand_index_input() {
19739        let path = "/data/henriksson/github/claude/star/.tmp/yeast_conformance/Saccharomyces_cerevisiae.R64-1-1.dna.toplevel.fa";
19740        let Ok(fasta) = std::fs::read_to_string(path) else {
19741            eprintln!("skipping missing fixture: {path}");
19742            return;
19743        };
19744        let mut forward = Vec::new();
19745        for line in fasta.lines() {
19746            if line.starts_with('>') {
19747                continue;
19748            }
19749            forward.extend(line.as_bytes().iter().filter_map(|&c| match c {
19750                b'A' | b'a' => Some(0),
19751                b'C' | b'c' => Some(1),
19752                b'G' | b'g' => Some(2),
19753                b'T' | b't' => Some(3),
19754                _ => None,
19755            }));
19756        }
19757        assert!(
19758            forward.len() > 12_000_000,
19759            "fixture should exercise the large-input 64-bit path"
19760        );
19761
19762        let mut text = Vec::with_capacity(forward.len() * 2);
19763        text.extend_from_slice(&forward);
19764        text.extend(forward.iter().rev().map(|&c| 3 - c));
19765
19766        const FS: SaSint = 10_000;
19767        let mut sa = vec![0; text.len() + FS as usize];
19768        assert_eq!(libsais64_omp(&text, &mut sa, FS, None, 4), 0);
19769    }
19770
19771    #[test]
19772    #[ignore = "large real-data regression; requires local minibwa yeast fixture"]
19773    fn public_libsais64_omp_matches_plain_on_minibwa_yeast_two_strand_index_input() {
19774        let l2b_path =
19775            "/data/henriksson/github/claude/minibwa/.tmp/compare-yeast-now/ref.split.rust.l2b";
19776        let fasta_path =
19777            "/data/henriksson/github/claude/minibwa/.tmp/large-real/yeast/ref.sanitized.fa";
19778        let forward = if let Ok(bytes) = std::fs::read(l2b_path) {
19779            assert!(bytes.len() >= 64, "short l2b fixture: {l2b_path}");
19780            assert_eq!(&bytes[..4], b"L2B\x01", "bad l2b magic in {l2b_path}");
19781            let n_ctg = u64::from_le_bytes(bytes[8..16].try_into().unwrap()) as usize;
19782            let tot_len = u64::from_le_bytes(bytes[16..24].try_into().unwrap()) as usize;
19783            let n_ambi = u64::from_le_bytes(bytes[24..32].try_into().unwrap()) as usize;
19784            let n_mask = u64::from_le_bytes(bytes[32..40].try_into().unwrap()) as usize;
19785            let n_pac = u64::from_le_bytes(bytes[56..64].try_into().unwrap()) as usize;
19786            let pac_start = 64 + 8 * n_ctg + 16 * n_ambi + 16 * n_mask;
19787            assert!(
19788                bytes.len() >= pac_start + 8 * n_pac,
19789                "truncated l2b pac in {l2b_path}"
19790            );
19791            let mut pac = Vec::with_capacity(n_pac);
19792            for chunk in bytes[pac_start..pac_start + 8 * n_pac].chunks_exact(8) {
19793                pac.push(u64::from_le_bytes(chunk.try_into().unwrap()));
19794            }
19795            (0..tot_len)
19796                .map(|i| ((pac[i >> 5] >> ((i & 31) << 1)) & 3) as u8)
19797                .collect::<Vec<_>>()
19798        } else if let Ok(fasta) = std::fs::read_to_string(fasta_path) {
19799            let mut rng = 11u64;
19800            let mut forward = Vec::new();
19801            for line in fasta.lines() {
19802                if line.starts_with('>') {
19803                    continue;
19804                }
19805                forward.extend(line.bytes().map(|b| {
19806                    let mut c = match b {
19807                        b'A' | b'a' => 0,
19808                        b'C' | b'c' => 1,
19809                        b'G' | b'g' => 2,
19810                        b'T' | b't' | b'U' | b'u' => 3,
19811                        _ => {
19812                            rng = rng.wrapping_add(0x9e3779b97f4a7c15);
19813                            let mut z = rng;
19814                            z = (z ^ (z >> 30)).wrapping_mul(0xbf58476d1ce4e5b9);
19815                            z = (z ^ (z >> 27)).wrapping_mul(0x94d049bb133111eb);
19816                            4 | ((z ^ (z >> 31)) & 3) as u8
19817                        }
19818                    };
19819                    if b < b'A' || b > b'Z' {
19820                        c |= 1 << 3;
19821                    }
19822                    c & 3
19823                }));
19824            }
19825            forward
19826        } else {
19827            eprintln!("skipping missing fixtures: {l2b_path} and {fasta_path}");
19828            return;
19829        };
19830        assert!(
19831            forward.len() > 12_000_000,
19832            "fixture should exercise the minibwa yeast index workload"
19833        );
19834
19835        let mut text = Vec::with_capacity(forward.len() * 2);
19836        text.extend_from_slice(&forward);
19837        text.extend(forward.iter().rev().map(|&c| 3 - c));
19838
19839        const FS: SaSint = 10_000;
19840        let mut plain_sa = vec![0; text.len() + FS as usize + 1];
19841        let mut omp_sa = vec![0; text.len() + FS as usize + 1];
19842        assert_eq!(libsais64(&text, &mut plain_sa[1..], FS, None), 0);
19843        assert_eq!(libsais64_omp(&text, &mut omp_sa[1..], FS, None, 4), 0);
19844        plain_sa[0] = text.len() as SaSint;
19845        omp_sa[0] = text.len() as SaSint;
19846        if let Some(i) = plain_sa[..=text.len()]
19847            .iter()
19848            .zip(&omp_sa[..=text.len()])
19849            .position(|(plain, omp)| plain != omp)
19850        {
19851            panic!(
19852                "first suffix-array diff at {i}: plain={} omp={}",
19853                plain_sa[i], omp_sa[i]
19854            );
19855        }
19856    }
19857
19858    #[test]
19859    #[ignore = "large real-data regression; requires local minibwa or STAR yeast FASTA fixture"]
19860    fn direct_libsais64_main_handles_minibwa_yeast_two_strand_index_input() {
19861        let minibwa_path =
19862            "/data/henriksson/github/claude/minibwa/.tmp/large-real/yeast/ref.sanitized.fa";
19863        let star_path = "/data/henriksson/github/claude/star/.tmp/yeast_conformance/Saccharomyces_cerevisiae.R64-1-1.dna.toplevel.fa";
19864        let (path, fasta) = if let Ok(fasta) = std::fs::read_to_string(minibwa_path) {
19865            (minibwa_path, fasta)
19866        } else if let Ok(fasta) = std::fs::read_to_string(star_path) {
19867            (star_path, fasta)
19868        } else {
19869            eprintln!("skipping missing fixtures: {minibwa_path} and {star_path}");
19870            return;
19871        };
19872        let mut forward = Vec::new();
19873        for line in fasta.lines() {
19874            if line.starts_with('>') {
19875                continue;
19876            }
19877            forward.extend(line.as_bytes().iter().filter_map(|&c| match c {
19878                b'A' | b'a' => Some(0),
19879                b'C' | b'c' => Some(1),
19880                b'G' | b'g' => Some(2),
19881                b'T' | b't' => Some(3),
19882                _ => None,
19883            }));
19884        }
19885        assert!(
19886            forward.len() > 12_000_000,
19887            "fixture {path} should exercise the minibwa yeast index workload"
19888        );
19889
19890        let mut text = Vec::with_capacity(forward.len() * 2);
19891        text.extend_from_slice(&forward);
19892        text.extend(forward.iter().rev().map(|&c| 3 - c));
19893
19894        const FS: SaSint = 10_000;
19895        let mut sa = vec![0; text.len() + FS as usize];
19896        assert_eq!(
19897            libsais64_main(&text, &mut sa, LIBSAIS_FLAGS_NONE, 0, None, FS, None, 1),
19898            0
19899        );
19900    }
19901
19902    #[test]
19903    #[ignore = "large real-data regression; requires local yeast FASTA fixture"]
19904    fn public_libsais64_matches_c_on_minibwa_yeast_two_strand_index_input() {
19905        let path = "/data/henriksson/github/claude/star/.tmp/yeast_conformance/Saccharomyces_cerevisiae.R64-1-1.dna.toplevel.fa";
19906        let Ok(fasta) = std::fs::read_to_string(path) else {
19907            eprintln!("skipping missing fixture: {path}");
19908            return;
19909        };
19910        let mut forward = Vec::new();
19911        for line in fasta.lines() {
19912            if line.starts_with('>') {
19913                continue;
19914            }
19915            forward.extend(line.as_bytes().iter().filter_map(|&c| match c {
19916                b'A' | b'a' => Some(0),
19917                b'C' | b'c' => Some(1),
19918                b'G' | b'g' => Some(2),
19919                b'T' | b't' => Some(3),
19920                _ => None,
19921            }));
19922        }
19923        let mut text = Vec::with_capacity(forward.len() * 2);
19924        text.extend_from_slice(&forward);
19925        text.extend(forward.iter().rev().map(|&c| 3 - c));
19926
19927        const FS: SaSint = 10_000;
19928        let mut rust_sa = vec![0; text.len() + FS as usize];
19929        let mut c_sa = vec![0; text.len() + FS as usize];
19930        let rust_rc = libsais64(&text, &mut rust_sa, FS, None);
19931        let c_rc = unsafe {
19932            probe_public_libsais64(text.as_ptr(), c_sa.as_mut_ptr(), text.len() as SaSint, FS)
19933        };
19934        assert_eq!(rust_rc, c_rc);
19935        if let Some(i) = rust_sa[..text.len()]
19936            .iter()
19937            .zip(&c_sa[..text.len()])
19938            .position(|(r, c)| r != c)
19939        {
19940            panic!(
19941                "first suffix-array diff at {i}: rust={} c={}",
19942                rust_sa[i], c_sa[i]
19943            );
19944        }
19945    }
19946
19947    #[test]
19948    #[ignore = "large real-data regression; requires local yeast FASTA fixture"]
19949    fn public_libsais64_omp_matches_c_on_minibwa_yeast_two_strand_index_input() {
19950        let path = "/data/henriksson/github/claude/star/.tmp/yeast_conformance/Saccharomyces_cerevisiae.R64-1-1.dna.toplevel.fa";
19951        let Ok(fasta) = std::fs::read_to_string(path) else {
19952            eprintln!("skipping missing fixture: {path}");
19953            return;
19954        };
19955        let mut forward = Vec::new();
19956        for line in fasta.lines() {
19957            if line.starts_with('>') {
19958                continue;
19959            }
19960            forward.extend(line.as_bytes().iter().filter_map(|&c| match c {
19961                b'A' | b'a' => Some(0),
19962                b'C' | b'c' => Some(1),
19963                b'G' | b'g' => Some(2),
19964                b'T' | b't' => Some(3),
19965                _ => None,
19966            }));
19967        }
19968        let mut text = Vec::with_capacity(forward.len() * 2);
19969        text.extend_from_slice(&forward);
19970        text.extend(forward.iter().rev().map(|&c| 3 - c));
19971
19972        const FS: SaSint = 10_000;
19973        let mut rust_sa = vec![0; text.len() + FS as usize];
19974        let mut c_sa = vec![0; text.len() + FS as usize];
19975        let rust_rc = libsais64_omp(&text, &mut rust_sa, FS, None, 4);
19976        let c_rc = unsafe {
19977            probe_public_libsais64(text.as_ptr(), c_sa.as_mut_ptr(), text.len() as SaSint, FS)
19978        };
19979        assert_eq!(rust_rc, c_rc);
19980        if let Some(i) = rust_sa[..text.len()]
19981            .iter()
19982            .zip(&c_sa[..text.len()])
19983            .position(|(r, c)| r != c)
19984        {
19985            panic!(
19986                "first omp suffix-array diff at {i}: rust={} c={}",
19987                rust_sa[i], c_sa[i]
19988            );
19989        }
19990    }
19991
19992    #[test]
19993    fn public_libsais64_ctx_rejects_invalid_public_arguments() {
19994        let text = b"banana";
19995        let mut ctx = create_ctx().unwrap();
19996        let mut short_sa = vec![0; text.len() - 1];
19997        let mut full_sa = vec![0; text.len()];
19998        let mut short_freq = vec![0; ALPHABET_SIZE - 1];
19999        let mut short_u = vec![0; text.len() - 1];
20000        let mut full_u = vec![0; text.len()];
20001        let mut short_a = vec![0; text.len() - 1];
20002        let mut full_a = vec![0; text.len()];
20003        let mut aux = vec![0; 2];
20004
20005        assert_eq!(libsais64_ctx(&mut ctx, text, &mut short_sa, 0, None), -1);
20006        assert_eq!(
20007            libsais64_ctx(&mut ctx, text, &mut full_sa, 0, Some(&mut short_freq)),
20008            -1
20009        );
20010        assert_eq!(
20011            libsais64_gsa_ctx(&mut ctx, b"banana", &mut full_sa, 0, None),
20012            -1
20013        );
20014        assert_eq!(
20015            libsais64_gsa_ctx(&mut ctx, b"banana\0", &mut short_sa, 0, None),
20016            -1
20017        );
20018        assert_eq!(
20019            libsais64_bwt_ctx(&mut ctx, text, &mut short_u, &mut full_a, 0, None),
20020            -1
20021        );
20022        assert_eq!(
20023            libsais64_bwt_ctx(&mut ctx, text, &mut full_u, &mut short_a, 0, None),
20024            -1
20025        );
20026        assert_eq!(
20027            libsais64_bwt_ctx(
20028                &mut ctx,
20029                text,
20030                &mut full_u,
20031                &mut full_a,
20032                0,
20033                Some(&mut short_freq)
20034            ),
20035            -1
20036        );
20037        assert_eq!(
20038            libsais64_bwt_aux_ctx(
20039                &mut ctx,
20040                text,
20041                &mut full_u,
20042                &mut full_a,
20043                0,
20044                None,
20045                0,
20046                &mut aux
20047            ),
20048            -1
20049        );
20050        assert_eq!(
20051            libsais64_bwt_aux_ctx(
20052                &mut ctx,
20053                text,
20054                &mut full_u,
20055                &mut full_a,
20056                0,
20057                None,
20058                3,
20059                &mut aux
20060            ),
20061            -1
20062        );
20063        assert_eq!(
20064            libsais64_bwt_aux_ctx(
20065                &mut ctx,
20066                text,
20067                &mut full_u,
20068                &mut full_a,
20069                0,
20070                None,
20071                4,
20072                &mut []
20073            ),
20074            -1
20075        );
20076
20077        let mut missing_thread_state_ctx = Context {
20078            buckets: vec![0; 8 * ALPHABET_SIZE],
20079            thread_state: None,
20080            threads: 2,
20081        };
20082        assert_eq!(
20083            libsais64_ctx(&mut missing_thread_state_ctx, text, &mut full_sa, 0, None),
20084            -2
20085        );
20086
20087        let mut zero_thread_ctx = Context {
20088            buckets: vec![0; 8 * ALPHABET_SIZE],
20089            thread_state: None,
20090            threads: 0,
20091        };
20092        assert_eq!(
20093            libsais64_ctx(&mut zero_thread_ctx, text, &mut full_sa, 0, None),
20094            -2
20095        );
20096
20097        let mut short_thread_state_ctx = create_ctx_main(2).expect("context");
20098        short_thread_state_ctx
20099            .thread_state
20100            .as_mut()
20101            .expect("thread state")
20102            .truncate(1);
20103        assert_eq!(
20104            libsais64_ctx(&mut short_thread_state_ctx, text, &mut full_sa, 0, None),
20105            -2
20106        );
20107    }
20108
20109    #[test]
20110    fn public_libsais64_unbwt_ctx_rejects_invalid_public_arguments() {
20111        let text = b"banana";
20112        let mut bwt = vec![0; text.len()];
20113        let mut work = vec![0; text.len()];
20114        let primary = libsais64_bwt(text, &mut bwt, &mut work, 0, None);
20115        let mut ctx = unbwt_create_ctx().unwrap();
20116
20117        let mut short_u = vec![0; text.len() - 1];
20118        let mut full_u = vec![0; text.len()];
20119        let mut short_a = vec![0; text.len() - 1];
20120        let mut full_a = vec![0; text.len()];
20121        let short_freq = vec![0; ALPHABET_SIZE - 1];
20122        let good_aux = vec![primary, 4];
20123
20124        assert_eq!(
20125            libsais64_unbwt_ctx(&mut ctx, &bwt, &mut short_u, &mut full_a, None, primary),
20126            -1
20127        );
20128        assert_eq!(
20129            libsais64_unbwt_ctx(&mut ctx, &bwt, &mut full_u, &mut short_a, None, primary),
20130            -1
20131        );
20132        assert_eq!(
20133            libsais64_unbwt_ctx(
20134                &mut ctx,
20135                &bwt,
20136                &mut full_u,
20137                &mut full_a,
20138                Some(&short_freq),
20139                primary
20140            ),
20141            -1
20142        );
20143        assert_eq!(
20144            libsais64_unbwt_ctx(&mut ctx, &bwt, &mut full_u, &mut full_a, None, 0),
20145            -1
20146        );
20147        assert_eq!(
20148            libsais64_unbwt_aux_ctx(&mut ctx, &bwt, &mut full_u, &mut full_a, None, 3, &good_aux),
20149            -1
20150        );
20151        assert_eq!(
20152            libsais64_unbwt_aux_ctx(
20153                &mut ctx,
20154                &bwt,
20155                &mut full_u,
20156                &mut full_a,
20157                None,
20158                4,
20159                &[primary]
20160            ),
20161            -1
20162        );
20163
20164        let mut malformed_ctx = UnbwtContext {
20165            bucket2: Vec::new(),
20166            fastbits: Vec::new(),
20167            buckets: None,
20168            threads: 1,
20169        };
20170        assert_eq!(
20171            libsais64_unbwt_ctx(
20172                &mut malformed_ctx,
20173                &bwt,
20174                &mut full_u,
20175                &mut full_a,
20176                None,
20177                primary
20178            ),
20179            -2
20180        );
20181
20182        let mut missing_parallel_buckets_ctx = UnbwtContext {
20183            bucket2: vec![0; ALPHABET_SIZE * ALPHABET_SIZE],
20184            fastbits: vec![0; 1 + (1 << UNBWT_FASTBITS)],
20185            buckets: None,
20186            threads: 2,
20187        };
20188        assert_eq!(
20189            libsais64_unbwt_ctx(
20190                &mut missing_parallel_buckets_ctx,
20191                &bwt,
20192                &mut full_u,
20193                &mut full_a,
20194                None,
20195                primary
20196            ),
20197            -2
20198        );
20199    }
20200
20201    #[test]
20202    fn public_libsais64_lcp_helpers_reject_invalid_suffix_entries() {
20203        let text = b"banana";
20204        let mut plcp = vec![0; text.len()];
20205        let mut lcp = vec![0; text.len()];
20206        let int_text = vec![1, 2, 1, 0];
20207        let mut int_plcp = vec![0; int_text.len()];
20208
20209        assert_eq!(libsais64_plcp(text, &[0, 1, -1, 3, 4, 5], &mut plcp), -1);
20210        assert_eq!(libsais64_plcp(text, &[0, 1, 2, 3, 4, 6], &mut plcp), -1);
20211        assert_eq!(libsais64_lcp(&plcp, &[0, 1, -1, 3, 4, 5], &mut lcp), -1);
20212        assert_eq!(libsais64_lcp(&plcp, &[0, 1, 2, 3, 4, 6], &mut lcp), -1);
20213        assert_eq!(
20214            libsais64_plcp_int(&int_text, &[0, 1, -1, 3], &mut int_plcp),
20215            -1
20216        );
20217        assert_eq!(
20218            libsais64_plcp_int_omp(&int_text, &[0, 1, 2, 4], &mut int_plcp, 1),
20219            -1
20220        );
20221    }
20222
20223    #[test]
20224    fn public_libsais64_context_wrappers_match_direct_calls() {
20225        let text = b"banana";
20226        let gsa_text = b"ban\0ana\0";
20227        let mut ctx = create_ctx().unwrap();
20228
20229        let mut direct_sa = vec![0; text.len()];
20230        let mut ctx_sa = vec![0; text.len()];
20231        assert_eq!(libsais64(text, &mut direct_sa, 0, None), 0);
20232        assert_eq!(libsais64_ctx(&mut ctx, text, &mut ctx_sa, 0, None), 0);
20233        assert_eq!(ctx_sa, direct_sa);
20234
20235        let mut direct_gsa = vec![0; gsa_text.len()];
20236        let mut ctx_gsa = vec![0; gsa_text.len()];
20237        assert_eq!(libsais64_gsa(gsa_text, &mut direct_gsa, 0, None), 0);
20238        assert_eq!(
20239            libsais64_gsa_ctx(&mut ctx, gsa_text, &mut ctx_gsa, 0, None),
20240            0
20241        );
20242        assert_eq!(ctx_gsa, direct_gsa);
20243
20244        let mut direct_bwt = vec![0; text.len()];
20245        let mut direct_work = vec![0; text.len()];
20246        let mut ctx_bwt = vec![0; text.len()];
20247        let mut ctx_work = vec![0; text.len()];
20248        assert_eq!(
20249            libsais64_bwt(text, &mut direct_bwt, &mut direct_work, 0, None),
20250            libsais64_bwt_ctx(&mut ctx, text, &mut ctx_bwt, &mut ctx_work, 0, None)
20251        );
20252        assert_eq!(ctx_bwt, direct_bwt);
20253
20254        let mut direct_aux = vec![0; 2];
20255        let mut ctx_aux = vec![0; 2];
20256        assert_eq!(
20257            libsais64_bwt_aux(
20258                text,
20259                &mut direct_bwt,
20260                &mut direct_work,
20261                0,
20262                None,
20263                4,
20264                &mut direct_aux
20265            ),
20266            libsais64_bwt_aux_ctx(
20267                &mut ctx,
20268                text,
20269                &mut ctx_bwt,
20270                &mut ctx_work,
20271                0,
20272                None,
20273                4,
20274                &mut ctx_aux
20275            )
20276        );
20277        assert_eq!(ctx_bwt, direct_bwt);
20278        assert_eq!(ctx_aux, direct_aux);
20279    }
20280
20281    #[test]
20282    fn libsais64_ctx_matches_plain_entry_point_for_small_text() {
20283        let t = b"mississippi";
20284        let mut sa_plain = vec![0; t.len()];
20285        let mut sa_ctx = vec![0; t.len()];
20286        let plain = libsais64(t, &mut sa_plain, 0, None);
20287
20288        let mut ctx = create_ctx().expect("context");
20289        let with_ctx = libsais64_ctx(&mut ctx, t, &mut sa_ctx, 0, None);
20290
20291        assert_eq!(plain, 0);
20292        assert_eq!(with_ctx, 0);
20293        assert_eq!(sa_ctx, sa_plain);
20294    }
20295
20296    #[test]
20297    fn public_libsais64_unbwt_context_wrappers_match_direct_calls() {
20298        let text = b"banana";
20299        let mut bwt = vec![0; text.len()];
20300        let mut work = vec![0; text.len()];
20301        let primary = libsais64_bwt(text, &mut bwt, &mut work, 0, None);
20302        let mut ctx = unbwt_create_ctx().unwrap();
20303
20304        let mut direct = vec![0; text.len()];
20305        let mut direct_work = vec![0; text.len() + 1];
20306        let mut via_ctx = vec![0; text.len()];
20307        let mut ctx_work = vec![0; text.len() + 1];
20308        assert_eq!(
20309            libsais64_unbwt(&bwt, &mut direct, &mut direct_work, None, primary),
20310            0
20311        );
20312        assert_eq!(
20313            libsais64_unbwt_ctx(&mut ctx, &bwt, &mut via_ctx, &mut ctx_work, None, primary),
20314            0
20315        );
20316        assert_eq!(via_ctx, direct);
20317
20318        let mut aux = vec![0; 2];
20319        assert_eq!(
20320            libsais64_bwt_aux(text, &mut bwt, &mut work, 0, None, 4, &mut aux),
20321            0
20322        );
20323        assert_eq!(
20324            libsais64_unbwt_aux(&bwt, &mut direct, &mut direct_work, None, 4, &aux),
20325            0
20326        );
20327        assert_eq!(
20328            libsais64_unbwt_aux_ctx(&mut ctx, &bwt, &mut via_ctx, &mut ctx_work, None, 4, &aux),
20329            0
20330        );
20331        assert_eq!(via_ctx, direct);
20332    }
20333
20334    #[test]
20335    fn public_libsais64_ctx_frequency_wrappers_match_direct_calls() {
20336        let text = b"banana";
20337        let gsa_text = b"ban\0ana\0";
20338        let mut ctx = create_ctx().unwrap();
20339
20340        let mut direct_sa = vec![0; text.len()];
20341        let mut ctx_sa = vec![0; text.len()];
20342        let mut direct_freq = vec![-1; ALPHABET_SIZE];
20343        let mut ctx_freq = vec![-1; ALPHABET_SIZE];
20344        assert_eq!(
20345            libsais64(text, &mut direct_sa, 0, Some(&mut direct_freq)),
20346            0
20347        );
20348        assert_eq!(
20349            libsais64_ctx(&mut ctx, text, &mut ctx_sa, 0, Some(&mut ctx_freq)),
20350            0
20351        );
20352        assert_eq!(ctx_sa, direct_sa);
20353        assert_eq!(ctx_freq, direct_freq);
20354
20355        let mut direct_gsa = vec![0; gsa_text.len()];
20356        let mut ctx_gsa = vec![0; gsa_text.len()];
20357        direct_freq.fill(-1);
20358        ctx_freq.fill(-1);
20359        assert_eq!(
20360            libsais64_gsa(gsa_text, &mut direct_gsa, 0, Some(&mut direct_freq)),
20361            0
20362        );
20363        assert_eq!(
20364            libsais64_gsa_ctx(&mut ctx, gsa_text, &mut ctx_gsa, 0, Some(&mut ctx_freq)),
20365            0
20366        );
20367        assert_eq!(ctx_gsa, direct_gsa);
20368        assert_eq!(ctx_freq, direct_freq);
20369
20370        let mut direct_bwt = vec![0; text.len()];
20371        let mut direct_work = vec![0; text.len()];
20372        let mut ctx_bwt = vec![0; text.len()];
20373        let mut ctx_work = vec![0; text.len()];
20374        direct_freq.fill(-1);
20375        ctx_freq.fill(-1);
20376        assert_eq!(
20377            libsais64_bwt(
20378                text,
20379                &mut direct_bwt,
20380                &mut direct_work,
20381                0,
20382                Some(&mut direct_freq)
20383            ),
20384            libsais64_bwt_ctx(
20385                &mut ctx,
20386                text,
20387                &mut ctx_bwt,
20388                &mut ctx_work,
20389                0,
20390                Some(&mut ctx_freq)
20391            )
20392        );
20393        assert_eq!(ctx_bwt, direct_bwt);
20394        assert_eq!(ctx_freq, direct_freq);
20395
20396        let mut direct_aux = vec![0; 2];
20397        let mut ctx_aux = vec![0; 2];
20398        direct_freq.fill(-1);
20399        ctx_freq.fill(-1);
20400        assert_eq!(
20401            libsais64_bwt_aux(
20402                text,
20403                &mut direct_bwt,
20404                &mut direct_work,
20405                0,
20406                Some(&mut direct_freq),
20407                4,
20408                &mut direct_aux
20409            ),
20410            libsais64_bwt_aux_ctx(
20411                &mut ctx,
20412                text,
20413                &mut ctx_bwt,
20414                &mut ctx_work,
20415                0,
20416                Some(&mut ctx_freq),
20417                4,
20418                &mut ctx_aux
20419            )
20420        );
20421        assert_eq!(ctx_bwt, direct_bwt);
20422        assert_eq!(ctx_aux, direct_aux);
20423        assert_eq!(ctx_freq, direct_freq);
20424    }
20425
20426    #[test]
20427    fn public_libsais64_unbwt_ctx_frequency_wrappers_match_direct_calls() {
20428        let text = b"abracadabra";
20429        let mut freq = vec![0; ALPHABET_SIZE];
20430        let mut bwt = vec![0; text.len()];
20431        let mut work = vec![0; text.len()];
20432        let primary = libsais64_bwt(text, &mut bwt, &mut work, 0, Some(&mut freq));
20433        assert!(primary >= 0);
20434
20435        let mut ctx = unbwt_create_ctx().unwrap();
20436        let mut direct = vec![0; text.len()];
20437        let mut direct_work = vec![0; text.len() + 1];
20438        let mut via_ctx = vec![0; text.len()];
20439        let mut ctx_work = vec![0; text.len() + 1];
20440        assert_eq!(
20441            libsais64_unbwt(&bwt, &mut direct, &mut direct_work, Some(&freq), primary),
20442            libsais64_unbwt_ctx(
20443                &mut ctx,
20444                &bwt,
20445                &mut via_ctx,
20446                &mut ctx_work,
20447                Some(&freq),
20448                primary
20449            )
20450        );
20451        assert_eq!(via_ctx, direct);
20452        assert_eq!(via_ctx, text);
20453
20454        let mut aux = vec![0; (text.len() - 1) / 4 + 1];
20455        assert_eq!(
20456            libsais64_bwt_aux(text, &mut bwt, &mut work, 0, Some(&mut freq), 4, &mut aux),
20457            0
20458        );
20459        direct.fill(0);
20460        direct_work.fill(0);
20461        via_ctx.fill(0);
20462        ctx_work.fill(0);
20463        assert_eq!(
20464            libsais64_unbwt_aux(&bwt, &mut direct, &mut direct_work, Some(&freq), 4, &aux),
20465            libsais64_unbwt_aux_ctx(
20466                &mut ctx,
20467                &bwt,
20468                &mut via_ctx,
20469                &mut ctx_work,
20470                Some(&freq),
20471                4,
20472                &aux
20473            )
20474        );
20475        assert_eq!(via_ctx, direct);
20476        assert_eq!(via_ctx, text);
20477    }
20478
20479    #[test]
20480    fn public_libsais64_omp_wrappers_match_direct_calls() {
20481        let text = b"banana";
20482        let gsa_text = b"ban\0ana\0";
20483
20484        let mut direct_sa = vec![0; text.len()];
20485        let mut omp_sa = vec![0; text.len()];
20486        assert_eq!(libsais64(text, &mut direct_sa, 0, None), 0);
20487        assert_eq!(libsais64_omp(text, &mut omp_sa, 0, None, 2), 0);
20488        assert_eq!(omp_sa, direct_sa);
20489        assert_eq!(libsais64_omp(text, &mut omp_sa, 0, None, -1), -1);
20490
20491        let mut direct_gsa = vec![0; gsa_text.len()];
20492        let mut omp_gsa = vec![0; gsa_text.len()];
20493        assert_eq!(libsais64_gsa(gsa_text, &mut direct_gsa, 0, None), 0);
20494        assert_eq!(libsais64_gsa_omp(gsa_text, &mut omp_gsa, 0, None, 2), 0);
20495        assert_eq!(omp_gsa, direct_gsa);
20496        assert_eq!(libsais64_gsa_omp(gsa_text, &mut omp_gsa, 0, None, -1), -1);
20497
20498        let int_text = vec![2, 1, 3, 1, 0];
20499        let mut direct_int_text = int_text.clone();
20500        let mut omp_int_text = int_text.clone();
20501        let mut direct_int_sa = vec![0; int_text.len()];
20502        let mut omp_int_sa = vec![0; int_text.len()];
20503        assert_eq!(
20504            libsais64_int(&mut direct_int_text, &mut direct_int_sa, 4, 0),
20505            0
20506        );
20507        assert_eq!(
20508            libsais64_int_omp(&mut omp_int_text, &mut omp_int_sa, 4, 0, 2),
20509            0
20510        );
20511        assert_eq!(omp_int_sa, direct_int_sa);
20512        assert_eq!(
20513            libsais64_int_omp(&mut omp_int_text, &mut omp_int_sa, 4, 0, -1),
20514            -1
20515        );
20516
20517        let long_text = vec![3, 1, 4, 1, 5, 0];
20518        let mut direct_long_text = long_text.clone();
20519        let mut omp_long_text = long_text.clone();
20520        let mut direct_long_sa = vec![0; long_text.len()];
20521        let mut omp_long_sa = vec![0; long_text.len()];
20522        assert_eq!(
20523            libsais64_long(&mut direct_long_text, &mut direct_long_sa, 6, 0),
20524            0
20525        );
20526        assert_eq!(
20527            libsais64_long_omp(&mut omp_long_text, &mut omp_long_sa, 6, 0, 2),
20528            0
20529        );
20530        assert_eq!(omp_long_sa, direct_long_sa);
20531        assert_eq!(
20532            libsais64_long_omp(&mut omp_long_text, &mut omp_long_sa, 6, 0, -1),
20533            -1
20534        );
20535
20536        let mut direct_bwt = vec![0; text.len()];
20537        let mut direct_work = vec![0; text.len()];
20538        let mut omp_bwt = vec![0; text.len()];
20539        let mut omp_work = vec![0; text.len()];
20540        assert_eq!(
20541            libsais64_bwt(text, &mut direct_bwt, &mut direct_work, 0, None),
20542            libsais64_bwt_omp(text, &mut omp_bwt, &mut omp_work, 0, None, 2)
20543        );
20544        assert_eq!(omp_bwt, direct_bwt);
20545        assert_eq!(
20546            libsais64_bwt_omp(text, &mut omp_bwt, &mut omp_work, 0, None, -1),
20547            -1
20548        );
20549
20550        let mut direct_aux = vec![0; 2];
20551        let mut omp_aux = vec![0; 2];
20552        assert_eq!(
20553            libsais64_bwt_aux(
20554                text,
20555                &mut direct_bwt,
20556                &mut direct_work,
20557                0,
20558                None,
20559                4,
20560                &mut direct_aux
20561            ),
20562            libsais64_bwt_aux_omp(
20563                text,
20564                &mut omp_bwt,
20565                &mut omp_work,
20566                0,
20567                None,
20568                4,
20569                &mut omp_aux,
20570                2
20571            )
20572        );
20573        assert_eq!(omp_bwt, direct_bwt);
20574        assert_eq!(omp_aux, direct_aux);
20575        assert_eq!(
20576            libsais64_bwt_aux_omp(
20577                text,
20578                &mut omp_bwt,
20579                &mut omp_work,
20580                0,
20581                None,
20582                4,
20583                &mut omp_aux,
20584                -1
20585            ),
20586            -1
20587        );
20588    }
20589
20590    #[test]
20591    fn public_libsais64_plcp_omp_wrappers_match_direct_calls() {
20592        let text = b"banana";
20593        let mut sa = vec![0; text.len()];
20594        assert_eq!(libsais64(text, &mut sa, 0, None), 0);
20595
20596        let mut direct_plcp = vec![0; text.len()];
20597        let mut omp_plcp = vec![0; text.len()];
20598        assert_eq!(libsais64_plcp(text, &sa, &mut direct_plcp), 0);
20599        assert_eq!(libsais64_plcp_omp(text, &sa, &mut omp_plcp, 2), 0);
20600        assert_eq!(omp_plcp, direct_plcp);
20601        assert_eq!(libsais64_plcp_omp(text, &sa, &mut omp_plcp, -1), -1);
20602
20603        let mut direct_lcp = vec![0; text.len()];
20604        let mut omp_lcp = vec![0; text.len()];
20605        assert_eq!(libsais64_lcp(&direct_plcp, &sa, &mut direct_lcp), 0);
20606        assert_eq!(libsais64_lcp_omp(&direct_plcp, &sa, &mut omp_lcp, 2), 0);
20607        assert_eq!(omp_lcp, direct_lcp);
20608        assert_eq!(libsais64_lcp_omp(&direct_plcp, &sa, &mut omp_lcp, -1), -1);
20609
20610        let gsa_text = b"ban\0ana\0";
20611        let mut gsa = vec![0; gsa_text.len()];
20612        assert_eq!(libsais64_gsa(gsa_text, &mut gsa, 0, None), 0);
20613        let mut direct_gsa_plcp = vec![0; gsa_text.len()];
20614        let mut omp_gsa_plcp = vec![0; gsa_text.len()];
20615        assert_eq!(libsais64_plcp_gsa(gsa_text, &gsa, &mut direct_gsa_plcp), 0);
20616        assert_eq!(
20617            libsais64_plcp_gsa_omp(gsa_text, &gsa, &mut omp_gsa_plcp, 2),
20618            0
20619        );
20620        assert_eq!(omp_gsa_plcp, direct_gsa_plcp);
20621        assert_eq!(
20622            libsais64_plcp_gsa_omp(gsa_text, &gsa, &mut omp_gsa_plcp, -1),
20623            -1
20624        );
20625
20626        let int_text = vec![2, 1, 3, 1, 0];
20627        let mut int_text_for_sa = int_text.clone();
20628        let mut int_sa = vec![0; int_text.len()];
20629        assert_eq!(libsais64_int(&mut int_text_for_sa, &mut int_sa, 4, 0), 0);
20630        let mut direct_int_plcp = vec![0; int_text.len()];
20631        let mut omp_int_plcp = vec![0; int_text.len()];
20632        assert_eq!(
20633            libsais64_plcp_int(&int_text, &int_sa, &mut direct_int_plcp),
20634            0
20635        );
20636        assert_eq!(
20637            libsais64_plcp_int_omp(&int_text, &int_sa, &mut omp_int_plcp, 2),
20638            0
20639        );
20640        assert_eq!(omp_int_plcp, direct_int_plcp);
20641        assert_eq!(
20642            libsais64_plcp_int_omp(&int_text, &int_sa, &mut omp_int_plcp, -1),
20643            -1
20644        );
20645    }
20646
20647    #[test]
20648    fn public_libsais64_omp_frequency_wrappers_match_direct_calls() {
20649        let text = b"banana";
20650        let gsa_text = b"ban\0ana\0";
20651
20652        let mut direct_sa = vec![0; text.len()];
20653        let mut omp_sa = vec![0; text.len()];
20654        let mut direct_freq = vec![-1; ALPHABET_SIZE];
20655        let mut omp_freq = vec![-1; ALPHABET_SIZE];
20656        assert_eq!(
20657            libsais64(text, &mut direct_sa, 0, Some(&mut direct_freq)),
20658            0
20659        );
20660        assert_eq!(
20661            libsais64_omp(text, &mut omp_sa, 0, Some(&mut omp_freq), 2),
20662            0
20663        );
20664        assert_eq!(omp_sa, direct_sa);
20665        assert_eq!(omp_freq, direct_freq);
20666
20667        let mut direct_gsa = vec![0; gsa_text.len()];
20668        let mut omp_gsa = vec![0; gsa_text.len()];
20669        direct_freq.fill(-1);
20670        omp_freq.fill(-1);
20671        assert_eq!(
20672            libsais64_gsa(gsa_text, &mut direct_gsa, 0, Some(&mut direct_freq)),
20673            0
20674        );
20675        assert_eq!(
20676            libsais64_gsa_omp(gsa_text, &mut omp_gsa, 0, Some(&mut omp_freq), 2),
20677            0
20678        );
20679        assert_eq!(omp_gsa, direct_gsa);
20680        assert_eq!(omp_freq, direct_freq);
20681
20682        let mut direct_bwt = vec![0; text.len()];
20683        let mut direct_work = vec![0; text.len()];
20684        let mut omp_bwt = vec![0; text.len()];
20685        let mut omp_work = vec![0; text.len()];
20686        direct_freq.fill(-1);
20687        omp_freq.fill(-1);
20688        assert_eq!(
20689            libsais64_bwt(
20690                text,
20691                &mut direct_bwt,
20692                &mut direct_work,
20693                0,
20694                Some(&mut direct_freq)
20695            ),
20696            libsais64_bwt_omp(text, &mut omp_bwt, &mut omp_work, 0, Some(&mut omp_freq), 2)
20697        );
20698        assert_eq!(omp_bwt, direct_bwt);
20699        assert_eq!(omp_freq, direct_freq);
20700
20701        let mut direct_aux = vec![0; 2];
20702        let mut omp_aux = vec![0; 2];
20703        direct_freq.fill(-1);
20704        omp_freq.fill(-1);
20705        assert_eq!(
20706            libsais64_bwt_aux(
20707                text,
20708                &mut direct_bwt,
20709                &mut direct_work,
20710                0,
20711                Some(&mut direct_freq),
20712                4,
20713                &mut direct_aux
20714            ),
20715            libsais64_bwt_aux_omp(
20716                text,
20717                &mut omp_bwt,
20718                &mut omp_work,
20719                0,
20720                Some(&mut omp_freq),
20721                4,
20722                &mut omp_aux,
20723                2
20724            )
20725        );
20726        assert_eq!(omp_bwt, direct_bwt);
20727        assert_eq!(omp_aux, direct_aux);
20728        assert_eq!(omp_freq, direct_freq);
20729    }
20730
20731    #[test]
20732    fn public_libsais64_unbwt_omp_frequency_wrappers_match_direct_calls() {
20733        let text = b"abracadabra";
20734        let mut freq = vec![0; ALPHABET_SIZE];
20735        let mut bwt = vec![0; text.len()];
20736        let mut work = vec![0; text.len()];
20737        let primary = libsais64_bwt(text, &mut bwt, &mut work, 0, Some(&mut freq));
20738        assert!(primary >= 0);
20739
20740        let mut direct = vec![0; text.len()];
20741        let mut direct_work = vec![0; text.len() + 1];
20742        let mut omp = vec![0; text.len()];
20743        let mut omp_work = vec![0; text.len() + 1];
20744        assert_eq!(
20745            libsais64_unbwt(&bwt, &mut direct, &mut direct_work, Some(&freq), primary),
20746            libsais64_unbwt_omp(&bwt, &mut omp, &mut omp_work, Some(&freq), primary, 2)
20747        );
20748        assert_eq!(omp, direct);
20749        assert_eq!(omp, text);
20750
20751        let mut aux = vec![0; (text.len() - 1) / 4 + 1];
20752        assert_eq!(
20753            libsais64_bwt_aux(text, &mut bwt, &mut work, 0, Some(&mut freq), 4, &mut aux),
20754            0
20755        );
20756        direct.fill(0);
20757        direct_work.fill(0);
20758        omp.fill(0);
20759        omp_work.fill(0);
20760        assert_eq!(
20761            libsais64_unbwt_aux(&bwt, &mut direct, &mut direct_work, Some(&freq), 4, &aux),
20762            libsais64_unbwt_aux_omp(&bwt, &mut omp, &mut omp_work, Some(&freq), 4, &aux, 2)
20763        );
20764        assert_eq!(omp, direct);
20765        assert_eq!(omp, text);
20766    }
20767
20768    #[test]
20769    fn public_libsais64_bwt_aux_matches_upstream_c() {
20770        for text in [
20771            b"banana".as_slice(),
20772            b"mississippi",
20773            b"abracadabra",
20774            b"AAAAAAAAAAAAAAAA",
20775            b"zyxwvutsrqponmlk",
20776        ] {
20777            assert_libsais64_bwt_aux_matches_c(text, 4);
20778        }
20779    }
20780
20781    #[test]
20782    fn public_libsais64_frequency_outputs_match_upstream_c() {
20783        assert_libsais64_freq_outputs_match_c(b"banana", b"ban\0ana\0");
20784    }
20785
20786    #[test]
20787    fn public_libsais64_unbwt_with_frequency_matches_upstream_c() {
20788        assert_libsais64_unbwt_freq_matches_c(b"abracadabra");
20789    }
20790
20791    #[test]
20792    fn public_libsais64_unbwt_matches_upstream_c() {
20793        for text in [
20794            b"a".as_slice(),
20795            b"banana",
20796            b"mississippi",
20797            b"abracadabra",
20798            b"AAAAAAAAAAAAAAAA",
20799            b"zyxwvutsrqponmlk",
20800        ] {
20801            assert_libsais64_unbwt_matches_c(text);
20802        }
20803    }
20804
20805    #[test]
20806    fn public_libsais64_unbwt_aux_matches_upstream_c() {
20807        for text in [
20808            b"banana".as_slice(),
20809            b"mississippi",
20810            b"abracadabra",
20811            b"AAAAAAAAAAAAAAAA",
20812            b"zyxwvutsrqponmlk",
20813        ] {
20814            assert_libsais64_unbwt_aux_matches_c(text, 4);
20815        }
20816    }
20817
20818    #[test]
20819    fn public_libsais64_bwt_aux_round_trips() {
20820        for text in [
20821            b"banana".as_slice(),
20822            b"mississippi",
20823            b"abracadabra",
20824            b"AAAAAAAAAAAAAAAA",
20825            b"zyxwvutsrqponmlk",
20826        ] {
20827            assert_libsais64_bwt_aux_round_trips(text, 4);
20828        }
20829    }
20830}