Skip to main content

libsais_rs/
lib.rs

1//! Rust translation of upstream [libsais](https://github.com/IlyaGrebnov/libsais)
2//! 2.10.4 by Ilya Grebnov.
3//!
4//! This module exposes the 32-bit suffix array, BWT, unBWT, PLCP and LCP entry
5//! points (mirroring `libsais.h`). The 16-bit (`libsais16`), 64-bit
6//! (`libsais64`) and 16-bit/64-bit (`libsais16x64`) variants live in the
7//! sibling modules.
8
9use std::marker::PhantomData;
10use std::mem;
11
12use rayon::prelude::*;
13
14pub mod libsais16;
15pub mod libsais16x64;
16pub mod libsais64;
17pub use libsais16::{libsais16, SaSint as SaSint16, SaUint as SaUint16};
18pub use libsais16x64::{libsais16x64, SaSint as SaSint16x64, SaUint as SaUint16x64};
19pub use libsais64::{libsais64, SaSint as SaSint64, SaUint as SaUint64};
20
/// Signed 32-bit integer used for suffix array entries and lengths.
pub type SaSint = i32;
/// Unsigned 32-bit counterpart of [`SaSint`].
pub type SaUint = u32;
/// Pointer-sized signed integer used for positions and loop indices.
pub type FastSint = isize;
/// Pointer-sized unsigned integer used for bucket indexing.
pub type FastUint = usize;

/// Width of [`SaSint`] in bits.
pub const SAINT_BIT: u32 = SaSint::BITS;
/// Largest value representable by [`SaSint`].
pub const SAINT_MAX: SaSint = SaSint::MAX;
/// Smallest value representable by [`SaSint`]; as a bit pattern only the sign bit is set.
pub const SAINT_MIN: SaSint = SaSint::MIN;

/// Number of distinct byte symbols (256).
pub const ALPHABET_SIZE: usize = 1 << u8::BITS;
/// Exponent used to size the `fastbits` table of the reverse-BWT context (`1 + 2^17` entries).
pub const UNBWT_FASTBITS: usize = 17;

/// Bit position derived as `SAINT_BIT - 1`.
pub const SUFFIX_GROUP_BIT: u32 = SAINT_BIT - 1;
/// Single-bit mask with bit `SUFFIX_GROUP_BIT - 1` set.
pub const SUFFIX_GROUP_MARKER: SaSint = 1 << (SUFFIX_GROUP_BIT - 1);

/// Size constant for a local buffer; its consumers are outside this part of the file.
pub const LIBSAIS_LOCAL_BUFFER_SIZE: usize = 2_000;
/// Number of cache entries allocated for every per-thread state.
pub const LIBSAIS_PER_THREAD_CACHE_SIZE: usize = 24 * 1024;

/// Flag value with no bits set.
pub const LIBSAIS_FLAGS_NONE: SaSint = 0;
/// Flag bit 0 (named for BWT mode; presumably selects BWT output — confirm at the call sites).
pub const LIBSAIS_FLAGS_BWT: SaSint = 1 << 0;
/// Flag bit 1 (named for GSA mode; presumably selects generalized suffix arrays — confirm at the call sites).
pub const LIBSAIS_FLAGS_GSA: SaSint = 1 << 1;
42
/// Runs `f` on the calling thread and returns whatever it produces.
///
/// The `_threads` argument is accepted for call-site symmetry but is not used
/// by this implementation.
pub(crate) fn run_rayon_with_threads<R>(_threads: usize, f: impl FnOnce() -> R + Send) -> R
where
    R: Send,
{
    f()
}
46
47#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
48pub struct ThreadCache {
49    pub symbol: SaSint,
50    pub index: SaSint,
51}
52
53#[derive(Clone, Debug, PartialEq, Eq)]
54pub struct ThreadState {
55    pub position: FastSint,
56    pub count: FastSint,
57    pub m: FastSint,
58    pub last_lms_suffix: FastSint,
59    pub buckets: Vec<SaSint>,
60    pub cache: Vec<ThreadCache>,
61}
62
63impl ThreadState {
64    fn new() -> Self {
65        Self {
66            position: 0,
67            count: 0,
68            m: 0,
69            last_lms_suffix: 0,
70            buckets: vec![0; 4 * ALPHABET_SIZE],
71            cache: vec![ThreadCache::default(); LIBSAIS_PER_THREAD_CACHE_SIZE],
72        }
73    }
74}
75
76#[derive(Clone, Debug, PartialEq, Eq)]
77pub struct Context {
78    pub buckets: Vec<SaSint>,
79    pub thread_state: Option<Vec<ThreadState>>,
80    pub threads: FastSint,
81}
82
83#[derive(Clone, Debug, PartialEq, Eq)]
84pub struct UnbwtContext {
85    pub bucket2: Vec<SaUint>,
86    pub fastbits: Vec<u16>,
87    pub buckets: Option<Vec<SaUint>>,
88    pub threads: FastSint,
89}
90
91/// Internal helper: buckets index2.
92#[doc(hidden)]
93pub fn buckets_index2(c: FastUint, s: FastUint) -> FastUint {
94    (c << 1) + s
95}
96
97/// Internal helper: buckets index4.
98#[doc(hidden)]
99pub fn buckets_index4(c: FastUint, s: FastUint) -> FastUint {
100    (c << 2) + s
101}
102
/// Rounds `value` up to the nearest multiple of `alignment`.
///
/// `alignment` must be a power of two; this is checked in debug builds only.
#[doc(hidden)]
pub fn align_up(value: usize, alignment: usize) -> usize {
    debug_assert!(alignment.is_power_of_two());
    let bumped = value + alignment - 1;
    let keep_mask = !(alignment - 1);
    bumped & keep_mask
}
109
110/// Internal helper: alloc thread state.
111#[doc(hidden)]
112pub fn alloc_thread_state(threads: SaSint) -> Option<Vec<ThreadState>> {
113    if threads <= 0 {
114        return None;
115    }
116
117    let len = usize::try_from(threads).ok()?;
118    Some((0..len).map(|_| ThreadState::new()).collect())
119}
120
121/// Internal helper: create ctx main.
122#[doc(hidden)]
123pub fn create_ctx_main(threads: SaSint) -> Option<Context> {
124    if threads <= 0 {
125        return None;
126    }
127
128    let thread_state = if threads > 1 {
129        Some(alloc_thread_state(threads)?)
130    } else {
131        None
132    };
133
134    Some(Context {
135        buckets: vec![0; 8 * ALPHABET_SIZE],
136        thread_state,
137        threads: threads as FastSint,
138    })
139}
140
141/// Creates the libsais context that allows reusing allocated memory with each libsais operation.
142///
143/// In multi-threaded environments, use one context per thread for parallel executions.
144///
145/// Returns the context, or `None` on allocation failure.
146pub fn create_ctx() -> Option<Context> {
147    create_ctx_main(1)
148}
149
150/// Destroys the libsais context and frees previously allocated memory.
151pub fn free_ctx(_ctx: Context) {}
152
153/// Internal helper: unbwt create ctx main.
154#[doc(hidden)]
155pub fn unbwt_create_ctx_main(threads: SaSint) -> Option<UnbwtContext> {
156    if threads <= 0 {
157        return None;
158    }
159
160    let buckets = if threads > 1 {
161        let len = usize::try_from(threads).ok()? * (ALPHABET_SIZE + ALPHABET_SIZE * ALPHABET_SIZE);
162        Some(vec![0; len])
163    } else {
164        None
165    };
166
167    Some(UnbwtContext {
168        bucket2: vec![0; ALPHABET_SIZE * ALPHABET_SIZE],
169        fastbits: vec![0; 1 + (1 << UNBWT_FASTBITS)],
170        buckets,
171        threads: threads as FastSint,
172    })
173}
174
175/// Internal helper: unbwt free ctx main.
176#[doc(hidden)]
177pub fn unbwt_free_ctx_main(_ctx: UnbwtContext) {}
178
179/// Creates the libsais reverse-BWT context that allows reusing allocated memory with each `libsais_unbwt_*` operation.
180///
181/// In multi-threaded environments, use one context per thread for parallel executions.
182///
183/// Returns the context, or `None` on allocation failure.
184pub fn unbwt_create_ctx() -> Option<UnbwtContext> {
185    unbwt_create_ctx_main(1)
186}
187
188/// Destroys the libsais reverse-BWT context and frees previously allocated memory.
189pub fn unbwt_free_ctx(_ctx: UnbwtContext) {}
190
191/// Internal helper: count negative marked suffixes.
192#[doc(hidden)]
193pub fn count_negative_marked_suffixes(
194    sa: &[SaSint],
195    block_start: FastSint,
196    block_size: FastSint,
197) -> SaSint {
198    block_slice(sa, block_start, block_size)
199        .iter()
200        .map(|&value| SaSint::from(value < 0))
201        .sum()
202}
203
204/// Internal helper: count zero marked suffixes.
205#[doc(hidden)]
206pub fn count_zero_marked_suffixes(
207    sa: &[SaSint],
208    block_start: FastSint,
209    block_size: FastSint,
210) -> SaSint {
211    block_slice(sa, block_start, block_size)
212        .iter()
213        .map(|&value| SaSint::from(value == 0))
214        .sum()
215}
216
217/// Internal helper: place cached suffixes.
218#[doc(hidden)]
219pub fn place_cached_suffixes(
220    sa: &mut [SaSint],
221    cache: &[ThreadCache],
222    block_start: FastSint,
223    block_size: FastSint,
224) {
225    let start = usize::try_from(block_start).expect("block_start must be non-negative");
226    let len = usize::try_from(block_size).expect("block_size must be non-negative");
227    let entries = if cache.len() >= start + len {
228        &cache[start..start + len]
229    } else {
230        &cache[..len]
231    };
232
233    for entry in entries {
234        let slot = usize::try_from(entry.symbol).expect("cache symbol must be non-negative");
235        sa[slot] = entry.index;
236    }
237}
238
239/// Internal helper: compact and place cached suffixes.
240#[doc(hidden)]
241pub fn compact_and_place_cached_suffixes(
242    sa: &mut [SaSint],
243    cache: &mut [ThreadCache],
244    block_start: FastSint,
245    block_size: FastSint,
246) {
247    let start = usize::try_from(block_start).expect("block_start must be non-negative");
248    let len = usize::try_from(block_size).expect("block_size must be non-negative");
249    let read_start = if cache.len() >= start + len { start } else { 0 };
250    let read_end = read_start + len;
251
252    let mut write = read_start;
253    for read in read_start..read_end {
254        let entry = cache[read];
255        if entry.symbol >= 0 {
256            cache[write] = entry;
257            write += 1;
258        }
259    }
260
261    place_cached_suffixes(sa, cache, block_start, (write - read_start) as FastSint);
262}
263
264/// Internal helper: flip suffix markers (OpenMP variant).
265#[doc(hidden)]
266pub fn flip_suffix_markers_omp(sa: &mut [SaSint], l: SaSint, threads: SaSint) {
267    let len = usize::try_from(l).expect("l must be non-negative");
268    let omp_num_threads = if threads > 1 && l >= 65_536 {
269        usize::try_from(threads).expect("threads must be non-negative")
270    } else {
271        1
272    };
273    if omp_num_threads > 1 {
274        let chunk_size = ((len / omp_num_threads) & !15usize).max(16);
275        run_rayon_with_threads(omp_num_threads, || {
276            sa[..len].par_chunks_mut(chunk_size).for_each(|chunk| {
277                for value in chunk {
278                    *value ^= SAINT_MIN;
279                }
280            });
281        });
282        return;
283    }
284
285    let omp_block_stride = (len / omp_num_threads) & !15usize;
286    for omp_thread_num in 0..omp_num_threads {
287        let omp_block_start = omp_thread_num * omp_block_stride;
288        let omp_block_size = if omp_thread_num + 1 < omp_num_threads {
289            omp_block_stride
290        } else {
291            len - omp_block_start
292        };
293        for value in &mut sa[omp_block_start..omp_block_start + omp_block_size] {
294            *value ^= SAINT_MIN;
295        }
296    }
297}
298
299/// Internal helper: gather lms suffixes 8u.
300#[doc(hidden)]
301pub fn gather_lms_suffixes_8u(
302    t: &[u8],
303    sa: &mut [SaSint],
304    n: SaSint,
305    mut m: FastSint,
306    omp_block_start: FastSint,
307    omp_block_size: FastSint,
308) {
309    if omp_block_size <= 0 {
310        return;
311    }
312
313    let n = usize::try_from(n).expect("n must be non-negative");
314    let block_start =
315        usize::try_from(omp_block_start).expect("omp_block_start must be non-negative");
316    let block_size = usize::try_from(omp_block_size).expect("omp_block_size must be non-negative");
317
318    let mut j = block_start + block_size;
319    let mut c0 = t[block_start + block_size - 1] as FastSint;
320    let mut c1 = -1;
321    while j < n {
322        c1 = t[j] as FastSint;
323        if c1 != c0 {
324            break;
325        }
326        j += 1;
327    }
328
329    let mut f0 = usize::from(c0 >= c1);
330    let mut f1: usize;
331    let mut i = block_start + block_size - 2;
332    let limit = block_start + 3;
333
334    while i >= limit {
335        c1 = t[i] as FastSint;
336        f1 = usize::from(c1 > (c0 - f0 as FastSint));
337        sa[usize::try_from(m).expect("m must be non-negative")] = (i + 1) as SaSint;
338        m -= (f1 & !f0) as FastSint;
339
340        c0 = t[i - 1] as FastSint;
341        f0 = usize::from(c0 > (c1 - f1 as FastSint));
342        sa[usize::try_from(m).expect("m must be non-negative")] = i as SaSint;
343        m -= (f0 & !f1) as FastSint;
344
345        c1 = t[i - 2] as FastSint;
346        f1 = usize::from(c1 > (c0 - f0 as FastSint));
347        sa[usize::try_from(m).expect("m must be non-negative")] = (i - 1) as SaSint;
348        m -= (f1 & !f0) as FastSint;
349
350        c0 = t[i - 3] as FastSint;
351        f0 = usize::from(c0 > (c1 - f1 as FastSint));
352        sa[usize::try_from(m).expect("m must be non-negative")] = (i - 2) as SaSint;
353        m -= (f0 & !f1) as FastSint;
354
355        if i < 4 {
356            break;
357        }
358        i -= 4;
359    }
360
361    let tail_limit = limit - 3;
362    while i >= tail_limit {
363        c1 = c0;
364        c0 = t[i] as FastSint;
365        f1 = f0;
366        f0 = usize::from(c0 > (c1 - f1 as FastSint));
367        sa[usize::try_from(m).expect("m must be non-negative")] = (i + 1) as SaSint;
368        m -= (f0 & !f1) as FastSint;
369        if i == 0 {
370            break;
371        }
372        i -= 1;
373    }
374
375    sa[usize::try_from(m).expect("m must be non-negative")] = (i + 1) as SaSint;
376}
377
378/// Internal helper: gather lms suffixes 8u (OpenMP variant).
379#[doc(hidden)]
380pub fn gather_lms_suffixes_8u_omp(
381    t: &[u8],
382    sa: &mut [SaSint],
383    n: SaSint,
384    threads: SaSint,
385    thread_state: &mut [ThreadState],
386) {
387    let n_usize = usize::try_from(n).expect("n must be non-negative");
388    let omp_num_threads = if threads > 1 && n >= 65_536 {
389        usize::try_from(threads)
390            .expect("threads must be non-negative")
391            .min(thread_state.len())
392            .max(1)
393    } else {
394        1
395    };
396    if omp_num_threads == 1 {
397        gather_lms_suffixes_8u(t, sa, n, n as FastSint - 1, 0, n as FastSint);
398        return;
399    }
400
401    let omp_block_stride = (n_usize / omp_num_threads) & !15usize;
402    let mut suffix_counts_after = vec![0 as FastSint; omp_num_threads];
403    let mut m = 0 as FastSint;
404    for omp_thread_num in (0..omp_num_threads).rev() {
405        suffix_counts_after[omp_thread_num] = m;
406        m += thread_state[omp_thread_num].m;
407    }
408
409    for omp_thread_num in 0..omp_num_threads {
410        let omp_block_start = omp_thread_num * omp_block_stride;
411        let omp_block_size = if omp_thread_num + 1 < omp_num_threads {
412            omp_block_stride
413        } else {
414            n_usize - omp_block_start
415        };
416        gather_lms_suffixes_8u(
417            t,
418            sa,
419            n,
420            n as FastSint - 1 - suffix_counts_after[omp_thread_num],
421            omp_block_start as FastSint,
422            omp_block_size as FastSint,
423        );
424    }
425
426    for omp_thread_num in 0..omp_num_threads {
427        if thread_state[omp_thread_num].m > 0 {
428            let dst = usize::try_from(n as FastSint - 1 - suffix_counts_after[omp_thread_num])
429                .expect("destination must be non-negative");
430            sa[dst] = thread_state[omp_thread_num].last_lms_suffix as SaSint;
431        }
432    }
433}
434
435/// Internal helper: gather lms suffixes 32s.
436#[doc(hidden)]
437pub fn gather_lms_suffixes_32s(t: &[SaSint], sa: &mut [SaSint], n: SaSint) -> SaSint {
438    let n_usize = usize::try_from(n).expect("n must be non-negative");
439    let mut i = n as FastSint - 2;
440    let mut m = n_usize - 1;
441    let mut f0 = 1usize;
442    let mut f1: usize;
443    let mut c0 = t[n_usize - 1] as FastSint;
444    let mut c1: FastSint;
445
446    while i >= 3 {
447        c1 = t[i as usize] as FastSint;
448        f1 = usize::from(c1 > (c0 - f0 as FastSint));
449        sa[m] = (i + 1) as SaSint;
450        m -= f1 & !f0;
451
452        c0 = t[(i - 1) as usize] as FastSint;
453        f0 = usize::from(c0 > (c1 - f1 as FastSint));
454        sa[m] = i as SaSint;
455        m -= f0 & !f1;
456
457        c1 = t[(i - 2) as usize] as FastSint;
458        f1 = usize::from(c1 > (c0 - f0 as FastSint));
459        sa[m] = (i - 1) as SaSint;
460        m -= f1 & !f0;
461
462        c0 = t[(i - 3) as usize] as FastSint;
463        f0 = usize::from(c0 > (c1 - f1 as FastSint));
464        sa[m] = (i - 2) as SaSint;
465        m -= f0 & !f1;
466
467        i -= 4;
468    }
469
470    while i >= 0 {
471        c1 = c0;
472        c0 = t[i as usize] as FastSint;
473        f1 = f0;
474        f0 = usize::from(c0 > (c1 - f1 as FastSint));
475        sa[m] = (i + 1) as SaSint;
476        m -= f0 & !f1;
477        i -= 1;
478    }
479
480    (n_usize - 1 - m) as SaSint
481}
482
483/// Internal helper: gather compacted lms suffixes 32s.
484#[doc(hidden)]
485pub fn gather_compacted_lms_suffixes_32s(t: &[SaSint], sa: &mut [SaSint], n: SaSint) -> SaSint {
486    let n_usize = usize::try_from(n).expect("n must be non-negative");
487    let mut i = n as FastSint - 2;
488    let mut m = n_usize - 1;
489    let mut f0 = 1usize;
490    let mut f1: usize;
491    let mut c0 = t[n_usize - 1] as FastSint;
492    let mut c1: FastSint;
493
494    while i >= 3 {
495        c1 = t[i as usize] as FastSint;
496        f1 = usize::from(c1 > (c0 - f0 as FastSint));
497        sa[m] = (i + 1) as SaSint;
498        m -= f1 & !f0 & usize::from(c0 >= 0);
499
500        c0 = t[(i - 1) as usize] as FastSint;
501        f0 = usize::from(c0 > (c1 - f1 as FastSint));
502        sa[m] = i as SaSint;
503        m -= f0 & !f1 & usize::from(c1 >= 0);
504
505        c1 = t[(i - 2) as usize] as FastSint;
506        f1 = usize::from(c1 > (c0 - f0 as FastSint));
507        sa[m] = (i - 1) as SaSint;
508        m -= f1 & !f0 & usize::from(c0 >= 0);
509
510        c0 = t[(i - 3) as usize] as FastSint;
511        f0 = usize::from(c0 > (c1 - f1 as FastSint));
512        sa[m] = (i - 2) as SaSint;
513        m -= f0 & !f1 & usize::from(c1 >= 0);
514
515        i -= 4;
516    }
517
518    while i >= 0 {
519        c1 = c0;
520        c0 = t[i as usize] as FastSint;
521        f1 = f0;
522        f0 = usize::from(c0 > (c1 - f1 as FastSint));
523        sa[m] = (i + 1) as SaSint;
524        m -= f0 & !f1 & usize::from(c1 >= 0);
525        i -= 1;
526    }
527
528    (n_usize - 1 - m) as SaSint
529}
530
531/// Internal helper: count lms suffixes 32s 4k.
532#[doc(hidden)]
533pub fn count_lms_suffixes_32s_4k(t: &[SaSint], n: SaSint, k: SaSint, buckets: &mut [SaSint]) {
534    buckets.fill(0);
535    let n_usize = usize::try_from(n).expect("n must be non-negative");
536    let _k_usize = usize::try_from(k).expect("k must be non-negative");
537    let mut i = n as FastSint - 2;
538    let mut f0 = 1usize;
539    let mut f1: usize;
540    let mut c0 = t[n_usize - 1] as FastSint;
541    let mut c1: FastSint;
542
543    while i >= 3 {
544        c1 = t[i as usize] as FastSint;
545        f1 = usize::from(c1 > (c0 - f0 as FastSint));
546        buckets[buckets_index4((c0 as SaSint & SAINT_MAX) as usize, f0 + f0 + f1)] += 1;
547
548        c0 = t[(i - 1) as usize] as FastSint;
549        f0 = usize::from(c0 > (c1 - f1 as FastSint));
550        buckets[buckets_index4((c1 as SaSint & SAINT_MAX) as usize, f1 + f1 + f0)] += 1;
551
552        c1 = t[(i - 2) as usize] as FastSint;
553        f1 = usize::from(c1 > (c0 - f0 as FastSint));
554        buckets[buckets_index4((c0 as SaSint & SAINT_MAX) as usize, f0 + f0 + f1)] += 1;
555
556        c0 = t[(i - 3) as usize] as FastSint;
557        f0 = usize::from(c0 > (c1 - f1 as FastSint));
558        buckets[buckets_index4((c1 as SaSint & SAINT_MAX) as usize, f1 + f1 + f0)] += 1;
559
560        i -= 4;
561    }
562
563    while i >= 0 {
564        c1 = c0;
565        c0 = t[i as usize] as FastSint;
566        f1 = f0;
567        f0 = usize::from(c0 > (c1 - f1 as FastSint));
568        buckets[buckets_index4((c1 as SaSint & SAINT_MAX) as usize, f1 + f1 + f0)] += 1;
569        i -= 1;
570    }
571
572    buckets[buckets_index4((c0 as SaSint & SAINT_MAX) as usize, f0 + f0)] += 1;
573}
574
575/// Internal helper: count lms suffixes 32s 2k.
576#[doc(hidden)]
577pub fn count_lms_suffixes_32s_2k(t: &[SaSint], n: SaSint, k: SaSint, buckets: &mut [SaSint]) {
578    buckets.fill(0);
579    let n_usize = usize::try_from(n).expect("n must be non-negative");
580    let _k_usize = usize::try_from(k).expect("k must be non-negative");
581    let mut i = n as FastSint - 2;
582    let mut f0 = 1usize;
583    let mut f1: usize;
584    let mut c0 = t[n_usize - 1] as FastSint;
585    let mut c1: FastSint;
586
587    while i >= 3 {
588        c1 = t[i as usize] as FastSint;
589        f1 = usize::from(c1 > (c0 - f0 as FastSint));
590        buckets[buckets_index2((c0 as SaSint & SAINT_MAX) as usize, f1 & !f0)] += 1;
591
592        c0 = t[(i - 1) as usize] as FastSint;
593        f0 = usize::from(c0 > (c1 - f1 as FastSint));
594        buckets[buckets_index2((c1 as SaSint & SAINT_MAX) as usize, f0 & !f1)] += 1;
595
596        c1 = t[(i - 2) as usize] as FastSint;
597        f1 = usize::from(c1 > (c0 - f0 as FastSint));
598        buckets[buckets_index2((c0 as SaSint & SAINT_MAX) as usize, f1 & !f0)] += 1;
599
600        c0 = t[(i - 3) as usize] as FastSint;
601        f0 = usize::from(c0 > (c1 - f1 as FastSint));
602        buckets[buckets_index2((c1 as SaSint & SAINT_MAX) as usize, f0 & !f1)] += 1;
603
604        i -= 4;
605    }
606
607    while i >= 0 {
608        c1 = c0;
609        c0 = t[i as usize] as FastSint;
610        f1 = f0;
611        f0 = usize::from(c0 > (c1 - f1 as FastSint));
612        buckets[buckets_index2((c1 as SaSint & SAINT_MAX) as usize, f0 & !f1)] += 1;
613        i -= 1;
614    }
615
616    buckets[buckets_index2((c0 as SaSint & SAINT_MAX) as usize, 0)] += 1;
617}
618
619/// Internal helper: count compacted lms suffixes 32s 2k.
620#[doc(hidden)]
621pub fn count_compacted_lms_suffixes_32s_2k(
622    t: &[SaSint],
623    n: SaSint,
624    k: SaSint,
625    buckets: &mut [SaSint],
626) {
627    buckets.fill(0);
628    let n_usize = usize::try_from(n).expect("n must be non-negative");
629    let _k_usize = usize::try_from(k).expect("k must be non-negative");
630    let mut i = n as FastSint - 2;
631    let mut f0 = 1usize;
632    let mut f1: usize;
633    let mut c0 = t[n_usize - 1] as FastSint;
634    let mut c1: FastSint;
635
636    while i >= 3 {
637        c1 = t[i as usize] as FastSint;
638        f1 = usize::from(c1 > (c0 - f0 as FastSint));
639        buckets[buckets_index2((c0 as SaSint & SAINT_MAX) as usize, f1 & !f0)] += 1;
640
641        c0 = t[(i - 1) as usize] as FastSint;
642        f0 = usize::from(c0 > (c1 - f1 as FastSint));
643        buckets[buckets_index2((c1 as SaSint & SAINT_MAX) as usize, f0 & !f1)] += 1;
644
645        c1 = t[(i - 2) as usize] as FastSint;
646        f1 = usize::from(c1 > (c0 - f0 as FastSint));
647        buckets[buckets_index2((c0 as SaSint & SAINT_MAX) as usize, f1 & !f0)] += 1;
648
649        c0 = t[(i - 3) as usize] as FastSint;
650        f0 = usize::from(c0 > (c1 - f1 as FastSint));
651        buckets[buckets_index2((c1 as SaSint & SAINT_MAX) as usize, f0 & !f1)] += 1;
652
653        i -= 4;
654    }
655
656    while i >= 0 {
657        c1 = c0;
658        c0 = t[i as usize] as FastSint;
659        f1 = f0;
660        f0 = usize::from(c0 > (c1 - f1 as FastSint));
661        buckets[buckets_index2((c1 as SaSint & SAINT_MAX) as usize, f0 & !f1)] += 1;
662        i -= 1;
663    }
664
665    buckets[buckets_index2((c0 as SaSint & SAINT_MAX) as usize, 0)] += 1;
666}
667
668/// Internal helper: count and gather lms suffixes 8u.
669#[doc(hidden)]
670pub fn count_and_gather_lms_suffixes_8u(
671    t: &[u8],
672    sa: &mut [SaSint],
673    n: SaSint,
674    buckets: &mut [SaSint],
675    omp_block_start: FastSint,
676    omp_block_size: FastSint,
677) -> SaSint {
678    buckets.fill(0);
679    let n = n as FastSint;
680    let mut m = omp_block_start + omp_block_size - 1;
681
682    if omp_block_size > 0 {
683        let prefetch_distance = 256 as FastSint;
684        let mut j = m + 1;
685        let mut c0 = t[m as usize] as FastSint;
686        let mut c1 = -1;
687        while j < n {
688            c1 = t[j as usize] as FastSint;
689            if c1 != c0 {
690                break;
691            }
692            j += 1;
693        }
694
695        let mut f0 = usize::from(c0 >= c1);
696        let mut f1: usize;
697        let mut i = m - 1;
698        let limit = omp_block_start + 3;
699
700        while i >= limit {
701            let _prefetch_index = i - prefetch_distance;
702            c1 = t[i as usize] as FastSint;
703            f1 = usize::from(c1 > (c0 - f0 as FastSint));
704            sa[m as usize] = (i + 1) as SaSint;
705            m -= (f1 & !f0) as FastSint;
706            buckets[buckets_index4((c0 as SaSint & SAINT_MAX) as usize, f0 + f0 + f1)] += 1;
707
708            c0 = t[(i - 1) as usize] as FastSint;
709            f0 = usize::from(c0 > (c1 - f1 as FastSint));
710            sa[m as usize] = i as SaSint;
711            m -= (f0 & !f1) as FastSint;
712            buckets[buckets_index4((c1 as SaSint & SAINT_MAX) as usize, f1 + f1 + f0)] += 1;
713
714            c1 = t[(i - 2) as usize] as FastSint;
715            f1 = usize::from(c1 > (c0 - f0 as FastSint));
716            sa[m as usize] = (i - 1) as SaSint;
717            m -= (f1 & !f0) as FastSint;
718            buckets[buckets_index4((c0 as SaSint & SAINT_MAX) as usize, f0 + f0 + f1)] += 1;
719
720            c0 = t[(i - 3) as usize] as FastSint;
721            f0 = usize::from(c0 > (c1 - f1 as FastSint));
722            sa[m as usize] = (i - 2) as SaSint;
723            m -= (f0 & !f1) as FastSint;
724            buckets[buckets_index4((c1 as SaSint & SAINT_MAX) as usize, f1 + f1 + f0)] += 1;
725
726            i -= 4;
727        }
728
729        let tail_limit = limit - 3;
730        while i >= tail_limit {
731            c1 = c0;
732            c0 = t[i as usize] as FastSint;
733            f1 = f0;
734            f0 = usize::from(c0 > (c1 - f1 as FastSint));
735            sa[m as usize] = (i + 1) as SaSint;
736            m -= (f0 & !f1) as FastSint;
737            buckets[buckets_index4((c1 as SaSint & SAINT_MAX) as usize, f1 + f1 + f0)] += 1;
738            i -= 1;
739        }
740
741        c1 = if i >= 0 {
742            t[i as usize] as FastSint
743        } else {
744            -1
745        };
746        f1 = usize::from(c1 > (c0 - f0 as FastSint));
747        sa[m as usize] = (i + 1) as SaSint;
748        m -= (f1 & !f0) as FastSint;
749        buckets[buckets_index4((c0 as SaSint & SAINT_MAX) as usize, f0 + f0 + f1)] += 1;
750    }
751
752    (omp_block_start + omp_block_size - 1 - m) as SaSint
753}
754
755/// Internal helper: count and gather lms suffixes 8u (OpenMP variant).
756#[doc(hidden)]
757pub fn count_and_gather_lms_suffixes_8u_omp(
758    t: &[u8],
759    sa: &mut [SaSint],
760    n: SaSint,
761    buckets: &mut [SaSint],
762    threads: SaSint,
763    thread_state: &mut [ThreadState],
764) -> SaSint {
765    let mut m = 0;
766    let n_usize = usize::try_from(n).expect("n must be non-negative");
767    let omp_num_threads = if threads > 1 && n >= 65_536 {
768        usize::try_from(threads)
769            .expect("threads must be non-negative")
770            .min(thread_state.len())
771            .max(1)
772    } else {
773        1
774    };
775    let omp_block_stride = (n_usize / omp_num_threads) & !15usize;
776
777    if omp_num_threads == 1 {
778        return count_and_gather_lms_suffixes_8u(t, sa, n, buckets, 0, n as FastSint);
779    }
780
781    for omp_thread_num in 0..omp_num_threads {
782        let omp_block_start = omp_thread_num * omp_block_stride;
783        let omp_block_size = if omp_thread_num + 1 < omp_num_threads {
784            omp_block_stride
785        } else {
786            n_usize - omp_block_start
787        };
788
789        let state = &mut thread_state[omp_thread_num];
790        state.position = FastSint::try_from(omp_block_start + omp_block_size)
791            .expect("position must fit FastSint");
792        state.m = FastSint::try_from(count_and_gather_lms_suffixes_8u(
793            t,
794            sa,
795            n,
796            &mut state.buckets,
797            FastSint::try_from(omp_block_start).expect("block start must fit FastSint"),
798            FastSint::try_from(omp_block_size).expect("block size must fit FastSint"),
799        ))
800        .expect("m must fit FastSint");
801
802        if state.m > 0 {
803            let position = usize::try_from(state.position).expect("position must be non-negative");
804            state.last_lms_suffix =
805                FastSint::try_from(sa[position - 1]).expect("suffix must fit FastSint");
806        }
807    }
808
809    buckets.fill(0);
810
811    for tnum in (0..omp_num_threads).rev() {
812        let state = &mut thread_state[tnum];
813        m += SaSint::try_from(state.m).expect("m must fit SaSint");
814
815        if tnum + 1 < omp_num_threads && state.m > 0 {
816            let position = usize::try_from(state.position).expect("position must be non-negative");
817            let count = usize::try_from(state.m).expect("m must be non-negative");
818            let dst = n_usize - usize::try_from(m).expect("m must be non-negative");
819            sa.copy_within(position - count..position, dst);
820        }
821
822        for s in 0..4 * ALPHABET_SIZE {
823            let a = buckets[s];
824            let b = state.buckets[s];
825            buckets[s] = a + b;
826            state.buckets[s] = a;
827        }
828    }
829
830    m
831}
832
833/// Internal helper: count and gather lms suffixes 32s 4k.
834#[doc(hidden)]
835pub fn count_and_gather_lms_suffixes_32s_4k(
836    t: &[SaSint],
837    sa: &mut [SaSint],
838    n: SaSint,
839    k: SaSint,
840    buckets: &mut [SaSint],
841    omp_block_start: FastSint,
842    omp_block_size: FastSint,
843) -> SaSint {
844    buckets.fill(0);
845    let n = n as FastSint;
846    let _k = k as FastSint;
847    let mut m = omp_block_start + omp_block_size - 1;
848
849    if omp_block_size > 0 {
850        let prefetch_distance = 64 as FastSint;
851        let mut j = m + 1;
852        let mut c0 = t[m as usize] as FastSint;
853        let mut c1 = -1;
854
855        while j < n {
856            c1 = t[j as usize] as FastSint;
857            if c1 != c0 {
858                break;
859            }
860            j += 1;
861        }
862
863        let mut f0 = usize::from(c0 >= c1);
864        let mut f1: usize;
865        let mut i = m - 1;
866        let limit = omp_block_start + prefetch_distance + 3;
867
868        while i >= limit {
869            let _prefetch_index = i - 2 * prefetch_distance;
870            c1 = t[i as usize] as FastSint;
871            f1 = usize::from(c1 > (c0 - f0 as FastSint));
872            sa[m as usize] = (i + 1) as SaSint;
873            m -= (f1 & !f0) as FastSint;
874            buckets[buckets_index4((c0 as SaSint & SAINT_MAX) as usize, f0 + f0 + f1)] += 1;
875
876            c0 = t[(i - 1) as usize] as FastSint;
877            f0 = usize::from(c0 > (c1 - f1 as FastSint));
878            sa[m as usize] = i as SaSint;
879            m -= (f0 & !f1) as FastSint;
880            buckets[buckets_index4((c1 as SaSint & SAINT_MAX) as usize, f1 + f1 + f0)] += 1;
881
882            c1 = t[(i - 2) as usize] as FastSint;
883            f1 = usize::from(c1 > (c0 - f0 as FastSint));
884            sa[m as usize] = (i - 1) as SaSint;
885            m -= (f1 & !f0) as FastSint;
886            buckets[buckets_index4((c0 as SaSint & SAINT_MAX) as usize, f0 + f0 + f1)] += 1;
887
888            c0 = t[(i - 3) as usize] as FastSint;
889            f0 = usize::from(c0 > (c1 - f1 as FastSint));
890            sa[m as usize] = (i - 2) as SaSint;
891            m -= (f0 & !f1) as FastSint;
892            buckets[buckets_index4((c1 as SaSint & SAINT_MAX) as usize, f1 + f1 + f0)] += 1;
893
894            i -= 4;
895        }
896
897        let tail_limit = omp_block_start;
898        while i >= tail_limit {
899            c1 = c0;
900            c0 = t[i as usize] as FastSint;
901            f1 = f0;
902            f0 = usize::from(c0 > (c1 - f1 as FastSint));
903            sa[m as usize] = (i + 1) as SaSint;
904            m -= (f0 & !f1) as FastSint;
905            buckets[buckets_index4((c1 as SaSint & SAINT_MAX) as usize, f1 + f1 + f0)] += 1;
906            i -= 1;
907        }
908
909        c1 = if i >= 0 {
910            t[i as usize] as FastSint
911        } else {
912            -1
913        };
914        f1 = usize::from(c1 > (c0 - f0 as FastSint));
915        sa[m as usize] = (i + 1) as SaSint;
916        m -= (f1 & !f0) as FastSint;
917        buckets[buckets_index4((c0 as SaSint & SAINT_MAX) as usize, f0 + f0 + f1)] += 1;
918    }
919
920    (omp_block_start + omp_block_size - 1 - m) as SaSint
921}
922
923/// Internal helper: count and gather lms suffixes 32s 2k.
924#[doc(hidden)]
925pub fn count_and_gather_lms_suffixes_32s_2k(
926    t: &[SaSint],
927    sa: &mut [SaSint],
928    n: SaSint,
929    k: SaSint,
930    buckets: &mut [SaSint],
931    omp_block_start: FastSint,
932    omp_block_size: FastSint,
933) -> SaSint {
934    buckets.fill(0);
935    let n = n as FastSint;
936    let _k = k as FastSint;
937    let mut m = omp_block_start + omp_block_size - 1;
938
939    if omp_block_size > 0 {
940        let prefetch_distance = 64 as FastSint;
941        let mut j = m + 1;
942        let mut c0 = t[m as usize] as FastSint;
943        let mut c1 = -1;
944
945        while j < n {
946            c1 = t[j as usize] as FastSint;
947            if c1 != c0 {
948                break;
949            }
950            j += 1;
951        }
952
953        let mut f0 = usize::from(c0 >= c1);
954        let mut f1: usize;
955        let mut i = m - 1;
956        let limit = omp_block_start + prefetch_distance + 3;
957
958        while i >= limit {
959            let _prefetch_index = i - 2 * prefetch_distance;
960            c1 = t[i as usize] as FastSint;
961            f1 = usize::from(c1 > (c0 - f0 as FastSint));
962            sa[m as usize] = (i + 1) as SaSint;
963            m -= (f1 & !f0) as FastSint;
964            buckets[buckets_index2((c0 as SaSint & SAINT_MAX) as usize, f1 & !f0)] += 1;
965
966            c0 = t[(i - 1) as usize] as FastSint;
967            f0 = usize::from(c0 > (c1 - f1 as FastSint));
968            sa[m as usize] = i as SaSint;
969            m -= (f0 & !f1) as FastSint;
970            buckets[buckets_index2((c1 as SaSint & SAINT_MAX) as usize, f0 & !f1)] += 1;
971
972            c1 = t[(i - 2) as usize] as FastSint;
973            f1 = usize::from(c1 > (c0 - f0 as FastSint));
974            sa[m as usize] = (i - 1) as SaSint;
975            m -= (f1 & !f0) as FastSint;
976            buckets[buckets_index2((c0 as SaSint & SAINT_MAX) as usize, f1 & !f0)] += 1;
977
978            c0 = t[(i - 3) as usize] as FastSint;
979            f0 = usize::from(c0 > (c1 - f1 as FastSint));
980            sa[m as usize] = (i - 2) as SaSint;
981            m -= (f0 & !f1) as FastSint;
982            buckets[buckets_index2((c1 as SaSint & SAINT_MAX) as usize, f0 & !f1)] += 1;
983
984            i -= 4;
985        }
986
987        let tail_limit = omp_block_start;
988        while i >= tail_limit {
989            c1 = c0;
990            c0 = t[i as usize] as FastSint;
991            f1 = f0;
992            f0 = usize::from(c0 > (c1 - f1 as FastSint));
993            sa[m as usize] = (i + 1) as SaSint;
994            m -= (f0 & !f1) as FastSint;
995            buckets[buckets_index2((c1 as SaSint & SAINT_MAX) as usize, f0 & !f1)] += 1;
996            i -= 1;
997        }
998
999        c1 = if i >= 0 {
1000            t[i as usize] as FastSint
1001        } else {
1002            -1
1003        };
1004        f1 = usize::from(c1 > (c0 - f0 as FastSint));
1005        sa[m as usize] = (i + 1) as SaSint;
1006        m -= (f1 & !f0) as FastSint;
1007        buckets[buckets_index2((c0 as SaSint & SAINT_MAX) as usize, f1 & !f0)] += 1;
1008    }
1009
1010    (omp_block_start + omp_block_size - 1 - m) as SaSint
1011}
1012
1013/// Internal helper: count and gather compacted lms suffixes 32s 2k.
1014#[doc(hidden)]
1015pub fn count_and_gather_compacted_lms_suffixes_32s_2k(
1016    t: &[SaSint],
1017    sa: &mut [SaSint],
1018    n: SaSint,
1019    k: SaSint,
1020    buckets: &mut [SaSint],
1021    omp_block_start: FastSint,
1022    omp_block_size: FastSint,
1023) -> SaSint {
1024    buckets.fill(0);
1025    let n_usize = usize::try_from(n).expect("n must be non-negative");
1026    let _k_usize = usize::try_from(k).expect("k must be non-negative");
1027    let block_start =
1028        usize::try_from(omp_block_start).expect("omp_block_start must be non-negative");
1029    let block_size = usize::try_from(omp_block_size).expect("omp_block_size must be non-negative");
1030    let mut m = block_start + block_size - 1;
1031
1032    if omp_block_size > 0 {
1033        let mut j = m + 1;
1034        let mut c0 = t[m] as FastSint;
1035        let mut c1 = -1;
1036
1037        while j < n_usize {
1038            c1 = t[j] as FastSint;
1039            if c1 != c0 {
1040                break;
1041            }
1042            j += 1;
1043        }
1044
1045        let mut f0 = usize::from(c0 >= c1);
1046        let mut f1: usize;
1047        let mut i = m as FastSint - 1;
1048        let limit = block_start as FastSint + 3;
1049
1050        while i >= limit {
1051            c1 = t[i as usize] as FastSint;
1052            f1 = usize::from(c1 > (c0 - f0 as FastSint));
1053            sa[m] = (i + 1) as SaSint;
1054            m -= f1 & !f0 & usize::from(c0 >= 0);
1055            buckets[buckets_index2((c0 as SaSint & SAINT_MAX) as usize, f1 & !f0)] += 1;
1056
1057            c0 = t[(i - 1) as usize] as FastSint;
1058            f0 = usize::from(c0 > (c1 - f1 as FastSint));
1059            sa[m] = i as SaSint;
1060            m -= f0 & !f1 & usize::from(c1 >= 0);
1061            buckets[buckets_index2((c1 as SaSint & SAINT_MAX) as usize, f0 & !f1)] += 1;
1062
1063            c1 = t[(i - 2) as usize] as FastSint;
1064            f1 = usize::from(c1 > (c0 - f0 as FastSint));
1065            sa[m] = (i - 1) as SaSint;
1066            m -= f1 & !f0 & usize::from(c0 >= 0);
1067            buckets[buckets_index2((c0 as SaSint & SAINT_MAX) as usize, f1 & !f0)] += 1;
1068
1069            c0 = t[(i - 3) as usize] as FastSint;
1070            f0 = usize::from(c0 > (c1 - f1 as FastSint));
1071            sa[m] = (i - 2) as SaSint;
1072            m -= f0 & !f1 & usize::from(c1 >= 0);
1073            buckets[buckets_index2((c1 as SaSint & SAINT_MAX) as usize, f0 & !f1)] += 1;
1074
1075            i -= 4;
1076        }
1077
1078        let tail_limit = block_start as FastSint;
1079        while i >= tail_limit {
1080            c1 = c0;
1081            c0 = t[i as usize] as FastSint;
1082            f1 = f0;
1083            f0 = usize::from(c0 > (c1 - f1 as FastSint));
1084            sa[m] = (i + 1) as SaSint;
1085            m -= f0 & !f1 & usize::from(c1 >= 0);
1086            buckets[buckets_index2((c1 as SaSint & SAINT_MAX) as usize, f0 & !f1)] += 1;
1087            i -= 1;
1088        }
1089
1090        c1 = if i >= 0 {
1091            t[i as usize] as FastSint
1092        } else {
1093            -1
1094        };
1095        f1 = usize::from(c1 > (c0 - f0 as FastSint));
1096        sa[m] = (i + 1) as SaSint;
1097        m -= f1 & !f0 & usize::from(c0 >= 0);
1098        buckets[buckets_index2((c0 as SaSint & SAINT_MAX) as usize, f1 & !f0)] += 1;
1099    }
1100
1101    (block_start + block_size - 1 - m) as SaSint
1102}
1103
1104/// Internal helper: get bucket stride.
1105#[doc(hidden)]
1106pub fn get_bucket_stride(
1107    free_space: FastSint,
1108    bucket_size: FastSint,
1109    num_buckets: FastSint,
1110) -> FastSint {
1111    let bucket_size_1024 = (bucket_size + 1023) & (-1024);
1112    if free_space / (num_buckets - 1) >= bucket_size_1024 {
1113        return bucket_size_1024;
1114    }
1115    let bucket_size_16 = (bucket_size + 15) & (-16);
1116    if free_space / (num_buckets - 1) >= bucket_size_16 {
1117        return bucket_size_16;
1118    }
1119    bucket_size
1120}
1121
1122/// Internal helper: count and gather lms suffixes 32s 4k nofs (OpenMP variant).
1123#[doc(hidden)]
1124pub fn count_and_gather_lms_suffixes_32s_4k_nofs_omp(
1125    t: &[SaSint],
1126    sa: &mut [SaSint],
1127    n: SaSint,
1128    k: SaSint,
1129    buckets: &mut [SaSint],
1130    threads: SaSint,
1131) -> SaSint {
1132    let m;
1133    let omp_num_threads = if threads > 1 && n >= 65_536 { 2 } else { 1 };
1134
1135    if omp_num_threads == 1 {
1136        m = count_and_gather_lms_suffixes_32s_4k(t, sa, n, k, buckets, 0, n as FastSint);
1137    } else {
1138        count_lms_suffixes_32s_4k(t, n, k, buckets);
1139        m = gather_lms_suffixes_32s(t, sa, n);
1140    }
1141
1142    m
1143}
1144
1145/// Internal helper: count and gather lms suffixes 32s 2k nofs (OpenMP variant).
1146#[doc(hidden)]
1147pub fn count_and_gather_lms_suffixes_32s_2k_nofs_omp(
1148    t: &[SaSint],
1149    sa: &mut [SaSint],
1150    n: SaSint,
1151    k: SaSint,
1152    buckets: &mut [SaSint],
1153    threads: SaSint,
1154) -> SaSint {
1155    let m;
1156    let omp_num_threads = if threads > 1 && n >= 65_536 { 2 } else { 1 };
1157
1158    if omp_num_threads == 1 {
1159        m = count_and_gather_lms_suffixes_32s_2k(t, sa, n, k, buckets, 0, n as FastSint);
1160    } else {
1161        count_lms_suffixes_32s_2k(t, n, k, buckets);
1162        m = gather_lms_suffixes_32s(t, sa, n);
1163    }
1164
1165    m
1166}
1167
1168/// Internal helper: count and gather compacted lms suffixes 32s 2k nofs (OpenMP variant).
1169#[doc(hidden)]
1170pub fn count_and_gather_compacted_lms_suffixes_32s_2k_nofs_omp(
1171    t: &[SaSint],
1172    sa: &mut [SaSint],
1173    n: SaSint,
1174    k: SaSint,
1175    buckets: &mut [SaSint],
1176    threads: SaSint,
1177) -> SaSint {
1178    let m;
1179    let omp_num_threads = if threads > 1 && n >= 65_536 { 2 } else { 1 };
1180
1181    if omp_num_threads == 1 {
1182        m = count_and_gather_compacted_lms_suffixes_32s_2k(t, sa, n, k, buckets, 0, n as FastSint);
1183    } else {
1184        count_compacted_lms_suffixes_32s_2k(t, n, k, buckets);
1185        m = gather_compacted_lms_suffixes_32s(t, sa, n);
1186    }
1187
1188    m
1189}
1190
1191/// Internal helper: count and gather lms suffixes 32s 4k fs (OpenMP variant).
1192#[doc(hidden)]
1193pub fn count_and_gather_lms_suffixes_32s_4k_fs_omp(
1194    t: &[SaSint],
1195    sa: &mut [SaSint],
1196    n: SaSint,
1197    k: SaSint,
1198    buckets: &mut [SaSint],
1199    local_buckets: SaSint,
1200    threads: SaSint,
1201    thread_state: &mut [ThreadState],
1202) -> SaSint {
1203    let n_usize = usize::try_from(n).expect("n must be non-negative");
1204    let k_usize = usize::try_from(k).expect("k must be non-negative");
1205    let omp_num_threads = usize::try_from(threads).expect("threads must be non-negative");
1206    let bucket_size = FastSint::try_from(4 * k_usize).expect("bucket size must fit FastSint");
1207
1208    if omp_num_threads <= 1 || n < 65_536 {
1209        return count_and_gather_lms_suffixes_32s_4k(t, sa, n, k, buckets, 0, n as FastSint);
1210    }
1211
1212    let omp_block_stride = (n_usize / omp_num_threads) & !15usize;
1213    let free_space = if local_buckets == 1 {
1214        FastSint::try_from(LIBSAIS_LOCAL_BUFFER_SIZE).expect("free space must fit FastSint")
1215    } else if local_buckets > 1 {
1216        FastSint::try_from(local_buckets).expect("free space must fit FastSint")
1217    } else {
1218        FastSint::try_from(buckets.len()).expect("free space must fit FastSint")
1219    };
1220    let bucket_stride = get_bucket_stride(
1221        free_space,
1222        bucket_size,
1223        FastSint::try_from(omp_num_threads).expect("thread count must fit FastSint"),
1224    );
1225    let bucket_size_usize = usize::try_from(bucket_size).expect("bucket size must be non-negative");
1226    let bucket_stride_usize =
1227        usize::try_from(bucket_stride).expect("bucket stride must be non-negative");
1228    let workspace_len =
1229        bucket_size_usize + bucket_stride_usize.saturating_mul(omp_num_threads.saturating_sub(1));
1230    let mut workspace = vec![0; workspace_len];
1231
1232    for omp_thread_num in 0..omp_num_threads {
1233        let omp_block_start = omp_thread_num * omp_block_stride;
1234        let omp_block_size = if omp_thread_num + 1 < omp_num_threads {
1235            omp_block_stride
1236        } else {
1237            n_usize - omp_block_start
1238        };
1239        let workspace_end = workspace_len - omp_thread_num * bucket_stride_usize;
1240        let workspace_start = workspace_end - bucket_size_usize;
1241        let count = count_and_gather_lms_suffixes_32s_4k(
1242            t,
1243            sa,
1244            n,
1245            k,
1246            &mut workspace[workspace_start..workspace_end],
1247            omp_block_start as FastSint,
1248            omp_block_size as FastSint,
1249        );
1250
1251        thread_state[omp_thread_num].position = (omp_block_start + omp_block_size) as FastSint;
1252        thread_state[omp_thread_num].count = count as FastSint;
1253    }
1254
1255    let mut m = 0;
1256    for t in (0..omp_num_threads).rev() {
1257        m += thread_state[t].count as SaSint;
1258
1259        if t + 1 != omp_num_threads && thread_state[t].count > 0 {
1260            let src_end =
1261                usize::try_from(thread_state[t].position).expect("position must be non-negative");
1262            let src_start = src_end
1263                - usize::try_from(thread_state[t].count).expect("count must be non-negative");
1264            let dst_start = usize::try_from(n - m).expect("destination must be non-negative");
1265            sa.copy_within(src_start..src_end, dst_start);
1266        }
1267    }
1268
1269    let omp_num_threads = omp_num_threads - 1;
1270    let omp_block_stride = (bucket_size_usize / omp_num_threads) & !15usize;
1271    for omp_thread_num in 0..omp_num_threads {
1272        let omp_block_start = omp_thread_num * omp_block_stride;
1273        let omp_block_size = if omp_thread_num + 1 < omp_num_threads {
1274            omp_block_stride
1275        } else {
1276            bucket_size_usize - omp_block_start
1277        };
1278        accumulate_counts_s32(
1279            &mut workspace[omp_block_start..],
1280            omp_block_size as FastSint,
1281            bucket_stride,
1282            FastSint::try_from(omp_num_threads + 1).expect("thread count must fit FastSint"),
1283        );
1284    }
1285
1286    let accumulated_start = omp_num_threads * bucket_stride_usize;
1287    buckets[..bucket_size_usize]
1288        .copy_from_slice(&workspace[accumulated_start..accumulated_start + bucket_size_usize]);
1289    m
1290}
1291
1292/// Internal helper: count and gather lms suffixes 32s 2k fs (OpenMP variant).
1293#[doc(hidden)]
1294pub fn count_and_gather_lms_suffixes_32s_2k_fs_omp(
1295    t: &[SaSint],
1296    sa: &mut [SaSint],
1297    n: SaSint,
1298    k: SaSint,
1299    buckets: &mut [SaSint],
1300    local_buckets: SaSint,
1301    threads: SaSint,
1302    thread_state: &mut [ThreadState],
1303) -> SaSint {
1304    let n_usize = usize::try_from(n).expect("n must be non-negative");
1305    let k_usize = usize::try_from(k).expect("k must be non-negative");
1306    let omp_num_threads = usize::try_from(threads).expect("threads must be non-negative");
1307    let bucket_size = FastSint::try_from(2 * k_usize).expect("bucket size must fit FastSint");
1308
1309    if omp_num_threads <= 1 || n < 65_536 {
1310        return count_and_gather_lms_suffixes_32s_2k(t, sa, n, k, buckets, 0, n as FastSint);
1311    }
1312
1313    let omp_block_stride = (n_usize / omp_num_threads) & !15usize;
1314    let free_space = if local_buckets == 1 {
1315        FastSint::try_from(LIBSAIS_LOCAL_BUFFER_SIZE).expect("free space must fit FastSint")
1316    } else if local_buckets > 1 {
1317        FastSint::try_from(local_buckets).expect("free space must fit FastSint")
1318    } else {
1319        FastSint::try_from(buckets.len()).expect("free space must fit FastSint")
1320    };
1321    let bucket_stride = get_bucket_stride(
1322        free_space,
1323        bucket_size,
1324        FastSint::try_from(omp_num_threads).expect("thread count must fit FastSint"),
1325    );
1326    let bucket_size_usize = usize::try_from(bucket_size).expect("bucket size must be non-negative");
1327    let bucket_stride_usize =
1328        usize::try_from(bucket_stride).expect("bucket stride must be non-negative");
1329    let workspace_len =
1330        bucket_size_usize + bucket_stride_usize.saturating_mul(omp_num_threads.saturating_sub(1));
1331    let mut workspace = vec![0; workspace_len];
1332
1333    for omp_thread_num in 0..omp_num_threads {
1334        let omp_block_start = omp_thread_num * omp_block_stride;
1335        let omp_block_size = if omp_thread_num + 1 < omp_num_threads {
1336            omp_block_stride
1337        } else {
1338            n_usize - omp_block_start
1339        };
1340        let workspace_end = workspace_len - omp_thread_num * bucket_stride_usize;
1341        let workspace_start = workspace_end - bucket_size_usize;
1342        let count = count_and_gather_lms_suffixes_32s_2k(
1343            t,
1344            sa,
1345            n,
1346            k,
1347            &mut workspace[workspace_start..workspace_end],
1348            omp_block_start as FastSint,
1349            omp_block_size as FastSint,
1350        );
1351
1352        thread_state[omp_thread_num].position = (omp_block_start + omp_block_size) as FastSint;
1353        thread_state[omp_thread_num].count = count as FastSint;
1354    }
1355
1356    let mut m = 0;
1357    for t in (0..omp_num_threads).rev() {
1358        m += thread_state[t].count as SaSint;
1359        if t + 1 != omp_num_threads && thread_state[t].count > 0 {
1360            let src_end =
1361                usize::try_from(thread_state[t].position).expect("position must be non-negative");
1362            let src_start = src_end
1363                - usize::try_from(thread_state[t].count).expect("count must be non-negative");
1364            let dst_start = usize::try_from(n - m).expect("destination must be non-negative");
1365            sa.copy_within(src_start..src_end, dst_start);
1366        }
1367    }
1368
1369    let omp_num_threads = omp_num_threads - 1;
1370    let omp_block_stride = (bucket_size_usize / omp_num_threads) & !15usize;
1371    for omp_thread_num in 0..omp_num_threads {
1372        let omp_block_start = omp_thread_num * omp_block_stride;
1373        let omp_block_size = if omp_thread_num + 1 < omp_num_threads {
1374            omp_block_stride
1375        } else {
1376            bucket_size_usize - omp_block_start
1377        };
1378        accumulate_counts_s32(
1379            &mut workspace[omp_block_start..],
1380            omp_block_size as FastSint,
1381            bucket_stride,
1382            FastSint::try_from(omp_num_threads + 1).expect("thread count must fit FastSint"),
1383        );
1384    }
1385
1386    let accumulated_start = omp_num_threads * bucket_stride_usize;
1387    buckets[..bucket_size_usize]
1388        .copy_from_slice(&workspace[accumulated_start..accumulated_start + bucket_size_usize]);
1389    m
1390}
1391
1392/// Internal helper: count and gather compacted lms suffixes 32s 2k fs (OpenMP variant).
1393#[doc(hidden)]
1394pub fn count_and_gather_compacted_lms_suffixes_32s_2k_fs_omp(
1395    t: &[SaSint],
1396    sa: &mut [SaSint],
1397    n: SaSint,
1398    k: SaSint,
1399    buckets: &mut [SaSint],
1400    _local_buckets: SaSint,
1401    threads: SaSint,
1402    thread_state: &mut [ThreadState],
1403) {
1404    let n_usize = usize::try_from(n).expect("n must be non-negative");
1405    let k_usize = usize::try_from(k).expect("k must be non-negative");
1406    let thread_count = usize::try_from(threads).expect("threads must be non-negative");
1407    let bucket_size = 2 * k_usize;
1408
1409    if thread_count <= 1 || n < 65_536 {
1410        let _ =
1411            count_and_gather_compacted_lms_suffixes_32s_2k(t, sa, n, k, buckets, 0, n as FastSint);
1412        return;
1413    }
1414
1415    if thread_state.len() < thread_count || sa.len() < 2 * n_usize {
1416        let _ =
1417            count_and_gather_compacted_lms_suffixes_32s_2k_nofs_omp(t, sa, n, k, buckets, threads);
1418        return;
1419    }
1420
1421    let omp_block_stride = (n_usize / thread_count) & !15usize;
1422    let free_space = if _local_buckets != 0 {
1423        FastSint::try_from(LIBSAIS_LOCAL_BUFFER_SIZE).expect("free space must fit FastSint")
1424    } else {
1425        FastSint::try_from(buckets.len()).expect("free space must fit FastSint")
1426    };
1427    let bucket_stride = get_bucket_stride(
1428        free_space,
1429        FastSint::try_from(bucket_size).expect("bucket size must fit FastSint"),
1430        FastSint::try_from(thread_count).expect("thread count must fit FastSint"),
1431    );
1432    let bucket_stride_usize =
1433        usize::try_from(bucket_stride).expect("bucket stride must be non-negative");
1434    let workspace_len =
1435        bucket_size + bucket_stride_usize.saturating_mul(thread_count.saturating_sub(1));
1436    let mut workspace = vec![0; workspace_len];
1437
1438    for omp_thread_num in 0..thread_count {
1439        let omp_block_start = omp_thread_num * omp_block_stride;
1440        let omp_block_size = if omp_thread_num + 1 < thread_count {
1441            omp_block_stride
1442        } else {
1443            n_usize - omp_block_start
1444        };
1445
1446        let workspace_end = workspace_len - omp_thread_num * bucket_stride_usize;
1447        let workspace_start = workspace_end - bucket_size;
1448        let count = count_and_gather_compacted_lms_suffixes_32s_2k(
1449            t,
1450            &mut sa[n_usize..],
1451            n,
1452            k,
1453            &mut workspace[workspace_start..workspace_end],
1454            omp_block_start as FastSint,
1455            omp_block_size as FastSint,
1456        );
1457
1458        if omp_thread_num < thread_state.len() {
1459            thread_state[omp_thread_num].position = (omp_block_start + omp_block_size) as FastSint;
1460            thread_state[omp_thread_num].count = count as FastSint;
1461        }
1462    }
1463
1464    let mut m = 0usize;
1465    for omp_thread_num in (0..thread_count).rev() {
1466        let count = usize::try_from(thread_state[omp_thread_num].count)
1467            .expect("count must be non-negative");
1468        m += count;
1469        if count > 0 {
1470            let position = usize::try_from(thread_state[omp_thread_num].position)
1471                .expect("position must be non-negative");
1472            let src_start = n_usize + position - count;
1473            let src_end = n_usize + position;
1474            let dst_start = n_usize - m;
1475            sa.copy_within(src_start..src_end, dst_start);
1476        }
1477    }
1478
1479    let accumulation_threads = thread_count;
1480    let omp_block_stride = (bucket_size / accumulation_threads) & !15usize;
1481    for omp_thread_num in 0..accumulation_threads {
1482        let omp_block_start = omp_thread_num * omp_block_stride;
1483        let omp_block_size = if omp_thread_num + 1 < accumulation_threads {
1484            omp_block_stride
1485        } else {
1486            bucket_size - omp_block_start
1487        };
1488        accumulate_counts_s32(
1489            &mut workspace[omp_block_start..],
1490            omp_block_size as FastSint,
1491            bucket_stride,
1492            FastSint::try_from(thread_count).expect("thread count must fit FastSint"),
1493        );
1494    }
1495    let accumulated_start = (accumulation_threads - 1) * bucket_stride_usize;
1496    buckets[..bucket_size]
1497        .copy_from_slice(&workspace[accumulated_start..accumulated_start + bucket_size]);
1498}
1499
1500/// Internal helper: count and gather lms suffixes 32s 4k (OpenMP variant).
1501#[doc(hidden)]
1502pub fn count_and_gather_lms_suffixes_32s_4k_omp(
1503    t: &[SaSint],
1504    sa: &mut [SaSint],
1505    n: SaSint,
1506    k: SaSint,
1507    buckets: &mut [SaSint],
1508    local_buckets: SaSint,
1509    threads: SaSint,
1510    thread_state: &mut [ThreadState],
1511) -> SaSint {
1512    let free_space = if local_buckets > 1 {
1513        local_buckets as FastSint
1514    } else if local_buckets != 0 {
1515        LIBSAIS_LOCAL_BUFFER_SIZE as FastSint
1516    } else {
1517        FastSint::try_from(buckets.len()).expect("bucket length must fit FastSint")
1518    };
1519    let threads_fast = threads as FastSint;
1520    let mut max_threads = (free_space / (((4 * k as FastSint) + 15) & -16)).min(threads_fast);
1521
1522    if max_threads > 1 && n >= 65_536 && n / k >= 2 {
1523        let thread_cap = (n / (16 * k)) as FastSint;
1524        if max_threads > thread_cap {
1525            max_threads = thread_cap;
1526        }
1527        return count_and_gather_lms_suffixes_32s_4k_fs_omp(
1528            t,
1529            sa,
1530            n,
1531            k,
1532            buckets,
1533            local_buckets,
1534            max_threads.max(2) as SaSint,
1535            thread_state,
1536        );
1537    }
1538
1539    if threads > 1 && n >= 65_536 {
1540        count_lms_suffixes_32s_4k(t, n, k, buckets);
1541        gather_lms_suffixes_32s(t, sa, n)
1542    } else {
1543        count_and_gather_lms_suffixes_32s_4k(t, sa, n, k, buckets, 0, n as FastSint)
1544    }
1545}
1546
1547/// Internal helper: count and gather lms suffixes 32s 2k (OpenMP variant).
1548#[doc(hidden)]
1549pub fn count_and_gather_lms_suffixes_32s_2k_omp(
1550    t: &[SaSint],
1551    sa: &mut [SaSint],
1552    n: SaSint,
1553    k: SaSint,
1554    buckets: &mut [SaSint],
1555    local_buckets: SaSint,
1556    threads: SaSint,
1557    thread_state: &mut [ThreadState],
1558) -> SaSint {
1559    let free_space = if local_buckets > 1 {
1560        local_buckets as FastSint
1561    } else if local_buckets != 0 {
1562        LIBSAIS_LOCAL_BUFFER_SIZE as FastSint
1563    } else {
1564        FastSint::try_from(buckets.len()).expect("bucket length must fit FastSint")
1565    };
1566    let threads_fast = threads as FastSint;
1567    let mut max_threads = (free_space / (((2 * k as FastSint) + 15) & -16)).min(threads_fast);
1568
1569    if max_threads > 1 && n >= 65_536 && n / k >= 2 {
1570        let thread_cap = (n / (8 * k)) as FastSint;
1571        if max_threads > thread_cap {
1572            max_threads = thread_cap;
1573        }
1574        return count_and_gather_lms_suffixes_32s_2k_fs_omp(
1575            t,
1576            sa,
1577            n,
1578            k,
1579            buckets,
1580            local_buckets,
1581            max_threads.max(2) as SaSint,
1582            thread_state,
1583        );
1584    }
1585
1586    if threads > 1 && n >= 65_536 {
1587        count_lms_suffixes_32s_2k(t, n, k, buckets);
1588        gather_lms_suffixes_32s(t, sa, n)
1589    } else {
1590        count_and_gather_lms_suffixes_32s_2k(t, sa, n, k, buckets, 0, n as FastSint)
1591    }
1592}
1593
1594/// Internal helper: count and gather compacted lms suffixes 32s 2k (OpenMP variant).
1595#[doc(hidden)]
1596pub fn count_and_gather_compacted_lms_suffixes_32s_2k_omp(
1597    t: &[SaSint],
1598    sa: &mut [SaSint],
1599    n: SaSint,
1600    k: SaSint,
1601    buckets: &mut [SaSint],
1602    local_buckets: SaSint,
1603    threads: SaSint,
1604    thread_state: &mut [ThreadState],
1605) {
1606    let free_space = if local_buckets != 0 {
1607        LIBSAIS_LOCAL_BUFFER_SIZE as FastSint
1608    } else {
1609        FastSint::try_from(buckets.len()).expect("bucket length must fit FastSint")
1610    };
1611    let threads_fast = threads as FastSint;
1612    let mut max_threads = (free_space / (((2 * k as FastSint) + 15) & -16)).min(threads_fast);
1613
1614    if local_buckets == 0 && max_threads > 1 && n >= 65_536 && n / k >= 2 {
1615        let thread_cap = (n / (8 * k)) as FastSint;
1616        if max_threads > thread_cap {
1617            max_threads = thread_cap;
1618        }
1619        count_and_gather_compacted_lms_suffixes_32s_2k_fs_omp(
1620            t,
1621            sa,
1622            n,
1623            k,
1624            buckets,
1625            local_buckets,
1626            max_threads.max(2) as SaSint,
1627            thread_state,
1628        );
1629        return;
1630    }
1631
1632    let _ = count_and_gather_compacted_lms_suffixes_32s_2k_nofs_omp(t, sa, n, k, buckets, threads);
1633}
1634
1635/// Internal helper: count suffixes 32s.
1636#[doc(hidden)]
1637pub fn count_suffixes_32s(t: &[SaSint], n: SaSint, k: SaSint, buckets: &mut [SaSint]) {
1638    let n_usize = usize::try_from(n).expect("n must be non-negative");
1639    let k_usize = usize::try_from(k).expect("k must be non-negative");
1640    buckets[..k_usize].fill(0);
1641
1642    let mut i = 0usize;
1643    let mut j = n_usize.saturating_sub(7);
1644    while i < j {
1645        buckets[t[i] as usize] += 1;
1646        buckets[t[i + 1] as usize] += 1;
1647        buckets[t[i + 2] as usize] += 1;
1648        buckets[t[i + 3] as usize] += 1;
1649        buckets[t[i + 4] as usize] += 1;
1650        buckets[t[i + 5] as usize] += 1;
1651        buckets[t[i + 6] as usize] += 1;
1652        buckets[t[i + 7] as usize] += 1;
1653        i += 8;
1654    }
1655
1656    j += 7;
1657    while i < j {
1658        buckets[t[i] as usize] += 1;
1659        i += 1;
1660    }
1661}
1662
1663/// Internal helper: initialize buckets start and end 8u.
1664#[doc(hidden)]
1665pub fn initialize_buckets_start_and_end_8u(
1666    buckets: &mut [SaSint],
1667    freq: Option<&mut [SaSint]>,
1668) -> SaSint {
1669    let start_offset = 6 * ALPHABET_SIZE;
1670    let end_offset = 7 * ALPHABET_SIZE;
1671    let mut k = -1isize;
1672    let mut sum = 0;
1673
1674    match freq {
1675        Some(freq) => {
1676            for j in 0..ALPHABET_SIZE {
1677                let i = buckets_index4(j, 0);
1678                let total = buckets[i] + buckets[i + 1] + buckets[i + 2] + buckets[i + 3];
1679                buckets[start_offset + j] = sum;
1680                sum += total;
1681                buckets[end_offset + j] = sum;
1682                if total > 0 {
1683                    k = j as isize;
1684                }
1685                freq[j] = total;
1686            }
1687        }
1688        None => {
1689            for j in 0..ALPHABET_SIZE {
1690                let i = buckets_index4(j, 0);
1691                let total = buckets[i] + buckets[i + 1] + buckets[i + 2] + buckets[i + 3];
1692                buckets[start_offset + j] = sum;
1693                sum += total;
1694                buckets[end_offset + j] = sum;
1695                if total > 0 {
1696                    k = j as isize;
1697                }
1698            }
1699        }
1700    }
1701
1702    (k + 1) as SaSint
1703}
1704
1705/// Internal helper: initialize buckets start and end 32s 6k.
1706#[doc(hidden)]
1707pub fn initialize_buckets_start_and_end_32s_6k(k: SaSint, buckets: &mut [SaSint]) {
1708    let k_usize = usize::try_from(k).expect("k must be non-negative");
1709    let start_offset = 4 * k_usize;
1710    let end_offset = 5 * k_usize;
1711    let mut sum = 0;
1712    for j in 0..k_usize {
1713        let i = buckets_index4(j, 0);
1714        buckets[start_offset + j] = sum;
1715        sum += buckets[i] + buckets[i + 1] + buckets[i + 2] + buckets[i + 3];
1716        buckets[end_offset + j] = sum;
1717    }
1718}
1719
1720/// Internal helper: initialize buckets start and end 32s 4k.
1721#[doc(hidden)]
1722pub fn initialize_buckets_start_and_end_32s_4k(k: SaSint, buckets: &mut [SaSint]) {
1723    let k_usize = usize::try_from(k).expect("k must be non-negative");
1724    let start_offset = 2 * k_usize;
1725    let end_offset = 3 * k_usize;
1726    let mut sum = 0;
1727    for j in 0..k_usize {
1728        let i = buckets_index2(j, 0);
1729        buckets[start_offset + j] = sum;
1730        sum += buckets[i] + buckets[i + 1];
1731        buckets[end_offset + j] = sum;
1732    }
1733}
1734
1735/// Internal helper: initialize buckets end 32s 2k.
1736#[doc(hidden)]
1737pub fn initialize_buckets_end_32s_2k(k: SaSint, buckets: &mut [SaSint]) {
1738    let k_usize = usize::try_from(k).expect("k must be non-negative");
1739    let mut sum0 = 0;
1740    for j in 0..k_usize {
1741        let i = buckets_index2(j, 0);
1742        sum0 += buckets[i] + buckets[i + 1];
1743        buckets[i] = sum0;
1744    }
1745}
1746
1747/// Internal helper: initialize buckets start and end 32s 2k.
1748#[doc(hidden)]
1749pub fn initialize_buckets_start_and_end_32s_2k(k: SaSint, buckets: &mut [SaSint]) {
1750    let k_usize = usize::try_from(k).expect("k must be non-negative");
1751    for j in 0..k_usize {
1752        let i = buckets_index2(j, 0);
1753        buckets[j] = buckets[i];
1754    }
1755    buckets[k_usize] = 0;
1756    for j in 1..k_usize {
1757        buckets[k_usize + j] = buckets[j - 1];
1758    }
1759}
1760
1761/// Internal helper: initialize buckets start 32s 1k.
1762#[doc(hidden)]
1763pub fn initialize_buckets_start_32s_1k(k: SaSint, buckets: &mut [SaSint]) {
1764    let k_usize = usize::try_from(k).expect("k must be non-negative");
1765    let mut sum = 0;
1766    for bucket in buckets.iter_mut().take(k_usize) {
1767        let tmp = *bucket;
1768        *bucket = sum;
1769        sum += tmp;
1770    }
1771}
1772
1773/// Internal helper: initialize buckets end 32s 1k.
1774#[doc(hidden)]
1775pub fn initialize_buckets_end_32s_1k(k: SaSint, buckets: &mut [SaSint]) {
1776    let k_usize = usize::try_from(k).expect("k must be non-negative");
1777    let mut sum = 0;
1778    for bucket in buckets.iter_mut().take(k_usize) {
1779        sum += *bucket;
1780        *bucket = sum;
1781    }
1782}
1783
1784/// Internal helper: initialize buckets for lms suffixes radix sort 8u.
1785#[doc(hidden)]
1786pub fn initialize_buckets_for_lms_suffixes_radix_sort_8u(
1787    t: &[u8],
1788    buckets: &mut [SaSint],
1789    mut first_lms_suffix: SaSint,
1790) -> SaSint {
1791    let mut f0 = 0usize;
1792    let mut f1: usize;
1793    let mut c0 = t[first_lms_suffix as usize] as FastSint;
1794    let mut c1: FastSint;
1795
1796    while {
1797        first_lms_suffix -= 1;
1798        first_lms_suffix >= 0
1799    } {
1800        c1 = c0;
1801        c0 = t[first_lms_suffix as usize] as FastSint;
1802        f1 = f0;
1803        f0 = usize::from(c0 > (c1 - f1 as FastSint));
1804        let idx = 4 * c1 as usize + (f1 + f1 + f0);
1805        buckets[idx] -= 1;
1806    }
1807    buckets[4 * c0 as usize + (f0 + f0)] -= 1;
1808
1809    let temp_offset = 4 * ALPHABET_SIZE;
1810    let mut sum = 0;
1811    for j in 0..ALPHABET_SIZE {
1812        let i = 4 * j;
1813        let tj = 2 * j;
1814        buckets[temp_offset + tj + 1] = sum;
1815        sum += buckets[i + 1] + buckets[i + 3];
1816        buckets[temp_offset + tj] = sum;
1817    }
1818    sum
1819}
1820
1821/// Internal helper: initialize buckets for lms suffixes radix sort 32s 2k.
1822#[doc(hidden)]
1823pub fn initialize_buckets_for_lms_suffixes_radix_sort_32s_2k(
1824    t: &[SaSint],
1825    k: SaSint,
1826    buckets: &mut [SaSint],
1827    first_lms_suffix: SaSint,
1828) {
1829    let _k_usize = usize::try_from(k).expect("k must be non-negative");
1830    buckets[buckets_index2(t[first_lms_suffix as usize] as usize, 0)] += 1;
1831    buckets[buckets_index2(t[first_lms_suffix as usize] as usize, 1)] -= 1;
1832
1833    let mut sum0 = 0;
1834    let mut sum1 = 0;
1835    for j in 0..usize::try_from(k).unwrap() {
1836        let i = buckets_index2(j, 0);
1837        sum0 += buckets[i] + buckets[i + 1];
1838        sum1 += buckets[i + 1];
1839        buckets[i] = sum0;
1840        buckets[i + 1] = sum1;
1841    }
1842}
1843
1844/// Internal helper: initialize buckets for lms suffixes radix sort 32s 6k.
1845#[doc(hidden)]
1846pub fn initialize_buckets_for_lms_suffixes_radix_sort_32s_6k(
1847    t: &[SaSint],
1848    k: SaSint,
1849    buckets: &mut [SaSint],
1850    mut first_lms_suffix: SaSint,
1851) -> SaSint {
1852    let mut f0 = 0usize;
1853    let mut f1: usize;
1854    let mut c0 = t[first_lms_suffix as usize] as FastSint;
1855    let mut c1: FastSint;
1856
1857    while {
1858        first_lms_suffix -= 1;
1859        first_lms_suffix >= 0
1860    } {
1861        c1 = c0;
1862        c0 = t[first_lms_suffix as usize] as FastSint;
1863        f1 = f0;
1864        f0 = usize::from(c0 > (c1 - f1 as FastSint));
1865        buckets[4 * c1 as usize + (f1 + f1 + f0)] -= 1;
1866    }
1867    buckets[4 * c0 as usize + (f0 + f0)] -= 1;
1868
1869    let temp_offset = 4 * usize::try_from(k).unwrap();
1870    let mut sum = 0;
1871    for j in 0..usize::try_from(k).unwrap() {
1872        let i = 4 * j;
1873        sum += buckets[i + 1] + buckets[i + 3];
1874        buckets[temp_offset + j] = sum;
1875    }
1876    sum
1877}
1878
1879/// Internal helper: initialize buckets for radix and partial sorting 32s 4k.
1880#[doc(hidden)]
1881pub fn initialize_buckets_for_radix_and_partial_sorting_32s_4k(
1882    t: &[SaSint],
1883    k: SaSint,
1884    buckets: &mut [SaSint],
1885    first_lms_suffix: SaSint,
1886) {
1887    let k_usize = usize::try_from(k).expect("k must be non-negative");
1888    let start_offset = 2 * k_usize;
1889    let end_offset = 3 * k_usize;
1890
1891    buckets[buckets_index2(t[first_lms_suffix as usize] as usize, 0)] += 1;
1892    buckets[buckets_index2(t[first_lms_suffix as usize] as usize, 1)] -= 1;
1893
1894    let mut sum0 = 0;
1895    let mut sum1 = 0;
1896    for j in 0..k_usize {
1897        let i = buckets_index2(j, 0);
1898        buckets[start_offset + j] = sum1;
1899        sum0 += buckets[i + 1];
1900        sum1 += buckets[i] + buckets[i + 1];
1901        buckets[i + 1] = sum0;
1902        buckets[end_offset + j] = sum1;
1903    }
1904}
1905
1906/// Internal helper: radix sort lms suffixes 8u.
1907#[doc(hidden)]
1908pub fn radix_sort_lms_suffixes_8u(
1909    t: &[u8],
1910    sa: &mut [SaSint],
1911    induction_bucket: &mut [SaSint],
1912    omp_block_start: FastSint,
1913    omp_block_size: FastSint,
1914) {
1915    let prefetch_distance = 64 as FastSint;
1916    let mut i = omp_block_start + omp_block_size - 1;
1917    let mut j = omp_block_start + prefetch_distance + 3;
1918
1919    while i >= j {
1920        let p0 = sa[i as usize];
1921        let idx0 = buckets_index2(t[p0 as usize] as usize, 0);
1922        induction_bucket[idx0] -= 1;
1923        sa[induction_bucket[idx0] as usize] = p0;
1924
1925        let p1 = sa[(i - 1) as usize];
1926        let idx1 = buckets_index2(t[p1 as usize] as usize, 0);
1927        induction_bucket[idx1] -= 1;
1928        sa[induction_bucket[idx1] as usize] = p1;
1929
1930        let p2 = sa[(i - 2) as usize];
1931        let idx2 = buckets_index2(t[p2 as usize] as usize, 0);
1932        induction_bucket[idx2] -= 1;
1933        sa[induction_bucket[idx2] as usize] = p2;
1934
1935        let p3 = sa[(i - 3) as usize];
1936        let idx3 = buckets_index2(t[p3 as usize] as usize, 0);
1937        induction_bucket[idx3] -= 1;
1938        sa[induction_bucket[idx3] as usize] = p3;
1939
1940        i -= 4;
1941    }
1942
1943    j -= prefetch_distance + 3;
1944    while i >= j {
1945        let p = sa[i as usize];
1946        let idx = buckets_index2(t[p as usize] as usize, 0);
1947        induction_bucket[idx] -= 1;
1948        sa[induction_bucket[idx] as usize] = p;
1949        i -= 1;
1950    }
1951}
1952
1953/// Internal helper: radix sort lms suffixes 8u (OpenMP variant).
1954#[doc(hidden)]
1955pub fn radix_sort_lms_suffixes_8u_omp(
1956    t: &[u8],
1957    sa: &mut [SaSint],
1958    n: SaSint,
1959    m: SaSint,
1960    flags: SaSint,
1961    buckets: &mut [SaSint],
1962    threads: SaSint,
1963    thread_state: &mut [ThreadState],
1964) {
1965    if (flags & LIBSAIS_FLAGS_GSA) != 0 {
1966        buckets[4 * ALPHABET_SIZE] -= 1;
1967    }
1968
1969    let omp_num_threads = if threads > 1 && n >= 65_536 && m >= 65_536 {
1970        usize::try_from(threads)
1971            .expect("threads must be non-negative")
1972            .min(thread_state.len())
1973            .max(1)
1974    } else {
1975        1
1976    };
1977
1978    if omp_num_threads == 1 {
1979        radix_sort_lms_suffixes_8u(
1980            t,
1981            sa,
1982            &mut buckets[4 * ALPHABET_SIZE..],
1983            n as FastSint - m as FastSint + 1,
1984            m as FastSint - 1,
1985        );
1986        return;
1987    }
1988
1989    let (_, src_bucket) = buckets.split_at_mut(4 * ALPHABET_SIZE);
1990
1991    for state in thread_state.iter_mut().take(omp_num_threads) {
1992        for (i, j) in (0..=buckets_index2(ALPHABET_SIZE - 1, 0))
1993            .step_by(buckets_index2(1, 0))
1994            .zip((buckets_index4(0, 1)..).step_by(buckets_index4(1, 0)))
1995        {
1996            state.buckets[i] = src_bucket[i] - state.buckets[j];
1997        }
1998    }
1999
2000    for thread_num in 0..omp_num_threads {
2001        let mut omp_block_start = 0;
2002        for state in thread_state
2003            .iter()
2004            .take(omp_num_threads)
2005            .skip(thread_num)
2006            .rev()
2007        {
2008            omp_block_start += state.m;
2009        }
2010
2011        let mut omp_block_size = thread_state[thread_num].m;
2012        if omp_block_start == m as FastSint && omp_block_size > 0 {
2013            omp_block_start -= 1;
2014            omp_block_size -= 1;
2015        }
2016
2017        radix_sort_lms_suffixes_8u(
2018            t,
2019            sa,
2020            &mut thread_state[thread_num].buckets,
2021            n as FastSint - omp_block_start,
2022            omp_block_size,
2023        );
2024    }
2025}
2026
2027/// Internal helper: radix sort lms suffixes 32s 6k.
2028#[doc(hidden)]
2029pub fn radix_sort_lms_suffixes_32s_6k(
2030    t: &[SaSint],
2031    sa: &mut [SaSint],
2032    induction_bucket: &mut [SaSint],
2033    omp_block_start: FastSint,
2034    omp_block_size: FastSint,
2035) {
2036    let prefetch_distance = 64 as FastSint;
2037    let mut i = omp_block_start + omp_block_size - 1;
2038    let mut j = omp_block_start + 2 * prefetch_distance + 3;
2039
2040    while i >= j {
2041        let p0 = sa[i as usize];
2042        let idx0 = t[p0 as usize] as usize;
2043        induction_bucket[idx0] -= 1;
2044        sa[induction_bucket[idx0] as usize] = p0;
2045
2046        let p1 = sa[(i - 1) as usize];
2047        let idx1 = t[p1 as usize] as usize;
2048        induction_bucket[idx1] -= 1;
2049        sa[induction_bucket[idx1] as usize] = p1;
2050
2051        let p2 = sa[(i - 2) as usize];
2052        let idx2 = t[p2 as usize] as usize;
2053        induction_bucket[idx2] -= 1;
2054        sa[induction_bucket[idx2] as usize] = p2;
2055
2056        let p3 = sa[(i - 3) as usize];
2057        let idx3 = t[p3 as usize] as usize;
2058        induction_bucket[idx3] -= 1;
2059        sa[induction_bucket[idx3] as usize] = p3;
2060
2061        i -= 4;
2062    }
2063
2064    j -= 2 * prefetch_distance + 3;
2065    while i >= j {
2066        let p = sa[i as usize];
2067        let idx = t[p as usize] as usize;
2068        induction_bucket[idx] -= 1;
2069        sa[induction_bucket[idx] as usize] = p;
2070        i -= 1;
2071    }
2072}
2073
2074/// Internal helper: radix sort lms suffixes 32s 2k.
2075#[doc(hidden)]
2076pub fn radix_sort_lms_suffixes_32s_2k(
2077    t: &[SaSint],
2078    sa: &mut [SaSint],
2079    induction_bucket: &mut [SaSint],
2080    omp_block_start: FastSint,
2081    omp_block_size: FastSint,
2082) {
2083    let prefetch_distance = 64 as FastSint;
2084    let mut i = omp_block_start + omp_block_size - 1;
2085    let mut j = omp_block_start + 2 * prefetch_distance + 3;
2086
2087    while i >= j {
2088        let p0 = sa[i as usize];
2089        let idx0 = buckets_index2(t[p0 as usize] as usize, 0);
2090        induction_bucket[idx0] -= 1;
2091        sa[induction_bucket[idx0] as usize] = p0;
2092
2093        let p1 = sa[(i - 1) as usize];
2094        let idx1 = buckets_index2(t[p1 as usize] as usize, 0);
2095        induction_bucket[idx1] -= 1;
2096        sa[induction_bucket[idx1] as usize] = p1;
2097
2098        let p2 = sa[(i - 2) as usize];
2099        let idx2 = buckets_index2(t[p2 as usize] as usize, 0);
2100        induction_bucket[idx2] -= 1;
2101        sa[induction_bucket[idx2] as usize] = p2;
2102
2103        let p3 = sa[(i - 3) as usize];
2104        let idx3 = buckets_index2(t[p3 as usize] as usize, 0);
2105        induction_bucket[idx3] -= 1;
2106        sa[induction_bucket[idx3] as usize] = p3;
2107
2108        i -= 4;
2109    }
2110
2111    j -= 2 * prefetch_distance + 3;
2112    while i >= j {
2113        let p = sa[i as usize];
2114        let idx = buckets_index2(t[p as usize] as usize, 0);
2115        induction_bucket[idx] -= 1;
2116        sa[induction_bucket[idx] as usize] = p;
2117        i -= 1;
2118    }
2119}
2120
2121/// Internal helper: radix sort lms suffixes 32s block gather.
2122#[doc(hidden)]
2123pub fn radix_sort_lms_suffixes_32s_block_gather(
2124    t: &[SaSint],
2125    sa: &[SaSint],
2126    cache: &mut [ThreadCache],
2127    omp_block_start: FastSint,
2128    omp_block_size: FastSint,
2129) {
2130    let start = usize::try_from(omp_block_start).expect("block start must be non-negative");
2131    let mut i = omp_block_start;
2132    let mut j = omp_block_start + omp_block_size - 64 - 3;
2133
2134    while i < j {
2135        for current in [i, i + 1, i + 2, i + 3] {
2136            let ci = current as usize - start;
2137            let index = sa[current as usize];
2138            cache[ci].index = index;
2139            cache[ci].symbol = t[index as usize];
2140        }
2141        i += 4;
2142    }
2143
2144    j += 64 + 3;
2145    while i < j {
2146        let ci = i as usize - start;
2147        let index = sa[i as usize];
2148        cache[ci].index = index;
2149        cache[ci].symbol = t[index as usize];
2150        i += 1;
2151    }
2152}
2153
2154/// Internal helper: radix sort lms suffixes 32s 6k block sort.
2155#[doc(hidden)]
2156pub fn radix_sort_lms_suffixes_32s_6k_block_sort(
2157    induction_bucket: &mut [SaSint],
2158    cache: &mut [ThreadCache],
2159    omp_block_start: FastSint,
2160    omp_block_size: FastSint,
2161) {
2162    let start = usize::try_from(omp_block_start).expect("block start must be non-negative");
2163    let mut i = omp_block_start + omp_block_size - 1;
2164    let mut j = omp_block_start + 64 + 3;
2165
2166    while i >= j {
2167        for current in [i, i - 1, i - 2, i - 3] {
2168            let ci = current as usize - start;
2169            let v = cache[ci].symbol as usize;
2170            induction_bucket[v] -= 1;
2171            cache[ci].symbol = induction_bucket[v];
2172        }
2173        i -= 4;
2174    }
2175
2176    j -= 64 + 3;
2177    while i >= j {
2178        let ci = i as usize - start;
2179        let v = cache[ci].symbol as usize;
2180        induction_bucket[v] -= 1;
2181        cache[ci].symbol = induction_bucket[v];
2182        i -= 1;
2183    }
2184}
2185
2186/// Internal helper: radix sort lms suffixes 32s 2k block sort.
2187#[doc(hidden)]
2188pub fn radix_sort_lms_suffixes_32s_2k_block_sort(
2189    induction_bucket: &mut [SaSint],
2190    cache: &mut [ThreadCache],
2191    omp_block_start: FastSint,
2192    omp_block_size: FastSint,
2193) {
2194    let start = usize::try_from(omp_block_start).expect("block start must be non-negative");
2195    let mut i = omp_block_start + omp_block_size - 1;
2196    let mut j = omp_block_start + 64 + 3;
2197
2198    while i >= j {
2199        for current in [i, i - 1, i - 2, i - 3] {
2200            let ci = current as usize - start;
2201            let v = buckets_index2(cache[ci].symbol as usize, 0);
2202            induction_bucket[v] -= 1;
2203            cache[ci].symbol = induction_bucket[v];
2204        }
2205        i -= 4;
2206    }
2207
2208    j -= 64 + 3;
2209    while i >= j {
2210        let ci = i as usize - start;
2211        let v = buckets_index2(cache[ci].symbol as usize, 0);
2212        induction_bucket[v] -= 1;
2213        cache[ci].symbol = induction_bucket[v];
2214        i -= 1;
2215    }
2216}
2217
2218/// Internal helper: radix sort lms suffixes 32s 6k block (OpenMP variant).
2219#[doc(hidden)]
2220pub fn radix_sort_lms_suffixes_32s_6k_block_omp(
2221    t: &[SaSint],
2222    sa: &mut [SaSint],
2223    induction_bucket: &mut [SaSint],
2224    cache: &mut [ThreadCache],
2225    block_start: FastSint,
2226    block_size: FastSint,
2227    threads: SaSint,
2228) {
2229    if threads <= 1 || block_size < 16_384 {
2230        radix_sort_lms_suffixes_32s_6k(t, sa, induction_bucket, block_start, block_size);
2231        return;
2232    }
2233
2234    let block_size_usize = usize::try_from(block_size).expect("block_size must be non-negative");
2235    let threads_usize = usize::try_from(threads)
2236        .expect("threads must be positive")
2237        .min(block_size_usize.max(1));
2238    let omp_block_stride = (block_size_usize / threads_usize) & !15usize;
2239
2240    for omp_thread_num in 0..threads_usize {
2241        let omp_block_start = omp_thread_num * omp_block_stride;
2242        let omp_block_size = if omp_thread_num + 1 < threads_usize {
2243            omp_block_stride
2244        } else {
2245            block_size_usize - omp_block_start
2246        };
2247        if omp_block_size > 0 {
2248            radix_sort_lms_suffixes_32s_block_gather(
2249                t,
2250                sa,
2251                &mut cache[omp_block_start..],
2252                block_start + omp_block_start as FastSint,
2253                omp_block_size as FastSint,
2254            );
2255        }
2256    }
2257
2258    radix_sort_lms_suffixes_32s_6k_block_sort(induction_bucket, cache, block_start, block_size);
2259
2260    for omp_thread_num in 0..threads_usize {
2261        let omp_block_start = omp_thread_num * omp_block_stride;
2262        let omp_block_size = if omp_thread_num + 1 < threads_usize {
2263            omp_block_stride
2264        } else {
2265            block_size_usize - omp_block_start
2266        };
2267        if omp_block_size > 0 {
2268            place_cached_suffixes(sa, &cache[omp_block_start..], 0, omp_block_size as FastSint);
2269        }
2270    }
2271}
2272
2273/// Internal helper: radix sort lms suffixes 32s 2k block (OpenMP variant).
2274#[doc(hidden)]
2275pub fn radix_sort_lms_suffixes_32s_2k_block_omp(
2276    t: &[SaSint],
2277    sa: &mut [SaSint],
2278    induction_bucket: &mut [SaSint],
2279    cache: &mut [ThreadCache],
2280    block_start: FastSint,
2281    block_size: FastSint,
2282    threads: SaSint,
2283) {
2284    if threads <= 1 || block_size < 16_384 {
2285        radix_sort_lms_suffixes_32s_2k(t, sa, induction_bucket, block_start, block_size);
2286        return;
2287    }
2288
2289    let block_size_usize = usize::try_from(block_size).expect("block_size must be non-negative");
2290    let threads_usize = usize::try_from(threads)
2291        .expect("threads must be positive")
2292        .min(block_size_usize.max(1));
2293    let omp_block_stride = (block_size_usize / threads_usize) & !15usize;
2294
2295    for omp_thread_num in 0..threads_usize {
2296        let omp_block_start = omp_thread_num * omp_block_stride;
2297        let omp_block_size = if omp_thread_num + 1 < threads_usize {
2298            omp_block_stride
2299        } else {
2300            block_size_usize - omp_block_start
2301        };
2302        if omp_block_size > 0 {
2303            radix_sort_lms_suffixes_32s_block_gather(
2304                t,
2305                sa,
2306                &mut cache[omp_block_start..],
2307                block_start + omp_block_start as FastSint,
2308                omp_block_size as FastSint,
2309            );
2310        }
2311    }
2312
2313    radix_sort_lms_suffixes_32s_2k_block_sort(induction_bucket, cache, block_start, block_size);
2314
2315    for omp_thread_num in 0..threads_usize {
2316        let omp_block_start = omp_thread_num * omp_block_stride;
2317        let omp_block_size = if omp_thread_num + 1 < threads_usize {
2318            omp_block_stride
2319        } else {
2320            block_size_usize - omp_block_start
2321        };
2322        if omp_block_size > 0 {
2323            place_cached_suffixes(sa, &cache[omp_block_start..], 0, omp_block_size as FastSint);
2324        }
2325    }
2326}
2327
2328/// Internal helper: radix sort lms suffixes 32s 6k (OpenMP variant).
2329#[doc(hidden)]
2330pub fn radix_sort_lms_suffixes_32s_6k_omp(
2331    t: &[SaSint],
2332    sa: &mut [SaSint],
2333    n: SaSint,
2334    m: SaSint,
2335    induction_bucket: &mut [SaSint],
2336    threads: SaSint,
2337    _thread_state: &mut [ThreadState],
2338) {
2339    if threads <= 1 || m < 65_536 {
2340        radix_sort_lms_suffixes_32s_6k(
2341            t,
2342            sa,
2343            induction_bucket,
2344            n as FastSint - m as FastSint + 1,
2345            m as FastSint - 1,
2346        );
2347        return;
2348    }
2349
2350    let threads_usize = usize::try_from(threads).expect("threads must be positive");
2351    let mut cache = vec![ThreadCache::default(); threads_usize * LIBSAIS_PER_THREAD_CACHE_SIZE];
2352    let mut block_start = 0usize;
2353    let m_usize = usize::try_from(m).expect("m must be non-negative");
2354    let n_usize = usize::try_from(n).expect("n must be non-negative");
2355    let last = m_usize - 1;
2356
2357    while block_start < last {
2358        let block_end = (block_start + threads_usize * LIBSAIS_PER_THREAD_CACHE_SIZE).min(last);
2359        radix_sort_lms_suffixes_32s_6k_block_omp(
2360            t,
2361            sa,
2362            induction_bucket,
2363            &mut cache,
2364            (n_usize - block_end) as FastSint,
2365            (block_end - block_start) as FastSint,
2366            threads,
2367        );
2368        block_start = block_end;
2369    }
2370}
2371
2372/// Internal helper: radix sort lms suffixes 32s 2k (OpenMP variant).
2373#[doc(hidden)]
2374pub fn radix_sort_lms_suffixes_32s_2k_omp(
2375    t: &[SaSint],
2376    sa: &mut [SaSint],
2377    n: SaSint,
2378    m: SaSint,
2379    induction_bucket: &mut [SaSint],
2380    threads: SaSint,
2381    _thread_state: &mut [ThreadState],
2382) {
2383    if threads <= 1 || m < 65_536 {
2384        radix_sort_lms_suffixes_32s_2k(
2385            t,
2386            sa,
2387            induction_bucket,
2388            n as FastSint - m as FastSint + 1,
2389            m as FastSint - 1,
2390        );
2391        return;
2392    }
2393
2394    let threads_usize = usize::try_from(threads).expect("threads must be positive");
2395    let mut cache = vec![ThreadCache::default(); threads_usize * LIBSAIS_PER_THREAD_CACHE_SIZE];
2396    let mut block_start = 0usize;
2397    let m_usize = usize::try_from(m).expect("m must be non-negative");
2398    let n_usize = usize::try_from(n).expect("n must be non-negative");
2399    let last = m_usize - 1;
2400
2401    while block_start < last {
2402        let block_end = (block_start + threads_usize * LIBSAIS_PER_THREAD_CACHE_SIZE).min(last);
2403        radix_sort_lms_suffixes_32s_2k_block_omp(
2404            t,
2405            sa,
2406            induction_bucket,
2407            &mut cache,
2408            (n_usize - block_end) as FastSint,
2409            (block_end - block_start) as FastSint,
2410            threads,
2411        );
2412        block_start = block_end;
2413    }
2414}
2415
2416/// Internal helper: radix sort lms suffixes 32s 1k.
2417#[doc(hidden)]
2418pub fn radix_sort_lms_suffixes_32s_1k(
2419    t: &[SaSint],
2420    sa: &mut [SaSint],
2421    n: SaSint,
2422    buckets: &mut [SaSint],
2423) -> SaSint {
2424    let n_usize = usize::try_from(n).expect("n must be non-negative");
2425    let mut i = n as FastSint - 2;
2426    let mut m = 0;
2427    let mut f0 = 1usize;
2428    let mut f1: usize;
2429    let mut c0 = t[n_usize - 1] as FastSint;
2430    let mut c1: FastSint;
2431    let mut c2 = 0 as FastSint;
2432
2433    while i >= 67 {
2434        c1 = t[i as usize] as FastSint;
2435        f1 = usize::from(c1 > (c0 - f0 as FastSint));
2436        if (f1 & !f0) != 0 {
2437            c2 = c0;
2438            buckets[c2 as usize] -= 1;
2439            sa[buckets[c2 as usize] as usize] = (i + 1) as SaSint;
2440            m += 1;
2441        }
2442
2443        c0 = t[(i - 1) as usize] as FastSint;
2444        f0 = usize::from(c0 > (c1 - f1 as FastSint));
2445        if (f0 & !f1) != 0 {
2446            c2 = c1;
2447            buckets[c2 as usize] -= 1;
2448            sa[buckets[c2 as usize] as usize] = i as SaSint;
2449            m += 1;
2450        }
2451
2452        c1 = t[(i - 2) as usize] as FastSint;
2453        f1 = usize::from(c1 > (c0 - f0 as FastSint));
2454        if (f1 & !f0) != 0 {
2455            c2 = c0;
2456            buckets[c2 as usize] -= 1;
2457            sa[buckets[c2 as usize] as usize] = (i - 1) as SaSint;
2458            m += 1;
2459        }
2460
2461        c0 = t[(i - 3) as usize] as FastSint;
2462        f0 = usize::from(c0 > (c1 - f1 as FastSint));
2463        if (f0 & !f1) != 0 {
2464            c2 = c1;
2465            buckets[c2 as usize] -= 1;
2466            sa[buckets[c2 as usize] as usize] = (i - 2) as SaSint;
2467            m += 1;
2468        }
2469
2470        i -= 4;
2471    }
2472
2473    while i >= 0 {
2474        c1 = c0;
2475        c0 = t[i as usize] as FastSint;
2476        f1 = f0;
2477        f0 = usize::from(c0 > (c1 - f1 as FastSint));
2478        if (f0 & !f1) != 0 {
2479            c2 = c1;
2480            buckets[c2 as usize] -= 1;
2481            sa[buckets[c2 as usize] as usize] = (i + 1) as SaSint;
2482            m += 1;
2483        }
2484        i -= 1;
2485    }
2486
2487    if m > 1 {
2488        sa[buckets[c2 as usize] as usize] = 0;
2489    }
2490
2491    m
2492}
2493
2494/// Internal helper: radix sort set markers 32s 6k.
2495#[doc(hidden)]
2496pub fn radix_sort_set_markers_32s_6k(
2497    sa: &mut [SaSint],
2498    induction_bucket: &[SaSint],
2499    omp_block_start: FastSint,
2500    omp_block_size: FastSint,
2501) {
2502    let mut i = omp_block_start;
2503    let mut j = omp_block_start + omp_block_size - 67;
2504
2505    while i < j {
2506        sa[induction_bucket[i as usize] as usize] |= SAINT_MIN;
2507        sa[induction_bucket[(i + 1) as usize] as usize] |= SAINT_MIN;
2508        sa[induction_bucket[(i + 2) as usize] as usize] |= SAINT_MIN;
2509        sa[induction_bucket[(i + 3) as usize] as usize] |= SAINT_MIN;
2510        i += 4;
2511    }
2512
2513    j += 67;
2514    while i < j {
2515        sa[induction_bucket[i as usize] as usize] |= SAINT_MIN;
2516        i += 1;
2517    }
2518}
2519
2520/// Internal helper: radix sort set markers 32s 4k.
2521#[doc(hidden)]
2522pub fn radix_sort_set_markers_32s_4k(
2523    sa: &mut [SaSint],
2524    induction_bucket: &[SaSint],
2525    omp_block_start: FastSint,
2526    omp_block_size: FastSint,
2527) {
2528    let mut i = omp_block_start;
2529    let mut j = omp_block_start + omp_block_size - 67;
2530
2531    while i < j {
2532        sa[induction_bucket[buckets_index2(i as usize, 0)] as usize] |= SUFFIX_GROUP_MARKER;
2533        sa[induction_bucket[buckets_index2((i + 1) as usize, 0)] as usize] |= SUFFIX_GROUP_MARKER;
2534        sa[induction_bucket[buckets_index2((i + 2) as usize, 0)] as usize] |= SUFFIX_GROUP_MARKER;
2535        sa[induction_bucket[buckets_index2((i + 3) as usize, 0)] as usize] |= SUFFIX_GROUP_MARKER;
2536        i += 4;
2537    }
2538
2539    j += 67;
2540    while i < j {
2541        sa[induction_bucket[buckets_index2(i as usize, 0)] as usize] |= SUFFIX_GROUP_MARKER;
2542        i += 1;
2543    }
2544}
2545
2546/// Internal helper: radix sort set markers 32s 6k (OpenMP variant).
2547#[doc(hidden)]
2548pub fn radix_sort_set_markers_32s_6k_omp(
2549    sa: &mut [SaSint],
2550    k: SaSint,
2551    induction_bucket: &[SaSint],
2552    threads: SaSint,
2553) {
2554    if k <= 1 {
2555        return;
2556    }
2557
2558    if threads <= 1 || k < 65_536 {
2559        radix_sort_set_markers_32s_6k(sa, induction_bucket, 0, k as FastSint - 1);
2560        return;
2561    }
2562
2563    let threads_usize = usize::try_from(threads).expect("threads must be positive");
2564    let last = usize::try_from(k - 1).expect("k must be positive");
2565    let stride = (last / threads_usize) & !15usize;
2566    let mut start = 0usize;
2567
2568    for thread in 0..threads_usize {
2569        let end = if thread + 1 == threads_usize {
2570            last
2571        } else {
2572            start + stride
2573        };
2574        if end > start {
2575            radix_sort_set_markers_32s_6k(
2576                sa,
2577                induction_bucket,
2578                start as FastSint,
2579                (end - start) as FastSint,
2580            );
2581        }
2582        start = end;
2583    }
2584}
2585
2586/// Internal helper: radix sort set markers 32s 4k (OpenMP variant).
2587#[doc(hidden)]
2588pub fn radix_sort_set_markers_32s_4k_omp(
2589    sa: &mut [SaSint],
2590    k: SaSint,
2591    induction_bucket: &[SaSint],
2592    threads: SaSint,
2593) {
2594    if k <= 1 {
2595        return;
2596    }
2597
2598    if threads <= 1 || k < 65_536 {
2599        radix_sort_set_markers_32s_4k(sa, induction_bucket, 0, k as FastSint - 1);
2600        return;
2601    }
2602
2603    let threads_usize = usize::try_from(threads).expect("threads must be positive");
2604    let last = usize::try_from(k - 1).expect("k must be positive");
2605    let stride = (last / threads_usize) & !15usize;
2606    let mut start = 0usize;
2607
2608    for thread in 0..threads_usize {
2609        let end = if thread + 1 == threads_usize {
2610            last
2611        } else {
2612            start + stride
2613        };
2614        if end > start {
2615            radix_sort_set_markers_32s_4k(
2616                sa,
2617                induction_bucket,
2618                start as FastSint,
2619                (end - start) as FastSint,
2620            );
2621        }
2622        start = end;
2623    }
2624}
2625
2626/// Internal helper: initialize buckets for partial sorting 8u.
2627#[doc(hidden)]
2628pub fn initialize_buckets_for_partial_sorting_8u(
2629    t: &[u8],
2630    buckets: &mut [SaSint],
2631    first_lms_suffix: SaSint,
2632    left_suffixes_count: SaSint,
2633) {
2634    let temp_offset = 4 * ALPHABET_SIZE;
2635    buckets[buckets_index4(t[first_lms_suffix as usize] as usize, 1)] += 1;
2636
2637    let mut sum0 = left_suffixes_count + 1;
2638    let mut sum1 = 0;
2639    for j in 0..ALPHABET_SIZE {
2640        let i = buckets_index4(j, 0);
2641        let tj = buckets_index2(j, 0);
2642        buckets[temp_offset + tj] = sum0;
2643        sum0 += buckets[i] + buckets[i + 2];
2644        sum1 += buckets[i + 1];
2645        buckets[tj] = sum0;
2646        buckets[tj + 1] = sum1;
2647    }
2648}
2649
2650/// Internal helper: initialize buckets for partial sorting 32s 6k.
2651#[doc(hidden)]
2652pub fn initialize_buckets_for_partial_sorting_32s_6k(
2653    t: &[SaSint],
2654    k: SaSint,
2655    buckets: &mut [SaSint],
2656    first_lms_suffix: SaSint,
2657    left_suffixes_count: SaSint,
2658) {
2659    let k_usize = usize::try_from(k).expect("k must be non-negative");
2660    let temp_offset = 4 * k_usize;
2661    let first_symbol = t[first_lms_suffix as usize] as usize;
2662    let mut sum0 = left_suffixes_count + 1;
2663    let mut sum1 = 0;
2664    let mut sum2 = 0;
2665
2666    for j in 0..first_symbol {
2667        let i = buckets_index4(j, 0);
2668        let tj = buckets_index2(j, 0);
2669        let ss = buckets[i];
2670        let ls = buckets[i + 1];
2671        let sl = buckets[i + 2];
2672        let ll = buckets[i + 3];
2673
2674        buckets[i] = sum0;
2675        buckets[i + 1] = sum2;
2676        buckets[i + 2] = 0;
2677        buckets[i + 3] = 0;
2678
2679        sum0 += ss + sl;
2680        sum1 += ls;
2681        sum2 += ls + ll;
2682
2683        buckets[temp_offset + tj] = sum0;
2684        buckets[temp_offset + tj + 1] = sum1;
2685    }
2686
2687    sum1 += 1;
2688    for j in first_symbol..k_usize {
2689        let i = buckets_index4(j, 0);
2690        let tj = buckets_index2(j, 0);
2691        let ss = buckets[i];
2692        let ls = buckets[i + 1];
2693        let sl = buckets[i + 2];
2694        let ll = buckets[i + 3];
2695
2696        buckets[i] = sum0;
2697        buckets[i + 1] = sum2;
2698        buckets[i + 2] = 0;
2699        buckets[i + 3] = 0;
2700
2701        sum0 += ss + sl;
2702        sum1 += ls;
2703        sum2 += ls + ll;
2704
2705        buckets[temp_offset + tj] = sum0;
2706        buckets[temp_offset + tj + 1] = sum1;
2707    }
2708}
2709
2710/// Internal helper: partial sorting scan left to right 8u.
2711#[doc(hidden)]
2712pub fn partial_sorting_scan_left_to_right_8u(
2713    t: &[u8],
2714    sa: &mut [SaSint],
2715    buckets: &mut [SaSint],
2716    mut d: SaSint,
2717    omp_block_start: FastSint,
2718    omp_block_size: FastSint,
2719) -> SaSint {
2720    let induction_offset = 4 * ALPHABET_SIZE;
2721    let distinct_offset = 2 * ALPHABET_SIZE;
2722    let prefetch_distance = 64 as FastSint;
2723    let mut i = omp_block_start;
2724    let mut j = if omp_block_size > prefetch_distance + 1 {
2725        omp_block_start + omp_block_size - prefetch_distance - 1
2726    } else {
2727        omp_block_start
2728    };
2729
2730    while i < j {
2731        let mut p0 = sa[i as usize];
2732        d += SaSint::from(p0 < 0);
2733        p0 &= SAINT_MAX;
2734        let v0 = buckets_index2(
2735            t[(p0 - 1) as usize] as usize,
2736            usize::from(t[(p0 - 2) as usize] >= t[(p0 - 1) as usize]),
2737        );
2738        let pos0 = buckets[induction_offset + v0] as usize;
2739        sa[pos0] = (p0 - 1) | (((buckets[distinct_offset + v0] != d) as SaSint) << (SAINT_BIT - 1));
2740        buckets[induction_offset + v0] += 1;
2741        buckets[distinct_offset + v0] = d;
2742
2743        let mut p1 = sa[(i + 1) as usize];
2744        d += SaSint::from(p1 < 0);
2745        p1 &= SAINT_MAX;
2746        let v1 = buckets_index2(
2747            t[(p1 - 1) as usize] as usize,
2748            usize::from(t[(p1 - 2) as usize] >= t[(p1 - 1) as usize]),
2749        );
2750        let pos1 = buckets[induction_offset + v1] as usize;
2751        sa[pos1] = (p1 - 1) | (((buckets[distinct_offset + v1] != d) as SaSint) << (SAINT_BIT - 1));
2752        buckets[induction_offset + v1] += 1;
2753        buckets[distinct_offset + v1] = d;
2754
2755        i += 2;
2756    }
2757
2758    j = omp_block_start + omp_block_size;
2759    while i < j {
2760        let mut p = sa[i as usize];
2761        d += SaSint::from(p < 0);
2762        p &= SAINT_MAX;
2763        let v = buckets_index2(
2764            t[(p - 1) as usize] as usize,
2765            usize::from(t[(p - 2) as usize] >= t[(p - 1) as usize]),
2766        );
2767        let pos = buckets[induction_offset + v] as usize;
2768        sa[pos] = (p - 1) | (((buckets[distinct_offset + v] != d) as SaSint) << (SAINT_BIT - 1));
2769        buckets[induction_offset + v] += 1;
2770        buckets[distinct_offset + v] = d;
2771        i += 1;
2772    }
2773
2774    d
2775}
2776
2777/// Internal helper: partial sorting scan left to right 8u (OpenMP variant).
2778#[doc(hidden)]
2779pub fn partial_sorting_scan_left_to_right_8u_omp(
2780    t: &[u8],
2781    sa: &mut [SaSint],
2782    n: SaSint,
2783    k: SaSint,
2784    buckets: &mut [SaSint],
2785    left_suffixes_count: SaSint,
2786    mut d: SaSint,
2787    threads: SaSint,
2788    thread_state: &mut [ThreadState],
2789) -> SaSint {
2790    let v = buckets_index2(
2791        t[(n - 1) as usize] as usize,
2792        usize::from(t[(n - 2) as usize] >= t[(n - 1) as usize]),
2793    );
2794    let induction_offset = 4 * ALPHABET_SIZE;
2795    let distinct_offset = 2 * ALPHABET_SIZE;
2796    let pos = buckets[induction_offset + v] as usize;
2797    sa[pos] = (n - 1) | SAINT_MIN;
2798    buckets[induction_offset + v] += 1;
2799    d += 1;
2800    buckets[distinct_offset + v] = d;
2801
2802    if threads == 1 || left_suffixes_count < 65_536 {
2803        return partial_sorting_scan_left_to_right_8u(
2804            t,
2805            sa,
2806            buckets,
2807            d,
2808            0,
2809            left_suffixes_count as FastSint,
2810        );
2811    }
2812
2813    let mut block_start = 0usize;
2814    let left_suffixes_count =
2815        usize::try_from(left_suffixes_count).expect("left_suffixes_count must be non-negative");
2816    let threads_usize = usize::try_from(threads)
2817        .expect("threads must be non-negative")
2818        .min(thread_state.len())
2819        .max(1);
2820    while block_start < left_suffixes_count {
2821        if sa[block_start] == 0 {
2822            block_start += 1;
2823        } else {
2824            let mut block_max_end =
2825                block_start + threads_usize * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * threads_usize);
2826            if block_max_end > left_suffixes_count {
2827                block_max_end = left_suffixes_count;
2828            }
2829            let mut block_end = block_start + 1;
2830            while block_end < block_max_end && sa[block_end] != 0 {
2831                block_end += 1;
2832            }
2833            let block_size = block_end - block_start;
2834
2835            if block_size < 32 {
2836                while block_start < block_end {
2837                    let p = sa[block_start];
2838                    d += SaSint::from(p < 0);
2839                    let p = p & SAINT_MAX;
2840                    let v = buckets_index2(
2841                        t[(p - 1) as usize] as usize,
2842                        usize::from(t[(p - 2) as usize] >= t[(p - 1) as usize]),
2843                    );
2844                    let pos = buckets[induction_offset + v] as usize;
2845                    sa[pos] = (p - 1)
2846                        | (((buckets[distinct_offset + v] != d) as SaSint) << (SAINT_BIT - 1));
2847                    buckets[induction_offset + v] += 1;
2848                    buckets[distinct_offset + v] = d;
2849                    block_start += 1;
2850                }
2851            } else {
2852                d = partial_sorting_scan_left_to_right_8u_block_omp(
2853                    t,
2854                    sa,
2855                    k,
2856                    buckets,
2857                    d,
2858                    block_start as FastSint,
2859                    block_size as FastSint,
2860                    threads,
2861                    thread_state,
2862                );
2863                block_start = block_end;
2864            }
2865        }
2866    }
2867
2868    d
2869}
2870
2871/// Internal helper: partial sorting scan left to right 32s 6k.
2872#[doc(hidden)]
2873pub fn partial_sorting_scan_left_to_right_32s_6k(
2874    t: &[SaSint],
2875    sa: &mut [SaSint],
2876    buckets: &mut [SaSint],
2877    mut d: SaSint,
2878    omp_block_start: FastSint,
2879    omp_block_size: FastSint,
2880) -> SaSint {
2881    let prefetch_distance: FastSint = 64;
2882
2883    let mut i = omp_block_start;
2884    let mut j = omp_block_start + omp_block_size - 2 * prefetch_distance - 1;
2885    while i < j {
2886        let mut p0 = sa[i as usize];
2887        d += SaSint::from(p0 < 0);
2888        p0 &= SAINT_MAX;
2889        let p0u = p0 as usize;
2890        let v0 = buckets_index4(t[p0u - 1] as usize, usize::from(t[p0u - 2] >= t[p0u - 1]));
2891        let pos0 = buckets[v0] as usize;
2892        sa[pos0] = (p0 - 1) | (((buckets[2 + v0] != d) as SaSint) << (SAINT_BIT - 1));
2893        buckets[v0] += 1;
2894        buckets[2 + v0] = d;
2895
2896        let mut p1 = sa[(i + 1) as usize];
2897        d += SaSint::from(p1 < 0);
2898        p1 &= SAINT_MAX;
2899        let p1u = p1 as usize;
2900        let v1 = buckets_index4(t[p1u - 1] as usize, usize::from(t[p1u - 2] >= t[p1u - 1]));
2901        let pos1 = buckets[v1] as usize;
2902        sa[pos1] = (p1 - 1) | (((buckets[2 + v1] != d) as SaSint) << (SAINT_BIT - 1));
2903        buckets[v1] += 1;
2904        buckets[2 + v1] = d;
2905
2906        i += 2;
2907    }
2908
2909    j += 2 * prefetch_distance + 1;
2910    while i < j {
2911        let mut p = sa[i as usize];
2912        d += SaSint::from(p < 0);
2913        p &= SAINT_MAX;
2914        let pu = p as usize;
2915        let v = buckets_index4(t[pu - 1] as usize, usize::from(t[pu - 2] >= t[pu - 1]));
2916        let pos = buckets[v] as usize;
2917        sa[pos] = (p - 1) | (((buckets[2 + v] != d) as SaSint) << (SAINT_BIT - 1));
2918        buckets[v] += 1;
2919        buckets[2 + v] = d;
2920        i += 1;
2921    }
2922
2923    d
2924}
2925
2926/// Internal helper: partial sorting scan left to right 32s 4k.
2927#[doc(hidden)]
2928pub fn partial_sorting_scan_left_to_right_32s_4k(
2929    t: &[SaSint],
2930    sa: &mut [SaSint],
2931    k: SaSint,
2932    buckets: &mut [SaSint],
2933    mut d: SaSint,
2934    omp_block_start: FastSint,
2935    omp_block_size: FastSint,
2936) -> SaSint {
2937    let k_usize = usize::try_from(k).expect("k must be non-negative");
2938    let prefetch_distance: FastSint = 64;
2939    let induction_offset = 2 * k_usize;
2940    let mut i = omp_block_start;
2941    let mut j = omp_block_start + omp_block_size - 2 * prefetch_distance - 1;
2942
2943    while i < j {
2944        let i0 = i as usize;
2945        let mut p0 = sa[i0];
2946        sa[i0] = p0 & SAINT_MAX;
2947        if p0 > 0 {
2948            sa[i0] = 0;
2949            d += p0 >> (SUFFIX_GROUP_BIT - 1);
2950            p0 &= !SUFFIX_GROUP_MARKER;
2951            let p0u = p0 as usize;
2952            let c0 = t[p0u - 1];
2953            let f0 = usize::from(t[p0u - 2] < c0);
2954            let v0 = buckets_index2(c0 as usize, f0);
2955            let c0u = c0 as usize;
2956            let pos0 = buckets[induction_offset + c0u] as usize;
2957            sa[pos0] = (p0 - 1)
2958                | ((f0 as SaSint) << (SAINT_BIT - 1))
2959                | (((buckets[v0] != d) as SaSint) << (SUFFIX_GROUP_BIT - 1));
2960            buckets[induction_offset + c0u] += 1;
2961            buckets[v0] = d;
2962        }
2963
2964        let i1 = (i + 1) as usize;
2965        let mut p1 = sa[i1];
2966        sa[i1] = p1 & SAINT_MAX;
2967        if p1 > 0 {
2968            sa[i1] = 0;
2969            d += p1 >> (SUFFIX_GROUP_BIT - 1);
2970            p1 &= !SUFFIX_GROUP_MARKER;
2971            let p1u = p1 as usize;
2972            let c1 = t[p1u - 1];
2973            let f1 = usize::from(t[p1u - 2] < c1);
2974            let v1 = buckets_index2(c1 as usize, f1);
2975            let c1u = c1 as usize;
2976            let pos1 = buckets[induction_offset + c1u] as usize;
2977            sa[pos1] = (p1 - 1)
2978                | ((f1 as SaSint) << (SAINT_BIT - 1))
2979                | (((buckets[v1] != d) as SaSint) << (SUFFIX_GROUP_BIT - 1));
2980            buckets[induction_offset + c1u] += 1;
2981            buckets[v1] = d;
2982        }
2983
2984        i += 2;
2985    }
2986
2987    j += 2 * prefetch_distance + 1;
2988    while i < j {
2989        let iu = i as usize;
2990        let mut p = sa[iu];
2991        sa[iu] = p & SAINT_MAX;
2992        if p > 0 {
2993            sa[iu] = 0;
2994            d += p >> (SUFFIX_GROUP_BIT - 1);
2995            p &= !SUFFIX_GROUP_MARKER;
2996            let pu = p as usize;
2997            let c = t[pu - 1];
2998            let f = usize::from(t[pu - 2] < c);
2999            let v = buckets_index2(c as usize, f);
3000            let cu = c as usize;
3001            let pos = buckets[induction_offset + cu] as usize;
3002            sa[pos] = (p - 1)
3003                | ((f as SaSint) << (SAINT_BIT - 1))
3004                | (((buckets[v] != d) as SaSint) << (SUFFIX_GROUP_BIT - 1));
3005            buckets[induction_offset + cu] += 1;
3006            buckets[v] = d;
3007        }
3008        i += 1;
3009    }
3010
3011    d
3012}
3013
3014/// Internal helper: partial sorting scan left to right 32s 1k.
3015#[doc(hidden)]
3016pub fn partial_sorting_scan_left_to_right_32s_1k(
3017    t: &[SaSint],
3018    sa: &mut [SaSint],
3019    induction_bucket: &mut [SaSint],
3020    omp_block_start: FastSint,
3021    omp_block_size: FastSint,
3022) {
3023    let prefetch_distance = 64 as FastSint;
3024    let mut i = omp_block_start;
3025    let mut j = omp_block_start + omp_block_size - 2 * prefetch_distance - 1;
3026
3027    while i < j {
3028        let p0 = sa[i as usize];
3029        sa[i as usize] = p0 & SAINT_MAX;
3030        if p0 > 0 {
3031            sa[i as usize] = 0;
3032            let c0 = t[(p0 - 1) as usize] as usize;
3033            let pos0 = induction_bucket[c0] as usize;
3034            induction_bucket[c0] += 1;
3035            sa[pos0] = (p0 - 1)
3036                | ((usize::from(t[(p0 - 2) as usize] < t[(p0 - 1) as usize]) as SaSint)
3037                    << (SAINT_BIT - 1));
3038        }
3039
3040        let p1 = sa[(i + 1) as usize];
3041        sa[(i + 1) as usize] = p1 & SAINT_MAX;
3042        if p1 > 0 {
3043            sa[(i + 1) as usize] = 0;
3044            let c1 = t[(p1 - 1) as usize] as usize;
3045            let pos1 = induction_bucket[c1] as usize;
3046            induction_bucket[c1] += 1;
3047            sa[pos1] = (p1 - 1)
3048                | ((usize::from(t[(p1 - 2) as usize] < t[(p1 - 1) as usize]) as SaSint)
3049                    << (SAINT_BIT - 1));
3050        }
3051
3052        i += 2;
3053    }
3054
3055    j += 2 * prefetch_distance + 1;
3056    while i < j {
3057        let p = sa[i as usize];
3058        sa[i as usize] = p & SAINT_MAX;
3059        if p > 0 {
3060            sa[i as usize] = 0;
3061            let c = t[(p - 1) as usize] as usize;
3062            let pos = induction_bucket[c] as usize;
3063            induction_bucket[c] += 1;
3064            sa[pos] = (p - 1)
3065                | ((usize::from(t[(p - 2) as usize] < t[(p - 1) as usize]) as SaSint)
3066                    << (SAINT_BIT - 1));
3067        }
3068        i += 1;
3069    }
3070}
3071
3072/// Internal helper: partial sorting scan left to right 32s 6k (OpenMP variant).
3073#[doc(hidden)]
3074pub fn partial_sorting_scan_left_to_right_32s_6k_omp(
3075    t: &[SaSint],
3076    sa: &mut [SaSint],
3077    n: SaSint,
3078    buckets: &mut [SaSint],
3079    left_suffixes_count: SaSint,
3080    mut d: SaSint,
3081    threads: SaSint,
3082    thread_state: &mut [ThreadState],
3083) -> SaSint {
3084    let v = buckets_index4(
3085        t[(n - 1) as usize] as usize,
3086        usize::from(t[(n - 2) as usize] >= t[(n - 1) as usize]),
3087    );
3088    let pos = buckets[v] as usize;
3089    sa[pos] = (n - 1) | SAINT_MIN;
3090    buckets[v] += 1;
3091    d += 1;
3092    buckets[2 + v] = d;
3093    if threads == 1 || left_suffixes_count < 65_536 {
3094        return partial_sorting_scan_left_to_right_32s_6k(
3095            t,
3096            sa,
3097            buckets,
3098            d,
3099            0,
3100            left_suffixes_count as FastSint,
3101        );
3102    }
3103    if thread_state.is_empty() {
3104        return partial_sorting_scan_left_to_right_32s_6k(
3105            t,
3106            sa,
3107            buckets,
3108            d,
3109            0,
3110            left_suffixes_count as FastSint,
3111        );
3112    }
3113
3114    let left_suffixes_count =
3115        usize::try_from(left_suffixes_count).expect("left_suffixes_count must be non-negative");
3116    let threads_usize = usize::try_from(threads)
3117        .expect("threads must be non-negative")
3118        .max(1);
3119    let mut block_start = 0usize;
3120    let block_span = threads_usize * LIBSAIS_PER_THREAD_CACHE_SIZE;
3121    let mut cache = vec![ThreadCache::default(); block_span];
3122    while block_start < left_suffixes_count {
3123        let mut block_end = block_start + block_span;
3124        if block_end > left_suffixes_count {
3125            block_end = left_suffixes_count;
3126        }
3127
3128        d = partial_sorting_scan_left_to_right_32s_6k_block_omp(
3129            t,
3130            sa,
3131            buckets,
3132            d,
3133            &mut cache,
3134            block_start as FastSint,
3135            (block_end - block_start) as FastSint,
3136            threads,
3137        );
3138
3139        block_start = block_end;
3140    }
3141
3142    d
3143}
3144
3145/// Internal helper: partial sorting scan left to right 32s 4k (OpenMP variant).
3146#[doc(hidden)]
3147pub fn partial_sorting_scan_left_to_right_32s_4k_omp(
3148    t: &[SaSint],
3149    sa: &mut [SaSint],
3150    n: SaSint,
3151    k: SaSint,
3152    buckets: &mut [SaSint],
3153    mut d: SaSint,
3154    threads: SaSint,
3155    thread_state: &mut [ThreadState],
3156) -> SaSint {
3157    let k_usize = usize::try_from(k).expect("k must be non-negative");
3158    let induction_offset = 2 * k_usize;
3159    let distinct_offset = 0usize;
3160    let symbol = t[(n - 1) as usize] as usize;
3161    let is_s = usize::from(t[(n - 2) as usize] < t[(n - 1) as usize]);
3162    let pos = buckets[induction_offset + symbol] as usize;
3163    sa[pos] = (n - 1) | ((is_s as SaSint) << (SAINT_BIT - 1)) | SUFFIX_GROUP_MARKER;
3164    buckets[induction_offset + symbol] += 1;
3165    d += 1;
3166    buckets[distinct_offset + buckets_index2(symbol, is_s)] = d;
3167
3168    if threads == 1 || n < 65_536 {
3169        d = partial_sorting_scan_left_to_right_32s_4k(t, sa, k, buckets, d, 0, n as FastSint);
3170    } else {
3171        if thread_state.is_empty() {
3172            return partial_sorting_scan_left_to_right_32s_4k(
3173                t,
3174                sa,
3175                k,
3176                buckets,
3177                d,
3178                0,
3179                n as FastSint,
3180            );
3181        }
3182        let mut block_start = 0usize;
3183        let n_usize = usize::try_from(n).expect("n must be non-negative");
3184        let threads_usize = usize::try_from(threads)
3185            .expect("threads must be non-negative")
3186            .max(1);
3187        let chunk_capacity = threads_usize * LIBSAIS_PER_THREAD_CACHE_SIZE;
3188        let mut cache = vec![ThreadCache::default(); chunk_capacity];
3189
3190        while block_start < n_usize {
3191            let mut block_end = block_start + chunk_capacity;
3192            if block_end > n_usize {
3193                block_end = n_usize;
3194            }
3195
3196            d = partial_sorting_scan_left_to_right_32s_4k_block_omp(
3197                t,
3198                sa,
3199                k,
3200                buckets,
3201                d,
3202                &mut cache,
3203                block_start as FastSint,
3204                (block_end - block_start) as FastSint,
3205                threads,
3206            );
3207
3208            block_start = block_end;
3209        }
3210    }
3211
3212    d
3213}
3214
3215/// Internal helper: partial sorting scan left to right 32s 1k (OpenMP variant).
3216#[doc(hidden)]
3217pub fn partial_sorting_scan_left_to_right_32s_1k_omp(
3218    t: &[SaSint],
3219    sa: &mut [SaSint],
3220    n: SaSint,
3221    buckets: &mut [SaSint],
3222    threads: SaSint,
3223    thread_state: &mut [ThreadState],
3224) {
3225    let symbol = t[(n - 1) as usize] as usize;
3226    let pos = buckets[symbol] as usize;
3227    sa[pos] = (n - 1)
3228        | ((usize::from(t[(n - 2) as usize] < t[(n - 1) as usize]) as SaSint) << (SAINT_BIT - 1));
3229    buckets[symbol] += 1;
3230    if threads == 1 || n < 65_536 {
3231        partial_sorting_scan_left_to_right_32s_1k(t, sa, buckets, 0, n as FastSint);
3232    } else {
3233        if thread_state.is_empty() {
3234            partial_sorting_scan_left_to_right_32s_1k(t, sa, buckets, 0, n as FastSint);
3235            return;
3236        }
3237        let n_usize = usize::try_from(n).expect("n must be non-negative");
3238        let threads_usize = usize::try_from(threads)
3239            .expect("threads must be non-negative")
3240            .max(1);
3241        let mut block_start = 0usize;
3242        let block_span = threads_usize * LIBSAIS_PER_THREAD_CACHE_SIZE;
3243        let mut cache = vec![ThreadCache::default(); block_span];
3244
3245        while block_start < n_usize {
3246            let mut block_end = block_start + block_span;
3247            if block_end > n_usize {
3248                block_end = n_usize;
3249            }
3250
3251            partial_sorting_scan_left_to_right_32s_1k_block_omp(
3252                t,
3253                sa,
3254                buckets,
3255                &mut cache,
3256                block_start as FastSint,
3257                (block_end - block_start) as FastSint,
3258                threads,
3259            );
3260
3261            block_start = block_end;
3262        }
3263    }
3264}
3265
3266/// Internal helper: partial sorting scan left to right 8u block prepare.
3267#[doc(hidden)]
3268pub fn partial_sorting_scan_left_to_right_8u_block_prepare(
3269    t: &[u8],
3270    sa: &[SaSint],
3271    k: SaSint,
3272    buckets: &mut [SaSint],
3273    cache: &mut [ThreadCache],
3274    omp_block_start: FastSint,
3275    omp_block_size: FastSint,
3276) -> (FastSint, FastSint) {
3277    let k_usize = usize::try_from(k).expect("k must be non-negative");
3278    buckets[..2 * k_usize].fill(0);
3279    buckets[2 * k_usize..4 * k_usize].fill(0);
3280
3281    let mut i = omp_block_start;
3282    let mut j = omp_block_start + omp_block_size - 65;
3283    let mut count = 0usize;
3284    let mut d: SaSint = 1;
3285
3286    while i < j {
3287        let mut p0 = sa[i as usize];
3288        cache[count].index = p0;
3289        d += SaSint::from(p0 < 0);
3290        p0 &= SAINT_MAX;
3291        let v0 = buckets_index2(
3292            t[(p0 - 1) as usize] as usize,
3293            usize::from(t[(p0 - 2) as usize] >= t[(p0 - 1) as usize]),
3294        );
3295        cache[count].symbol = v0 as SaSint;
3296        count += 1;
3297        buckets[v0] += 1;
3298        buckets[2 * k_usize + v0] = d;
3299
3300        let mut p1 = sa[(i + 1) as usize];
3301        cache[count].index = p1;
3302        d += SaSint::from(p1 < 0);
3303        p1 &= SAINT_MAX;
3304        let v1 = buckets_index2(
3305            t[(p1 - 1) as usize] as usize,
3306            usize::from(t[(p1 - 2) as usize] >= t[(p1 - 1) as usize]),
3307        );
3308        cache[count].symbol = v1 as SaSint;
3309        count += 1;
3310        buckets[v1] += 1;
3311        buckets[2 * k_usize + v1] = d;
3312
3313        i += 2;
3314    }
3315
3316    j += 65;
3317    while i < j {
3318        let mut p = sa[i as usize];
3319        cache[count].index = p;
3320        d += SaSint::from(p < 0);
3321        p &= SAINT_MAX;
3322        let v = buckets_index2(
3323            t[(p - 1) as usize] as usize,
3324            usize::from(t[(p - 2) as usize] >= t[(p - 1) as usize]),
3325        );
3326        cache[count].symbol = v as SaSint;
3327        count += 1;
3328        buckets[v] += 1;
3329        buckets[2 * k_usize + v] = d;
3330        i += 1;
3331    }
3332
3333    (d as FastSint - 1, count as FastSint)
3334}
3335
3336/// Internal helper: partial sorting scan left to right 8u block place.
3337#[doc(hidden)]
3338pub fn partial_sorting_scan_left_to_right_8u_block_place(
3339    sa: &mut [SaSint],
3340    buckets: &mut [SaSint],
3341    k: SaSint,
3342    cache: &[ThreadCache],
3343    count: FastSint,
3344    mut d: SaSint,
3345) {
3346    let split = 2 * usize::try_from(k).expect("k must be non-negative");
3347    let (induction_bucket, distinct_names) = buckets.split_at_mut(split);
3348
3349    let mut i = 0usize;
3350    let mut j = usize::try_from(count)
3351        .expect("count must be non-negative")
3352        .saturating_sub(1);
3353    while i < j {
3354        let p0 = cache[i].index;
3355        d += SaSint::from(p0 < 0);
3356        let v0 = cache[i].symbol as usize;
3357        let pos0 = induction_bucket[v0] as usize;
3358        sa[pos0] = (p0 - 1) | (((distinct_names[v0] != d) as SaSint) << (SAINT_BIT - 1));
3359        induction_bucket[v0] += 1;
3360        distinct_names[v0] = d;
3361
3362        let p1 = cache[i + 1].index;
3363        d += SaSint::from(p1 < 0);
3364        let v1 = cache[i + 1].symbol as usize;
3365        let pos1 = induction_bucket[v1] as usize;
3366        sa[pos1] = (p1 - 1) | (((distinct_names[v1] != d) as SaSint) << (SAINT_BIT - 1));
3367        induction_bucket[v1] += 1;
3368        distinct_names[v1] = d;
3369
3370        i += 2;
3371    }
3372
3373    j += 1;
3374    while i < j {
3375        let p = cache[i].index;
3376        d += SaSint::from(p < 0);
3377        let v = cache[i].symbol as usize;
3378        let pos = induction_bucket[v] as usize;
3379        sa[pos] = (p - 1) | (((distinct_names[v] != d) as SaSint) << (SAINT_BIT - 1));
3380        induction_bucket[v] += 1;
3381        distinct_names[v] = d;
3382        i += 1;
3383    }
3384}
3385
3386/// Internal helper: partial sorting scan left to right 8u block (OpenMP variant).
3387#[doc(hidden)]
3388pub fn partial_sorting_scan_left_to_right_8u_block_omp(
3389    t: &[u8],
3390    sa: &mut [SaSint],
3391    k: SaSint,
3392    buckets: &mut [SaSint],
3393    d: SaSint,
3394    block_start: FastSint,
3395    block_size: FastSint,
3396    threads: SaSint,
3397    thread_state: &mut [ThreadState],
3398) -> SaSint {
3399    let mut d = d;
3400    let block_size_usize = usize::try_from(block_size).expect("block_size must be non-negative");
3401    let k_usize = usize::try_from(k).expect("k must be non-negative");
3402    let omp_num_threads = if threads > 1 && block_size_usize >= 64 * k_usize.max(256) {
3403        usize::try_from(threads)
3404            .expect("threads must be non-negative")
3405            .min(thread_state.len())
3406            .max(1)
3407    } else {
3408        1
3409    };
3410    let omp_block_stride = (block_size_usize / omp_num_threads) & !15usize;
3411
3412    if omp_num_threads == 1 {
3413        return partial_sorting_scan_left_to_right_8u(t, sa, buckets, d, block_start, block_size);
3414    }
3415
3416    for omp_thread_num in 0..omp_num_threads {
3417        let mut omp_block_start = omp_thread_num * omp_block_stride;
3418        let omp_block_size = if omp_thread_num + 1 < omp_num_threads {
3419            omp_block_stride
3420        } else {
3421            block_size_usize - omp_block_start
3422        };
3423        omp_block_start += usize::try_from(block_start).expect("block_start must be non-negative");
3424
3425        let state = &mut thread_state[omp_thread_num];
3426        let (position, count) = partial_sorting_scan_left_to_right_8u_block_prepare(
3427            t,
3428            sa,
3429            k,
3430            &mut state.buckets,
3431            &mut state.cache,
3432            FastSint::try_from(omp_block_start).expect("block start must fit FastSint"),
3433            FastSint::try_from(omp_block_size).expect("block size must fit FastSint"),
3434        );
3435        state.position = position;
3436        state.count = count;
3437    }
3438
3439    let induction_offset = 4 * ALPHABET_SIZE;
3440    let distinct_offset = 2 * ALPHABET_SIZE;
3441    let (prefix, induction_tail) = buckets.split_at_mut(induction_offset);
3442    let induction_bucket = &mut induction_tail[..2 * k_usize];
3443    let distinct_names = &mut prefix[distinct_offset..distinct_offset + 2 * k_usize];
3444
3445    for tnum in 0..omp_num_threads {
3446        let state = &mut thread_state[tnum];
3447        let (temp_induction_bucket, temp_tail) = state.buckets.split_at_mut(2 * k_usize);
3448        let temp_distinct_names = &mut temp_tail[..2 * k_usize];
3449
3450        for c in 0..2 * k_usize {
3451            let a = induction_bucket[c];
3452            let b = temp_induction_bucket[c];
3453            induction_bucket[c] = a + b;
3454            temp_induction_bucket[c] = a;
3455        }
3456
3457        d -= 1;
3458        for c in 0..2 * k_usize {
3459            let a = distinct_names[c];
3460            let b = temp_distinct_names[c];
3461            let next_d = b + d;
3462            distinct_names[c] = if b > 0 { next_d } else { a };
3463            temp_distinct_names[c] = a;
3464        }
3465        d += 1 + SaSint::try_from(state.position).expect("position must fit SaSint");
3466        state.position = FastSint::try_from(d).expect("d must fit FastSint") - state.position;
3467    }
3468
3469    for tnum in 0..omp_num_threads {
3470        let state = &mut thread_state[tnum];
3471        partial_sorting_scan_left_to_right_8u_block_place(
3472            sa,
3473            &mut state.buckets,
3474            k,
3475            &state.cache,
3476            state.count,
3477            state.position as SaSint,
3478        );
3479    }
3480
3481    d
3482}
3483
3484/// Internal helper: partial sorting shift markers 8u (OpenMP variant).
3485#[doc(hidden)]
3486pub fn partial_sorting_shift_markers_8u_omp(
3487    sa: &mut [SaSint],
3488    n: SaSint,
3489    buckets: &[SaSint],
3490    threads: SaSint,
3491) {
3492    let temp_bucket = &buckets[4 * ALPHABET_SIZE..];
3493    let thread_count = if threads > 1 && n >= 65536 {
3494        usize::try_from(threads).expect("threads must be positive")
3495    } else {
3496        1
3497    };
3498    let c_step = buckets_index2(1, 0) as isize;
3499    let c_min = buckets_index2(1, 0) as isize;
3500    let c_max = buckets_index2(ALPHABET_SIZE - 1, 0) as isize;
3501    for t in 0..thread_count {
3502        let mut c = c_max - (t as isize * c_step);
3503        while c >= c_min {
3504            let c_usize = c as usize;
3505            let mut i = temp_bucket[c_usize] as isize - 1;
3506            let mut j = buckets[c_usize - buckets_index2(1, 0)] as isize + 3;
3507            let mut s = SAINT_MIN;
3508
3509            while i >= j {
3510                let p0 = sa[i as usize];
3511                let q0 = (p0 & SAINT_MIN) ^ s;
3512                s ^= q0;
3513                sa[i as usize] = p0 ^ q0;
3514
3515                let p1 = sa[(i - 1) as usize];
3516                let q1 = (p1 & SAINT_MIN) ^ s;
3517                s ^= q1;
3518                sa[(i - 1) as usize] = p1 ^ q1;
3519
3520                let p2 = sa[(i - 2) as usize];
3521                let q2 = (p2 & SAINT_MIN) ^ s;
3522                s ^= q2;
3523                sa[(i - 2) as usize] = p2 ^ q2;
3524
3525                let p3 = sa[(i - 3) as usize];
3526                let q3 = (p3 & SAINT_MIN) ^ s;
3527                s ^= q3;
3528                sa[(i - 3) as usize] = p3 ^ q3;
3529
3530                i -= 4;
3531            }
3532
3533            j -= 3;
3534            while i >= j {
3535                let p = sa[i as usize];
3536                let q = (p & SAINT_MIN) ^ s;
3537                s ^= q;
3538                sa[i as usize] = p ^ q;
3539                i -= 1;
3540            }
3541
3542            c -= c_step * thread_count as isize;
3543        }
3544    }
3545}
3546
3547/// Internal helper: partial sorting shift markers 32s 6k (OpenMP variant).
3548#[doc(hidden)]
3549pub fn partial_sorting_shift_markers_32s_6k_omp(
3550    sa: &mut [SaSint],
3551    k: SaSint,
3552    buckets: &[SaSint],
3553    threads: SaSint,
3554) {
3555    let k_usize = usize::try_from(k).expect("k must be non-negative");
3556    let temp_bucket = &buckets[4 * k_usize..];
3557    let thread_count = if threads > 1 && k >= 65536 {
3558        usize::try_from(threads).expect("threads must be positive")
3559    } else {
3560        1
3561    };
3562    for t in 0..thread_count {
3563        let mut c = k_usize as isize - 1 - t as isize;
3564        while c >= 1 {
3565            let c_usize = c as usize;
3566            let mut i = buckets[buckets_index4(c_usize, 0)] as isize - 1;
3567            let mut j = temp_bucket[buckets_index2(c_usize - 1, 0)] as isize + 3;
3568            let mut s = SAINT_MIN;
3569
3570            while i >= j {
3571                let p0 = sa[i as usize];
3572                let q0 = (p0 & SAINT_MIN) ^ s;
3573                s ^= q0;
3574                sa[i as usize] = p0 ^ q0;
3575
3576                let p1 = sa[(i - 1) as usize];
3577                let q1 = (p1 & SAINT_MIN) ^ s;
3578                s ^= q1;
3579                sa[(i - 1) as usize] = p1 ^ q1;
3580
3581                let p2 = sa[(i - 2) as usize];
3582                let q2 = (p2 & SAINT_MIN) ^ s;
3583                s ^= q2;
3584                sa[(i - 2) as usize] = p2 ^ q2;
3585
3586                let p3 = sa[(i - 3) as usize];
3587                let q3 = (p3 & SAINT_MIN) ^ s;
3588                s ^= q3;
3589                sa[(i - 3) as usize] = p3 ^ q3;
3590
3591                i -= 4;
3592            }
3593
3594            j -= 3;
3595            while i >= j {
3596                let p = sa[i as usize];
3597                let q = (p & SAINT_MIN) ^ s;
3598                s ^= q;
3599                sa[i as usize] = p ^ q;
3600                i -= 1;
3601            }
3602
3603            c -= thread_count as isize;
3604        }
3605    }
3606}
3607
3608/// Internal helper: partial sorting shift markers 32s 4k.
3609#[doc(hidden)]
3610pub fn partial_sorting_shift_markers_32s_4k(sa: &mut [SaSint], n: SaSint) {
3611    let mut i = n as isize - 1;
3612    let mut s = SUFFIX_GROUP_MARKER;
3613    while i >= 3 {
3614        let p0 = sa[i as usize];
3615        let q0 =
3616            ((p0 & SUFFIX_GROUP_MARKER) ^ s) & (((p0 > 0) as SaSint) << (SUFFIX_GROUP_BIT - 1));
3617        s ^= q0;
3618        sa[i as usize] = p0 ^ q0;
3619
3620        let p1 = sa[(i - 1) as usize];
3621        let q1 =
3622            ((p1 & SUFFIX_GROUP_MARKER) ^ s) & (((p1 > 0) as SaSint) << (SUFFIX_GROUP_BIT - 1));
3623        s ^= q1;
3624        sa[(i - 1) as usize] = p1 ^ q1;
3625
3626        let p2 = sa[(i - 2) as usize];
3627        let q2 =
3628            ((p2 & SUFFIX_GROUP_MARKER) ^ s) & (((p2 > 0) as SaSint) << (SUFFIX_GROUP_BIT - 1));
3629        s ^= q2;
3630        sa[(i - 2) as usize] = p2 ^ q2;
3631
3632        let p3 = sa[(i - 3) as usize];
3633        let q3 =
3634            ((p3 & SUFFIX_GROUP_MARKER) ^ s) & (((p3 > 0) as SaSint) << (SUFFIX_GROUP_BIT - 1));
3635        s ^= q3;
3636        sa[(i - 3) as usize] = p3 ^ q3;
3637
3638        i -= 4;
3639    }
3640
3641    while i >= 0 {
3642        let p = sa[i as usize];
3643        let q = ((p & SUFFIX_GROUP_MARKER) ^ s) & (((p > 0) as SaSint) << (SUFFIX_GROUP_BIT - 1));
3644        s ^= q;
3645        sa[i as usize] = p ^ q;
3646        i -= 1;
3647    }
3648}
3649
3650/// Internal helper: partial sorting shift buckets 32s 6k.
3651#[doc(hidden)]
3652pub fn partial_sorting_shift_buckets_32s_6k(k: SaSint, buckets: &mut [SaSint]) {
3653    let k_usize = usize::try_from(k).expect("k must be non-negative");
3654    let temp_offset = 4 * k_usize;
3655    for i in 0..k_usize {
3656        let src = buckets_index2(i, 0);
3657        let dst = 2 * src;
3658        buckets[dst] = buckets[temp_offset + src];
3659        buckets[dst + 1] = buckets[temp_offset + src + 1];
3660    }
3661}
3662
3663/// Internal helper: partial sorting scan right to left 8u.
3664#[doc(hidden)]
3665pub fn partial_sorting_scan_right_to_left_8u(
3666    t: &[u8],
3667    sa: &mut [SaSint],
3668    buckets: &mut [SaSint],
3669    mut d: SaSint,
3670    omp_block_start: FastSint,
3671    omp_block_size: FastSint,
3672) -> SaSint {
3673    if omp_block_size <= 0 {
3674        return d;
3675    }
3676
3677    let prefetch_distance = 64usize;
3678    let (induction_bucket, distinct_names_all) = buckets.split_at_mut(2 * ALPHABET_SIZE);
3679    let distinct_names = &mut distinct_names_all[..2 * ALPHABET_SIZE];
3680
3681    let start = usize::try_from(omp_block_start).expect("omp_block_start must be non-negative");
3682    let size = usize::try_from(omp_block_size).expect("omp_block_size must be non-negative");
3683    let mut i = start + size - 1;
3684    let mut j = start + prefetch_distance + 1;
3685
3686    while i >= j {
3687        let mut p0 = sa[i];
3688        d += SaSint::from(p0 < 0);
3689        p0 &= SAINT_MAX;
3690
3691        let p0_usize = p0 as usize;
3692        let v0 = buckets_index2(
3693            t[p0_usize - 1] as usize,
3694            usize::from(t[p0_usize - 2] > t[p0_usize - 1]),
3695        );
3696
3697        induction_bucket[v0] -= 1;
3698        let slot0 = induction_bucket[v0] as usize;
3699        sa[slot0] = (p0 - 1) | (((distinct_names[v0] != d) as SaSint) << (SAINT_BIT - 1));
3700        distinct_names[v0] = d;
3701
3702        let mut p1 = sa[i - 1];
3703        d += SaSint::from(p1 < 0);
3704        p1 &= SAINT_MAX;
3705
3706        let p1_usize = p1 as usize;
3707        let v1 = buckets_index2(
3708            t[p1_usize - 1] as usize,
3709            usize::from(t[p1_usize - 2] > t[p1_usize - 1]),
3710        );
3711
3712        induction_bucket[v1] -= 1;
3713        let slot1 = induction_bucket[v1] as usize;
3714        sa[slot1] = (p1 - 1) | (((distinct_names[v1] != d) as SaSint) << (SAINT_BIT - 1));
3715        distinct_names[v1] = d;
3716
3717        i -= 2;
3718    }
3719
3720    j = if start + prefetch_distance < start + size {
3721        start
3722    } else {
3723        start
3724    };
3725    while i >= j {
3726        let mut p = sa[i];
3727        d += SaSint::from(p < 0);
3728        p &= SAINT_MAX;
3729
3730        let p_usize = p as usize;
3731        let v = buckets_index2(
3732            t[p_usize - 1] as usize,
3733            usize::from(t[p_usize - 2] > t[p_usize - 1]),
3734        );
3735
3736        induction_bucket[v] -= 1;
3737        let slot = induction_bucket[v] as usize;
3738        sa[slot] = (p - 1) | (((distinct_names[v] != d) as SaSint) << (SAINT_BIT - 1));
3739        distinct_names[v] = d;
3740
3741        if i == 0 {
3742            break;
3743        }
3744        i -= 1;
3745    }
3746
3747    d
3748}
3749
3750/// Internal helper: partial gsa scan right to left 8u.
3751#[doc(hidden)]
3752pub fn partial_gsa_scan_right_to_left_8u(
3753    t: &[u8],
3754    sa: &mut [SaSint],
3755    buckets: &mut [SaSint],
3756    mut d: SaSint,
3757    omp_block_start: FastSint,
3758    omp_block_size: FastSint,
3759) -> SaSint {
3760    if omp_block_size <= 0 {
3761        return d;
3762    }
3763
3764    let prefetch_distance = 64usize;
3765    let (induction_bucket, distinct_names_all) = buckets.split_at_mut(2 * ALPHABET_SIZE);
3766    let distinct_names = &mut distinct_names_all[..2 * ALPHABET_SIZE];
3767
3768    let start = usize::try_from(omp_block_start).expect("omp_block_start must be non-negative");
3769    let size = usize::try_from(omp_block_size).expect("omp_block_size must be non-negative");
3770    let mut i = start + size - 1;
3771    let mut j = start + prefetch_distance + 1;
3772
3773    while i >= j {
3774        let mut p0 = sa[i];
3775        d += SaSint::from(p0 < 0);
3776        p0 &= SAINT_MAX;
3777
3778        let p0_usize = p0 as usize;
3779        let v0 = buckets_index2(
3780            t[p0_usize - 1] as usize,
3781            usize::from(t[p0_usize - 2] > t[p0_usize - 1]),
3782        );
3783
3784        if v0 != 1 {
3785            induction_bucket[v0] -= 1;
3786            let slot0 = induction_bucket[v0] as usize;
3787            sa[slot0] = (p0 - 1) | (((distinct_names[v0] != d) as SaSint) << (SAINT_BIT - 1));
3788            distinct_names[v0] = d;
3789        }
3790
3791        let mut p1 = sa[i - 1];
3792        d += SaSint::from(p1 < 0);
3793        p1 &= SAINT_MAX;
3794
3795        let p1_usize = p1 as usize;
3796        let v1 = buckets_index2(
3797            t[p1_usize - 1] as usize,
3798            usize::from(t[p1_usize - 2] > t[p1_usize - 1]),
3799        );
3800
3801        if v1 != 1 {
3802            induction_bucket[v1] -= 1;
3803            let slot1 = induction_bucket[v1] as usize;
3804            sa[slot1] = (p1 - 1) | (((distinct_names[v1] != d) as SaSint) << (SAINT_BIT - 1));
3805            distinct_names[v1] = d;
3806        }
3807
3808        i -= 2;
3809    }
3810
3811    j = start;
3812    while i >= j {
3813        let mut p = sa[i];
3814        d += SaSint::from(p < 0);
3815        p &= SAINT_MAX;
3816
3817        let p_usize = p as usize;
3818        let v = buckets_index2(
3819            t[p_usize - 1] as usize,
3820            usize::from(t[p_usize - 2] > t[p_usize - 1]),
3821        );
3822
3823        if v != 1 {
3824            induction_bucket[v] -= 1;
3825            let slot = induction_bucket[v] as usize;
3826            sa[slot] = (p - 1) | (((distinct_names[v] != d) as SaSint) << (SAINT_BIT - 1));
3827            distinct_names[v] = d;
3828        }
3829
3830        if i == 0 {
3831            break;
3832        }
3833        i -= 1;
3834    }
3835
3836    d
3837}
3838
3839/// Internal helper: partial sorting scan right to left 8u block prepare.
3840#[doc(hidden)]
3841pub fn partial_sorting_scan_right_to_left_8u_block_prepare(
3842    t: &[u8],
3843    sa: &[SaSint],
3844    k: SaSint,
3845    buckets: &mut [SaSint],
3846    cache: &mut [ThreadCache],
3847    omp_block_start: FastSint,
3848    omp_block_size: FastSint,
3849) -> (FastSint, FastSint) {
3850    let k_usize = usize::try_from(k).expect("k must be non-negative");
3851    let (induction_bucket, distinct_names_all) = buckets.split_at_mut(2 * k_usize);
3852    let distinct_names = &mut distinct_names_all[..2 * k_usize];
3853    induction_bucket.fill(0);
3854    distinct_names.fill(0);
3855
3856    let start = usize::try_from(omp_block_start).expect("omp_block_start must be non-negative");
3857    let size = usize::try_from(omp_block_size).expect("omp_block_size must be non-negative");
3858    let mut count = 0usize;
3859    let mut d = 1;
3860
3861    let mut i = start + size;
3862    while i > start {
3863        i -= 1;
3864
3865        let mut p = sa[i];
3866        cache[count].index = p;
3867        d += SaSint::from(p < 0);
3868        p &= SAINT_MAX;
3869
3870        let p_usize = usize::try_from(p).expect("suffix index must be non-negative");
3871        let v = buckets_index2(
3872            t[p_usize - 1] as usize,
3873            usize::from(t[p_usize - 2] > t[p_usize - 1]),
3874        );
3875
3876        cache[count].symbol = v as SaSint;
3877        induction_bucket[v] += 1;
3878        distinct_names[v] = d;
3879        count += 1;
3880    }
3881
3882    ((d - 1) as FastSint, count as FastSint)
3883}
3884
3885/// Internal helper: partial sorting scan right to left 8u block place.
3886#[doc(hidden)]
3887pub fn partial_sorting_scan_right_to_left_8u_block_place(
3888    sa: &mut [SaSint],
3889    buckets: &mut [SaSint],
3890    k: SaSint,
3891    cache: &[ThreadCache],
3892    count: FastSint,
3893    mut d: SaSint,
3894) {
3895    let split = 2 * usize::try_from(k).expect("k must be non-negative");
3896    let (induction_bucket, distinct_names) = buckets.split_at_mut(split);
3897
3898    let count = usize::try_from(count).expect("count must be non-negative");
3899    for entry in &cache[..count] {
3900        let p = entry.index;
3901        d += SaSint::from(p < 0);
3902        let v = usize::try_from(entry.symbol).expect("cache symbol must be non-negative");
3903        induction_bucket[v] -= 1;
3904        let slot = usize::try_from(induction_bucket[v]).expect("bucket slot must be non-negative");
3905        sa[slot] = (p - 1) | (((distinct_names[v] != d) as SaSint) << (SAINT_BIT - 1));
3906        distinct_names[v] = d;
3907    }
3908}
3909
3910/// Internal helper: partial gsa scan right to left 8u block place.
3911#[doc(hidden)]
3912pub fn partial_gsa_scan_right_to_left_8u_block_place(
3913    sa: &mut [SaSint],
3914    buckets: &mut [SaSint],
3915    k: SaSint,
3916    cache: &[ThreadCache],
3917    count: FastSint,
3918    mut d: SaSint,
3919) {
3920    let split = 2 * usize::try_from(k).expect("k must be non-negative");
3921    let (induction_bucket, distinct_names) = buckets.split_at_mut(split);
3922
3923    let count = usize::try_from(count).expect("count must be non-negative");
3924    for entry in &cache[..count] {
3925        let p = entry.index;
3926        d += SaSint::from(p < 0);
3927        let v = usize::try_from(entry.symbol).expect("cache symbol must be non-negative");
3928        if v != 1 {
3929            induction_bucket[v] -= 1;
3930            let slot =
3931                usize::try_from(induction_bucket[v]).expect("bucket slot must be non-negative");
3932            sa[slot] = (p - 1) | (((distinct_names[v] != d) as SaSint) << (SAINT_BIT - 1));
3933            distinct_names[v] = d;
3934        }
3935    }
3936}
3937
3938/// Internal helper: partial sorting scan right to left 8u block (OpenMP variant).
3939#[doc(hidden)]
3940pub fn partial_sorting_scan_right_to_left_8u_block_omp(
3941    t: &[u8],
3942    sa: &mut [SaSint],
3943    k: SaSint,
3944    buckets: &mut [SaSint],
3945    d: SaSint,
3946    block_start: FastSint,
3947    block_size: FastSint,
3948    threads: SaSint,
3949    thread_state: &mut [ThreadState],
3950) -> SaSint {
3951    let mut d = d;
3952    let block_size_usize = usize::try_from(block_size).expect("block_size must be non-negative");
3953    let k_usize = usize::try_from(k).expect("k must be non-negative");
3954    let omp_num_threads = if threads > 1 && block_size_usize >= 64 * k_usize.max(256) {
3955        usize::try_from(threads)
3956            .expect("threads must be non-negative")
3957            .min(thread_state.len())
3958            .max(1)
3959    } else {
3960        1
3961    };
3962    let omp_block_stride = (block_size_usize / omp_num_threads) & !15usize;
3963
3964    if omp_num_threads == 1 {
3965        return partial_sorting_scan_right_to_left_8u(t, sa, buckets, d, block_start, block_size);
3966    }
3967
3968    for omp_thread_num in 0..omp_num_threads {
3969        let mut omp_block_start = omp_thread_num * omp_block_stride;
3970        let omp_block_size = if omp_thread_num + 1 < omp_num_threads {
3971            omp_block_stride
3972        } else {
3973            block_size_usize - omp_block_start
3974        };
3975        omp_block_start += usize::try_from(block_start).expect("block_start must be non-negative");
3976
3977        let state = &mut thread_state[omp_thread_num];
3978        let (position, count) = partial_sorting_scan_right_to_left_8u_block_prepare(
3979            t,
3980            sa,
3981            k,
3982            &mut state.buckets,
3983            &mut state.cache,
3984            FastSint::try_from(omp_block_start).expect("block start must fit FastSint"),
3985            FastSint::try_from(omp_block_size).expect("block size must fit FastSint"),
3986        );
3987        state.position = position;
3988        state.count = count;
3989    }
3990
3991    let distinct_offset = 2 * ALPHABET_SIZE;
3992    let (induction_bucket, distinct_tail) = buckets.split_at_mut(distinct_offset);
3993    let distinct_names = &mut distinct_tail[..2 * k_usize];
3994
3995    for tnum in (0..omp_num_threads).rev() {
3996        let state = &mut thread_state[tnum];
3997        let (temp_induction_bucket, temp_tail) = state.buckets.split_at_mut(2 * k_usize);
3998        let temp_distinct_names = &mut temp_tail[..2 * k_usize];
3999
4000        for c in 0..2 * k_usize {
4001            let a = induction_bucket[c];
4002            let b = temp_induction_bucket[c];
4003            induction_bucket[c] = a - b;
4004            temp_induction_bucket[c] = a;
4005        }
4006
4007        d -= 1;
4008        for c in 0..2 * k_usize {
4009            let a = distinct_names[c];
4010            let b = temp_distinct_names[c];
4011            let next_d = b + d;
4012            distinct_names[c] = if b > 0 { next_d } else { a };
4013            temp_distinct_names[c] = a;
4014        }
4015        d += 1 + SaSint::try_from(state.position).expect("position must fit SaSint");
4016        state.position = FastSint::try_from(d).expect("d must fit FastSint") - state.position;
4017    }
4018
4019    for tnum in 0..omp_num_threads {
4020        let state = &mut thread_state[tnum];
4021        partial_sorting_scan_right_to_left_8u_block_place(
4022            sa,
4023            &mut state.buckets,
4024            k,
4025            &state.cache,
4026            state.count,
4027            state.position as SaSint,
4028        );
4029    }
4030
4031    d
4032}
4033
4034/// Internal helper: partial gsa scan right to left 8u block (OpenMP variant).
4035#[doc(hidden)]
4036pub fn partial_gsa_scan_right_to_left_8u_block_omp(
4037    t: &[u8],
4038    sa: &mut [SaSint],
4039    k: SaSint,
4040    buckets: &mut [SaSint],
4041    d: SaSint,
4042    block_start: FastSint,
4043    block_size: FastSint,
4044    threads: SaSint,
4045    thread_state: &mut [ThreadState],
4046) -> SaSint {
4047    let mut d = d;
4048    let block_size_usize = usize::try_from(block_size).expect("block_size must be non-negative");
4049    let k_usize = usize::try_from(k).expect("k must be non-negative");
4050    let omp_num_threads = if threads > 1 && block_size_usize >= 64 * k_usize.max(256) {
4051        usize::try_from(threads)
4052            .expect("threads must be non-negative")
4053            .min(thread_state.len())
4054            .max(1)
4055    } else {
4056        1
4057    };
4058    let omp_block_stride = (block_size_usize / omp_num_threads) & !15usize;
4059
4060    if omp_num_threads == 1 {
4061        return partial_gsa_scan_right_to_left_8u(t, sa, buckets, d, block_start, block_size);
4062    }
4063
4064    for omp_thread_num in 0..omp_num_threads {
4065        let mut omp_block_start = omp_thread_num * omp_block_stride;
4066        let omp_block_size = if omp_thread_num + 1 < omp_num_threads {
4067            omp_block_stride
4068        } else {
4069            block_size_usize - omp_block_start
4070        };
4071        omp_block_start += usize::try_from(block_start).expect("block_start must be non-negative");
4072
4073        let state = &mut thread_state[omp_thread_num];
4074        let (position, count) = partial_sorting_scan_right_to_left_8u_block_prepare(
4075            t,
4076            sa,
4077            k,
4078            &mut state.buckets,
4079            &mut state.cache,
4080            FastSint::try_from(omp_block_start).expect("block start must fit FastSint"),
4081            FastSint::try_from(omp_block_size).expect("block size must fit FastSint"),
4082        );
4083        state.position = position;
4084        state.count = count;
4085    }
4086
4087    let distinct_offset = 2 * ALPHABET_SIZE;
4088    let (induction_bucket, distinct_tail) = buckets.split_at_mut(distinct_offset);
4089    let distinct_names = &mut distinct_tail[..2 * k_usize];
4090
4091    for tnum in (0..omp_num_threads).rev() {
4092        let state = &mut thread_state[tnum];
4093        let (temp_induction_bucket, temp_tail) = state.buckets.split_at_mut(2 * k_usize);
4094        let temp_distinct_names = &mut temp_tail[..2 * k_usize];
4095
4096        for c in 0..2 * k_usize {
4097            let a = induction_bucket[c];
4098            let b = temp_induction_bucket[c];
4099            induction_bucket[c] = a - b;
4100            temp_induction_bucket[c] = a;
4101        }
4102
4103        d -= 1;
4104        for c in 0..2 * k_usize {
4105            let a = distinct_names[c];
4106            let b = temp_distinct_names[c];
4107            let next_d = b + d;
4108            distinct_names[c] = if b > 0 { next_d } else { a };
4109            temp_distinct_names[c] = a;
4110        }
4111        d += 1 + SaSint::try_from(state.position).expect("position must fit SaSint");
4112        state.position = FastSint::try_from(d).expect("d must fit FastSint") - state.position;
4113    }
4114
4115    for tnum in 0..omp_num_threads {
4116        let state = &mut thread_state[tnum];
4117        partial_gsa_scan_right_to_left_8u_block_place(
4118            sa,
4119            &mut state.buckets,
4120            k,
4121            &state.cache,
4122            state.count,
4123            state.position as SaSint,
4124        );
4125    }
4126
4127    d
4128}
4129
4130/// Internal helper: partial sorting scan right to left 8u (OpenMP variant).
4131#[doc(hidden)]
4132pub fn partial_sorting_scan_right_to_left_8u_omp(
4133    t: &[u8],
4134    sa: &mut [SaSint],
4135    n: SaSint,
4136    k: SaSint,
4137    buckets: &mut [SaSint],
4138    first_lms_suffix: SaSint,
4139    left_suffixes_count: SaSint,
4140    mut d: SaSint,
4141    threads: SaSint,
4142    thread_state: &mut [ThreadState],
4143) {
4144    let scan_start = left_suffixes_count as FastSint + 1;
4145    let scan_end = n as FastSint - first_lms_suffix as FastSint;
4146
4147    if threads == 1 || (scan_end - scan_start) < 65_536 {
4148        let _ = partial_sorting_scan_right_to_left_8u(
4149            t,
4150            sa,
4151            buckets,
4152            d,
4153            scan_start,
4154            scan_end - scan_start,
4155        );
4156        return;
4157    }
4158
4159    let distinct_offset = 2 * ALPHABET_SIZE;
4160
4161    let mut block_start = usize::try_from(scan_end - 1).expect("scan end must be positive");
4162    let scan_start_usize = usize::try_from(scan_start).expect("scan_start must be non-negative");
4163    let threads_usize = usize::try_from(threads)
4164        .expect("threads must be non-negative")
4165        .min(thread_state.len())
4166        .max(1);
4167
4168    while block_start >= scan_start_usize {
4169        if sa[block_start] == 0 {
4170            if block_start == 0 {
4171                break;
4172            }
4173            block_start -= 1;
4174        } else {
4175            let mut block_max_end = block_start.saturating_sub(
4176                threads_usize * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * threads_usize),
4177            );
4178            if block_max_end + 1 < scan_start_usize {
4179                block_max_end = scan_start_usize.saturating_sub(1);
4180            }
4181            let mut block_end = block_start - 1;
4182            while block_end > block_max_end && sa[block_end] != 0 {
4183                block_end -= 1;
4184            }
4185            let block_size = block_start - block_end;
4186
4187            if block_size < 32 {
4188                while block_start > block_end {
4189                    let p = sa[block_start];
4190                    d += SaSint::from(p < 0);
4191                    let p = p & SAINT_MAX;
4192                    let v = buckets_index2(
4193                        t[(p - 1) as usize] as usize,
4194                        usize::from(t[(p - 2) as usize] > t[(p - 1) as usize]),
4195                    );
4196                    buckets[v] -= 1;
4197                    let slot =
4198                        usize::try_from(buckets[v]).expect("bucket slot must be non-negative");
4199                    sa[slot] = (p - 1)
4200                        | (((buckets[distinct_offset + v] != d) as SaSint) << (SAINT_BIT - 1));
4201                    buckets[distinct_offset + v] = d;
4202
4203                    if block_start == 0 {
4204                        break;
4205                    }
4206                    block_start -= 1;
4207                }
4208            } else {
4209                d = partial_sorting_scan_right_to_left_8u_block_omp(
4210                    t,
4211                    sa,
4212                    k,
4213                    buckets,
4214                    d,
4215                    FastSint::try_from(block_end + 1).expect("block start must fit FastSint"),
4216                    FastSint::try_from(block_size).expect("block size must fit FastSint"),
4217                    threads,
4218                    thread_state,
4219                );
4220                block_start = block_end;
4221            }
4222        }
4223    }
4224}
4225
4226/// Internal helper: partial gsa scan right to left 8u (OpenMP variant).
4227#[doc(hidden)]
4228pub fn partial_gsa_scan_right_to_left_8u_omp(
4229    t: &[u8],
4230    sa: &mut [SaSint],
4231    n: SaSint,
4232    k: SaSint,
4233    buckets: &mut [SaSint],
4234    first_lms_suffix: SaSint,
4235    left_suffixes_count: SaSint,
4236    mut d: SaSint,
4237    threads: SaSint,
4238    thread_state: &mut [ThreadState],
4239) {
4240    let scan_start = left_suffixes_count as FastSint + 1;
4241    let scan_end = n as FastSint - first_lms_suffix as FastSint;
4242
4243    if threads == 1 || (scan_end - scan_start) < 65_536 {
4244        let _ =
4245            partial_gsa_scan_right_to_left_8u(t, sa, buckets, d, scan_start, scan_end - scan_start);
4246        return;
4247    }
4248
4249    let distinct_offset = 2 * ALPHABET_SIZE;
4250    let mut block_start = usize::try_from(scan_end - 1).expect("scan end must be positive");
4251    let scan_start_usize = usize::try_from(scan_start).expect("scan_start must be non-negative");
4252    let threads_usize = usize::try_from(threads)
4253        .expect("threads must be non-negative")
4254        .min(thread_state.len())
4255        .max(1);
4256
4257    while block_start >= scan_start_usize {
4258        if sa[block_start] == 0 {
4259            if block_start == 0 {
4260                break;
4261            }
4262            block_start -= 1;
4263        } else {
4264            let mut block_max_end = block_start.saturating_sub(
4265                threads_usize * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * threads_usize),
4266            );
4267            if block_max_end + 1 < scan_start_usize {
4268                block_max_end = scan_start_usize.saturating_sub(1);
4269            }
4270            let mut block_end = block_start - 1;
4271            while block_end > block_max_end && sa[block_end] != 0 {
4272                block_end -= 1;
4273            }
4274            let block_size = block_start - block_end;
4275
4276            if block_size < 32 {
4277                while block_start > block_end {
4278                    let p = sa[block_start];
4279                    d += SaSint::from(p < 0);
4280                    let p = p & SAINT_MAX;
4281                    let v = buckets_index2(
4282                        t[(p - 1) as usize] as usize,
4283                        usize::from(t[(p - 2) as usize] > t[(p - 1) as usize]),
4284                    );
4285                    if v != 1 {
4286                        buckets[v] -= 1;
4287                        let slot =
4288                            usize::try_from(buckets[v]).expect("bucket slot must be non-negative");
4289                        sa[slot] = (p - 1)
4290                            | (((buckets[distinct_offset + v] != d) as SaSint) << (SAINT_BIT - 1));
4291                        buckets[distinct_offset + v] = d;
4292                    }
4293
4294                    if block_start == 0 {
4295                        break;
4296                    }
4297                    block_start -= 1;
4298                }
4299            } else {
4300                d = partial_gsa_scan_right_to_left_8u_block_omp(
4301                    t,
4302                    sa,
4303                    k,
4304                    buckets,
4305                    d,
4306                    FastSint::try_from(block_end + 1).expect("block start must fit FastSint"),
4307                    FastSint::try_from(block_size).expect("block size must fit FastSint"),
4308                    threads,
4309                    thread_state,
4310                );
4311                block_start = block_end;
4312            }
4313        }
4314    }
4315}
4316
4317/// Internal helper: partial sorting scan right to left 32s 6k.
4318#[doc(hidden)]
4319pub fn partial_sorting_scan_right_to_left_32s_6k(
4320    t: &[SaSint],
4321    sa: &mut [SaSint],
4322    buckets: &mut [SaSint],
4323    mut d: SaSint,
4324    omp_block_start: FastSint,
4325    omp_block_size: FastSint,
4326) -> SaSint {
4327    if omp_block_size <= 0 {
4328        return d;
4329    }
4330
4331    let prefetch_distance: FastSint = 64;
4332    let mut i = omp_block_start + omp_block_size - 1;
4333    let mut j = omp_block_start + 2 * prefetch_distance + 1;
4334
4335    while i >= j {
4336        let mut p0 = sa[i as usize];
4337        d += SaSint::from(p0 < 0);
4338        p0 &= SAINT_MAX;
4339        let p0u = p0 as usize;
4340        let v0 = buckets_index4(t[p0u - 1] as usize, usize::from(t[p0u - 2] > t[p0u - 1]));
4341        buckets[v0] -= 1;
4342        let slot0 = buckets[v0] as usize;
4343        sa[slot0] = (p0 - 1) | (((buckets[2 + v0] != d) as SaSint) << (SAINT_BIT - 1));
4344        buckets[2 + v0] = d;
4345
4346        let mut p1 = sa[(i - 1) as usize];
4347        d += SaSint::from(p1 < 0);
4348        p1 &= SAINT_MAX;
4349        let p1u = p1 as usize;
4350        let v1 = buckets_index4(t[p1u - 1] as usize, usize::from(t[p1u - 2] > t[p1u - 1]));
4351        buckets[v1] -= 1;
4352        let slot1 = buckets[v1] as usize;
4353        sa[slot1] = (p1 - 1) | (((buckets[2 + v1] != d) as SaSint) << (SAINT_BIT - 1));
4354        buckets[2 + v1] = d;
4355
4356        i -= 2;
4357    }
4358
4359    j -= 2 * prefetch_distance + 1;
4360    while i >= j {
4361        let mut p = sa[i as usize];
4362        d += SaSint::from(p < 0);
4363        p &= SAINT_MAX;
4364        let pu = p as usize;
4365        let v = buckets_index4(t[pu - 1] as usize, usize::from(t[pu - 2] > t[pu - 1]));
4366
4367        buckets[v] -= 1;
4368        let slot = buckets[v] as usize;
4369        sa[slot] = (p - 1) | (((buckets[2 + v] != d) as SaSint) << (SAINT_BIT - 1));
4370        buckets[2 + v] = d;
4371        i -= 1;
4372    }
4373
4374    d
4375}
4376
4377/// Internal helper: partial sorting scan right to left 32s 4k.
4378#[doc(hidden)]
4379pub fn partial_sorting_scan_right_to_left_32s_4k(
4380    t: &[SaSint],
4381    sa: &mut [SaSint],
4382    k: SaSint,
4383    buckets: &mut [SaSint],
4384    mut d: SaSint,
4385    omp_block_start: FastSint,
4386    omp_block_size: FastSint,
4387) -> SaSint {
4388    if omp_block_size <= 0 {
4389        return d;
4390    }
4391
4392    let k_usize = usize::try_from(k).expect("k must be non-negative");
4393    let prefetch_distance: FastSint = 64;
4394    let induction_offset = 3 * k_usize;
4395
4396    let mut i = omp_block_start + omp_block_size - 1;
4397    let mut j = omp_block_start + 2 * prefetch_distance + 1;
4398
4399    while i >= j {
4400        let i0 = i as usize;
4401        let mut p0 = sa[i0];
4402        if p0 > 0 {
4403            sa[i0] = 0;
4404            d += p0 >> (SUFFIX_GROUP_BIT - 1);
4405            p0 &= !SUFFIX_GROUP_MARKER;
4406
4407            let p0u = p0 as usize;
4408            let c0 = t[p0u - 1];
4409            let f0 = usize::from(t[p0u - 2] > c0);
4410            let v0 = buckets_index2(c0 as usize, f0);
4411            let c0u = c0 as usize;
4412            buckets[induction_offset + c0u] -= 1;
4413            let slot0 = buckets[induction_offset + c0u] as usize;
4414            sa[slot0] = (p0 - 1)
4415                | ((f0 as SaSint) << (SAINT_BIT - 1))
4416                | (((buckets[v0] != d) as SaSint) << (SUFFIX_GROUP_BIT - 1));
4417            buckets[v0] = d;
4418        }
4419
4420        let i1 = (i - 1) as usize;
4421        let mut p1 = sa[i1];
4422        if p1 > 0 {
4423            sa[i1] = 0;
4424            d += p1 >> (SUFFIX_GROUP_BIT - 1);
4425            p1 &= !SUFFIX_GROUP_MARKER;
4426
4427            let p1u = p1 as usize;
4428            let c1 = t[p1u - 1];
4429            let f1 = usize::from(t[p1u - 2] > c1);
4430            let v1 = buckets_index2(c1 as usize, f1);
4431            let c1u = c1 as usize;
4432            buckets[induction_offset + c1u] -= 1;
4433            let slot1 = buckets[induction_offset + c1u] as usize;
4434            sa[slot1] = (p1 - 1)
4435                | ((f1 as SaSint) << (SAINT_BIT - 1))
4436                | (((buckets[v1] != d) as SaSint) << (SUFFIX_GROUP_BIT - 1));
4437            buckets[v1] = d;
4438        }
4439
4440        i -= 2;
4441    }
4442
4443    j -= 2 * prefetch_distance + 1;
4444    while i >= j {
4445        let iu = i as usize;
4446        let mut p = sa[iu];
4447        if p > 0 {
4448            sa[iu] = 0;
4449            d += p >> (SUFFIX_GROUP_BIT - 1);
4450            p &= !SUFFIX_GROUP_MARKER;
4451
4452            let pu = p as usize;
4453            let c = t[pu - 1];
4454            let f = usize::from(t[pu - 2] > c);
4455            let v = buckets_index2(c as usize, f);
4456            let cu = c as usize;
4457            buckets[induction_offset + cu] -= 1;
4458            let slot = buckets[induction_offset + cu] as usize;
4459            sa[slot] = (p - 1)
4460                | ((f as SaSint) << (SAINT_BIT - 1))
4461                | (((buckets[v] != d) as SaSint) << (SUFFIX_GROUP_BIT - 1));
4462            buckets[v] = d;
4463        }
4464        i -= 1;
4465    }
4466
4467    d
4468}
4469
4470/// Internal helper: partial sorting scan right to left 32s 1k.
4471#[doc(hidden)]
4472pub fn partial_sorting_scan_right_to_left_32s_1k(
4473    t: &[SaSint],
4474    sa: &mut [SaSint],
4475    induction_bucket: &mut [SaSint],
4476    omp_block_start: FastSint,
4477    omp_block_size: FastSint,
4478) {
4479    if omp_block_size <= 0 {
4480        return;
4481    }
4482
4483    let prefetch_distance = 64usize;
4484    let start = usize::try_from(omp_block_start).expect("omp_block_start must be non-negative");
4485    let size = usize::try_from(omp_block_size).expect("omp_block_size must be non-negative");
4486    let mut i = (start + size - 1) as isize;
4487    let mut j = (start + 2 * prefetch_distance + 1) as isize;
4488
4489    while i >= j {
4490        let p0 = sa[i as usize];
4491        if p0 > 0 {
4492            sa[i as usize] = 0;
4493            let p0_usize = usize::try_from(p0).expect("suffix index must be non-negative");
4494            let bucket_index0 =
4495                usize::try_from(t[p0_usize - 1]).expect("bucket symbol must be non-negative");
4496            induction_bucket[bucket_index0] -= 1;
4497            let slot0 = usize::try_from(induction_bucket[bucket_index0])
4498                .expect("bucket slot must be non-negative");
4499            sa[slot0] = (p0 - 1)
4500                | ((usize::from(t[p0_usize - 2] > t[p0_usize - 1]) as SaSint) << (SAINT_BIT - 1));
4501        }
4502        let p1 = sa[(i - 1) as usize];
4503        if p1 > 0 {
4504            sa[(i - 1) as usize] = 0;
4505            let p1_usize = usize::try_from(p1).expect("suffix index must be non-negative");
4506            let bucket_index1 =
4507                usize::try_from(t[p1_usize - 1]).expect("bucket symbol must be non-negative");
4508            induction_bucket[bucket_index1] -= 1;
4509            let slot1 = usize::try_from(induction_bucket[bucket_index1])
4510                .expect("bucket slot must be non-negative");
4511            sa[slot1] = (p1 - 1)
4512                | ((usize::from(t[p1_usize - 2] > t[p1_usize - 1]) as SaSint) << (SAINT_BIT - 1));
4513        }
4514
4515        i -= 2;
4516    }
4517
4518    j -= (2 * prefetch_distance + 1) as isize;
4519    while i >= j {
4520        let p = sa[i as usize];
4521        if p > 0 {
4522            sa[i as usize] = 0;
4523            let p_usize = usize::try_from(p).expect("suffix index must be non-negative");
4524            let bucket_index =
4525                usize::try_from(t[p_usize - 1]).expect("bucket symbol must be non-negative");
4526            induction_bucket[bucket_index] -= 1;
4527            let slot = usize::try_from(induction_bucket[bucket_index])
4528                .expect("bucket slot must be non-negative");
4529            sa[slot] = (p - 1)
4530                | ((usize::from(t[p_usize - 2] > t[p_usize - 1]) as SaSint) << (SAINT_BIT - 1));
4531        }
4532        if i == 0 {
4533            break;
4534        }
4535        i -= 1;
4536    }
4537}
4538
4539/// Internal helper: partial sorting scan right to left 32s 6k block gather.
4540#[doc(hidden)]
4541pub fn partial_sorting_scan_right_to_left_32s_6k_block_gather(
4542    t: &[SaSint],
4543    sa: &[SaSint],
4544    cache: &mut [ThreadCache],
4545    omp_block_start: FastSint,
4546    omp_block_size: FastSint,
4547) {
4548    if omp_block_size <= 0 {
4549        return;
4550    }
4551
4552    let start = usize::try_from(omp_block_start).expect("omp_block_start must be non-negative");
4553    let size = usize::try_from(omp_block_size).expect("omp_block_size must be non-negative");
4554    for offset in 0..size {
4555        let i = start + offset;
4556        let mut p = sa[i];
4557        let mut symbol = 0usize;
4558        p &= SAINT_MAX;
4559        if p != 0 {
4560            let p_usize = usize::try_from(p).expect("suffix index must be non-negative");
4561            symbol = buckets_index4(
4562                usize::try_from(t[p_usize - 1]).expect("bucket symbol must be non-negative"),
4563                usize::from(t[p_usize - 2] > t[p_usize - 1]),
4564            );
4565        }
4566        cache[offset].index = sa[i];
4567        cache[offset].symbol = symbol as SaSint;
4568    }
4569}
4570
4571/// Internal helper: partial sorting scan right to left 32s 4k block gather.
4572#[doc(hidden)]
4573pub fn partial_sorting_scan_right_to_left_32s_4k_block_gather(
4574    t: &[SaSint],
4575    sa: &mut [SaSint],
4576    cache: &mut [ThreadCache],
4577    omp_block_start: FastSint,
4578    omp_block_size: FastSint,
4579) {
4580    if omp_block_size <= 0 {
4581        return;
4582    }
4583
4584    let start = usize::try_from(omp_block_start).expect("omp_block_start must be non-negative");
4585    let size = usize::try_from(omp_block_size).expect("omp_block_size must be non-negative");
4586    for offset in 0..size {
4587        let i = start + offset;
4588        let mut symbol = SAINT_MIN;
4589        let mut p = sa[i];
4590        if p > 0 {
4591            sa[i] = 0;
4592            cache[offset].index = p;
4593            p &= !SUFFIX_GROUP_MARKER;
4594            let p_usize = usize::try_from(p).expect("suffix index must be non-negative");
4595            symbol = buckets_index2(
4596                usize::try_from(t[p_usize - 1]).expect("bucket symbol must be non-negative"),
4597                usize::from(t[p_usize - 2] > t[p_usize - 1]),
4598            ) as SaSint;
4599        }
4600        cache[offset].symbol = symbol;
4601    }
4602}
4603
4604/// Internal helper: partial sorting scan right to left 32s 1k block gather.
4605#[doc(hidden)]
4606pub fn partial_sorting_scan_right_to_left_32s_1k_block_gather(
4607    t: &[SaSint],
4608    sa: &mut [SaSint],
4609    cache: &mut [ThreadCache],
4610    omp_block_start: FastSint,
4611    omp_block_size: FastSint,
4612) {
4613    if omp_block_size <= 0 {
4614        return;
4615    }
4616    let start = usize::try_from(omp_block_start).expect("omp_block_start must be non-negative");
4617    let size = usize::try_from(omp_block_size).expect("omp_block_size must be non-negative");
4618    for offset in 0..size {
4619        let i = start + offset;
4620        let mut symbol = SAINT_MIN;
4621        let p = sa[i];
4622        if p > 0 {
4623            sa[i] = 0;
4624            cache[offset].index = (p - 1)
4625                | ((usize::from(t[p as usize - 2] > t[p as usize - 1]) as SaSint)
4626                    << (SAINT_BIT - 1));
4627            symbol = t[p as usize - 1];
4628        }
4629        cache[offset].symbol = symbol;
4630    }
4631}
4632
4633/// Internal helper: partial sorting scan right to left 32s 6k block sort.
4634#[doc(hidden)]
4635pub fn partial_sorting_scan_right_to_left_32s_6k_block_sort(
4636    t: &[SaSint],
4637    buckets: &mut [SaSint],
4638    mut d: SaSint,
4639    cache: &mut [ThreadCache],
4640    omp_block_start: FastSint,
4641    omp_block_size: FastSint,
4642) -> SaSint {
4643    if omp_block_size <= 0 {
4644        return d;
4645    }
4646
4647    let size = usize::try_from(omp_block_size).expect("omp_block_size must be non-negative");
4648    let mut i = size;
4649    while i > 0 {
4650        i -= 1;
4651
4652        let v = usize::try_from(cache[i].symbol).expect("cache symbol must be non-negative");
4653        let p = cache[i].index;
4654        d += SaSint::from(p < 0);
4655        buckets[v] -= 1;
4656        let target = buckets[v];
4657        cache[i].symbol = target;
4658        cache[i].index = (p - 1) | (((buckets[2 + v] != d) as SaSint) << (SAINT_BIT - 1));
4659        buckets[2 + v] = d;
4660
4661        let block_end = omp_block_start as SaSint + omp_block_size as SaSint;
4662        if target >= omp_block_start as SaSint && target < block_end {
4663            let s = usize::try_from(target - omp_block_start as SaSint)
4664                .expect("cache slot must be non-negative");
4665            let q = cache[i].index & SAINT_MAX;
4666            let q_usize = usize::try_from(q).expect("suffix index must be non-negative");
4667            cache[s].index = cache[i].index;
4668            cache[s].symbol = buckets_index4(
4669                usize::try_from(t[q_usize - 1]).expect("bucket symbol must be non-negative"),
4670                usize::from(t[q_usize - 2] > t[q_usize - 1]),
4671            ) as SaSint;
4672        }
4673    }
4674
4675    d
4676}
4677
4678/// Internal helper: partial sorting scan right to left 32s 4k block sort.
4679#[doc(hidden)]
4680pub fn partial_sorting_scan_right_to_left_32s_4k_block_sort(
4681    t: &[SaSint],
4682    k: SaSint,
4683    buckets: &mut [SaSint],
4684    mut d: SaSint,
4685    cache: &mut [ThreadCache],
4686    omp_block_start: FastSint,
4687    omp_block_size: FastSint,
4688) -> SaSint {
4689    if omp_block_size <= 0 {
4690        return d;
4691    }
4692
4693    let k_usize = usize::try_from(k).expect("k must be non-negative");
4694    let (distinct_names, tail) = buckets.split_at_mut(2 * k_usize);
4695    let induction_bucket = &mut tail[k_usize..2 * k_usize];
4696
4697    let size = usize::try_from(omp_block_size).expect("omp_block_size must be non-negative");
4698    let mut i = size;
4699    while i > 0 {
4700        i -= 1;
4701
4702        let v = cache[i].symbol;
4703        if v >= 0 {
4704            let p = cache[i].index;
4705            d += p >> (SUFFIX_GROUP_BIT - 1);
4706            let bucket_index = usize::try_from(v >> 1).expect("bucket symbol must be non-negative");
4707            induction_bucket[bucket_index] -= 1;
4708            let target = induction_bucket[bucket_index];
4709            cache[i].symbol = target;
4710            cache[i].index = (p - 1)
4711                | ((v & 1) << (SAINT_BIT - 1))
4712                | (((distinct_names
4713                    [usize::try_from(v).expect("bucket symbol must be non-negative")]
4714                    != d) as SaSint)
4715                    << (SUFFIX_GROUP_BIT - 1));
4716            distinct_names[usize::try_from(v).expect("bucket symbol must be non-negative")] = d;
4717
4718            let block_end = omp_block_start as SaSint + omp_block_size as SaSint;
4719            if target >= omp_block_start as SaSint && target < block_end {
4720                let ni = usize::try_from(target - omp_block_start as SaSint)
4721                    .expect("cache slot must be non-negative");
4722                let mut np = cache[i].index;
4723                if np > 0 {
4724                    cache[i].index = 0;
4725                    cache[ni].index = np;
4726                    np &= !SUFFIX_GROUP_MARKER;
4727                    let np_usize = usize::try_from(np).expect("suffix index must be non-negative");
4728                    cache[ni].symbol = buckets_index2(
4729                        usize::try_from(t[np_usize - 1])
4730                            .expect("bucket symbol must be non-negative"),
4731                        usize::from(t[np_usize - 2] > t[np_usize - 1]),
4732                    ) as SaSint;
4733                }
4734            }
4735        }
4736    }
4737
4738    d
4739}
4740
4741/// Internal helper: partial sorting scan right to left 32s 1k block sort.
4742#[doc(hidden)]
4743pub fn partial_sorting_scan_right_to_left_32s_1k_block_sort(
4744    t: &[SaSint],
4745    induction_bucket: &mut [SaSint],
4746    cache: &mut [ThreadCache],
4747    omp_block_start: FastSint,
4748    omp_block_size: FastSint,
4749) {
4750    if omp_block_size <= 0 {
4751        return;
4752    }
4753    let size = usize::try_from(omp_block_size).expect("omp_block_size must be non-negative");
4754    let mut offset = size;
4755
4756    while offset > 0 {
4757        offset -= 1;
4758        let v = cache[offset].symbol;
4759        if v >= 0 {
4760            let bucket_index = v as usize;
4761            induction_bucket[bucket_index] -= 1;
4762            let target = induction_bucket[bucket_index];
4763            cache[offset].symbol = target;
4764            let block_end = omp_block_start as SaSint + omp_block_size as SaSint;
4765            if target >= omp_block_start as SaSint && target < block_end {
4766                let ni = usize::try_from(target - omp_block_start as SaSint)
4767                    .expect("cache slot must be non-negative");
4768                let np = cache[offset].index;
4769                if np > 0 {
4770                    cache[offset].index = 0;
4771                    cache[ni].index = (np - 1)
4772                        | ((usize::from(t[np as usize - 2] > t[np as usize - 1]) as SaSint)
4773                            << (SAINT_BIT - 1));
4774                    cache[ni].symbol = t[np as usize - 1];
4775                }
4776            }
4777        }
4778    }
4779}
4780
4781/// Internal helper: partial sorting scan right to left 32s 6k block (OpenMP variant).
4782#[doc(hidden)]
4783pub fn partial_sorting_scan_right_to_left_32s_6k_block_omp(
4784    t: &[SaSint],
4785    sa: &mut [SaSint],
4786    buckets: &mut [SaSint],
4787    mut d: SaSint,
4788    cache: &mut [ThreadCache],
4789    block_start: FastSint,
4790    block_size: FastSint,
4791    threads: SaSint,
4792) -> SaSint {
4793    if block_size <= 0 {
4794        return d;
4795    }
4796    if threads == 1 || block_size < 16_384 {
4797        return partial_sorting_scan_right_to_left_32s_6k(
4798            t,
4799            sa,
4800            buckets,
4801            d,
4802            block_start,
4803            block_size,
4804        );
4805    }
4806
4807    let threads_usize = usize::try_from(threads)
4808        .expect("threads must be non-negative")
4809        .max(1);
4810    let block_size_usize = usize::try_from(block_size).expect("block_size must be non-negative");
4811    let omp_num_threads = threads_usize.min(block_size_usize.max(1));
4812    let omp_block_stride = (block_size_usize / omp_num_threads) & !15usize;
4813
4814    for omp_thread_num in 0..omp_num_threads {
4815        let omp_block_size = if omp_thread_num + 1 < omp_num_threads {
4816            omp_block_stride
4817        } else {
4818            block_size_usize - omp_thread_num * omp_block_stride
4819        };
4820        let omp_block_start = usize::try_from(block_start)
4821            .expect("block_start must be non-negative")
4822            + omp_thread_num * omp_block_stride;
4823        if omp_block_size > 0 {
4824            partial_sorting_scan_right_to_left_32s_6k_block_gather(
4825                t,
4826                sa,
4827                &mut cache[omp_thread_num * omp_block_stride
4828                    ..omp_thread_num * omp_block_stride + omp_block_size],
4829                omp_block_start as FastSint,
4830                omp_block_size as FastSint,
4831            );
4832        }
4833    }
4834
4835    d = partial_sorting_scan_right_to_left_32s_6k_block_sort(
4836        t,
4837        buckets,
4838        d,
4839        &mut cache[..block_size_usize],
4840        block_start,
4841        block_size,
4842    );
4843
4844    for omp_thread_num in 0..omp_num_threads {
4845        let omp_block_size = if omp_thread_num + 1 < omp_num_threads {
4846            omp_block_stride
4847        } else {
4848            block_size_usize - omp_thread_num * omp_block_stride
4849        };
4850        let cache_start = omp_thread_num * omp_block_stride;
4851        if omp_block_size > 0 {
4852            place_cached_suffixes(sa, &cache[cache_start..], 0, omp_block_size as FastSint);
4853        }
4854    }
4855
4856    d
4857}
4858
4859/// Internal helper: partial sorting scan right to left 32s 4k block (OpenMP variant).
4860#[doc(hidden)]
4861pub fn partial_sorting_scan_right_to_left_32s_4k_block_omp(
4862    t: &[SaSint],
4863    sa: &mut [SaSint],
4864    k: SaSint,
4865    buckets: &mut [SaSint],
4866    mut d: SaSint,
4867    cache: &mut [ThreadCache],
4868    block_start: FastSint,
4869    block_size: FastSint,
4870    threads: SaSint,
4871) -> SaSint {
4872    if block_size <= 0 {
4873        return d;
4874    }
4875    if threads == 1 || block_size < 16_384 {
4876        return partial_sorting_scan_right_to_left_32s_4k(
4877            t,
4878            sa,
4879            k,
4880            buckets,
4881            d,
4882            block_start,
4883            block_size,
4884        );
4885    }
4886
4887    let threads_usize = usize::try_from(threads)
4888        .expect("threads must be non-negative")
4889        .max(1);
4890    let block_size_usize = usize::try_from(block_size).expect("block_size must be non-negative");
4891    let omp_num_threads = threads_usize.min(block_size_usize.max(1));
4892    let omp_block_stride = (block_size_usize / omp_num_threads) & !15usize;
4893
4894    for omp_thread_num in 0..omp_num_threads {
4895        let omp_block_size = if omp_thread_num + 1 < omp_num_threads {
4896            omp_block_stride
4897        } else {
4898            block_size_usize - omp_thread_num * omp_block_stride
4899        };
4900        let omp_block_start = usize::try_from(block_start)
4901            .expect("block_start must be non-negative")
4902            + omp_thread_num * omp_block_stride;
4903        if omp_block_size > 0 {
4904            partial_sorting_scan_right_to_left_32s_4k_block_gather(
4905                t,
4906                sa,
4907                &mut cache[omp_thread_num * omp_block_stride
4908                    ..omp_thread_num * omp_block_stride + omp_block_size],
4909                omp_block_start as FastSint,
4910                omp_block_size as FastSint,
4911            );
4912        }
4913    }
4914
4915    d = partial_sorting_scan_right_to_left_32s_4k_block_sort(
4916        t,
4917        k,
4918        buckets,
4919        d,
4920        &mut cache[..block_size_usize],
4921        block_start,
4922        block_size,
4923    );
4924
4925    for omp_thread_num in 0..omp_num_threads {
4926        let omp_block_start = omp_thread_num * omp_block_stride;
4927        let omp_block_size = if omp_thread_num + 1 < omp_num_threads {
4928            omp_block_stride
4929        } else {
4930            block_size_usize - omp_block_start
4931        };
4932        if omp_block_size > 0 {
4933            compact_and_place_cached_suffixes(
4934                sa,
4935                &mut cache[omp_block_start..],
4936                0,
4937                omp_block_size as FastSint,
4938            );
4939        }
4940    }
4941
4942    d
4943}
4944
4945/// Internal helper: partial sorting scan right to left 32s 1k block (OpenMP variant).
4946#[doc(hidden)]
4947pub fn partial_sorting_scan_right_to_left_32s_1k_block_omp(
4948    t: &[SaSint],
4949    sa: &mut [SaSint],
4950    buckets: &mut [SaSint],
4951    cache: &mut [ThreadCache],
4952    block_start: FastSint,
4953    block_size: FastSint,
4954    threads: SaSint,
4955) {
4956    if block_size <= 0 {
4957        return;
4958    }
4959    if threads == 1 || block_size < 16_384 {
4960        partial_sorting_scan_right_to_left_32s_1k(t, sa, buckets, block_start, block_size);
4961        return;
4962    }
4963
4964    let threads_usize = usize::try_from(threads)
4965        .expect("threads must be non-negative")
4966        .max(1);
4967    let block_size_usize = usize::try_from(block_size).expect("block_size must be non-negative");
4968    let block_start_usize = usize::try_from(block_start).expect("block_start must be non-negative");
4969    let omp_num_threads = threads_usize.min(block_size_usize.max(1));
4970    let omp_block_stride = (block_size_usize / omp_num_threads) & !15usize;
4971
4972    for omp_thread_num in 0..omp_num_threads {
4973        let omp_block_size = if omp_thread_num + 1 < omp_num_threads {
4974            omp_block_stride
4975        } else {
4976            block_size_usize - omp_thread_num * omp_block_stride
4977        };
4978        let omp_block_start = block_start_usize + omp_thread_num * omp_block_stride;
4979        if omp_block_size > 0 {
4980            partial_sorting_scan_right_to_left_32s_1k_block_gather(
4981                t,
4982                sa,
4983                &mut cache[omp_thread_num * omp_block_stride
4984                    ..omp_thread_num * omp_block_stride + omp_block_size],
4985                omp_block_start as FastSint,
4986                omp_block_size as FastSint,
4987            );
4988        }
4989    }
4990
4991    let cache = &mut cache[..block_size_usize];
4992    partial_sorting_scan_right_to_left_32s_1k_block_sort(
4993        t,
4994        buckets,
4995        cache,
4996        block_start,
4997        block_size,
4998    );
4999    for omp_thread_num in 0..omp_num_threads {
5000        let omp_block_start = omp_thread_num * omp_block_stride;
5001        let omp_block_size = if omp_thread_num + 1 < omp_num_threads {
5002            omp_block_stride
5003        } else {
5004            block_size_usize - omp_block_start
5005        };
5006        if omp_block_size > 0 {
5007            compact_and_place_cached_suffixes(
5008                sa,
5009                &mut cache[omp_block_start..],
5010                0,
5011                omp_block_size as FastSint,
5012            );
5013        }
5014    }
5015}
5016
5017/// Internal helper: partial sorting scan left to right 32s 6k block gather.
5018#[doc(hidden)]
5019pub fn partial_sorting_scan_left_to_right_32s_6k_block_gather(
5020    t: &[SaSint],
5021    sa: &mut [SaSint],
5022    cache: &mut [ThreadCache],
5023    omp_block_start: FastSint,
5024    omp_block_size: FastSint,
5025) {
5026    if omp_block_size <= 0 {
5027        return;
5028    }
5029
5030    let start = usize::try_from(omp_block_start).expect("omp_block_start must be non-negative");
5031    let size = usize::try_from(omp_block_size).expect("omp_block_size must be non-negative");
5032    for offset in 0..size {
5033        let i = start + offset;
5034        let p = sa[i];
5035        cache[offset].index = p;
5036        let q = p & SAINT_MAX;
5037        cache[offset].symbol = if q != 0 {
5038            buckets_index4(
5039                usize::try_from(t[q as usize - 1]).expect("bucket symbol must be non-negative"),
5040                usize::from(t[q as usize - 2] >= t[q as usize - 1]),
5041            ) as SaSint
5042        } else {
5043            0
5044        };
5045    }
5046}
5047
5048/// Internal helper: partial sorting scan left to right 32s 4k block gather.
5049#[doc(hidden)]
5050pub fn partial_sorting_scan_left_to_right_32s_4k_block_gather(
5051    t: &[SaSint],
5052    sa: &mut [SaSint],
5053    cache: &mut [ThreadCache],
5054    omp_block_start: FastSint,
5055    omp_block_size: FastSint,
5056) {
5057    if omp_block_size <= 0 {
5058        return;
5059    }
5060
5061    let start = usize::try_from(omp_block_start).expect("omp_block_start must be non-negative");
5062    let size = usize::try_from(omp_block_size).expect("omp_block_size must be non-negative");
5063    for offset in 0..size {
5064        let i = start + offset;
5065        let mut symbol = SAINT_MIN;
5066        let mut p = sa[i];
5067        if p > 0 {
5068            cache[offset].index = p;
5069            p &= !SUFFIX_GROUP_MARKER;
5070            let p_usize = usize::try_from(p).expect("suffix index must be non-negative");
5071            symbol = buckets_index2(
5072                usize::try_from(t[p_usize - 1]).expect("bucket symbol must be non-negative"),
5073                usize::from(t[p_usize - 2] < t[p_usize - 1]),
5074            ) as SaSint;
5075            p = 0;
5076        }
5077        cache[offset].symbol = symbol;
5078        sa[i] = p & SAINT_MAX;
5079    }
5080}
5081
5082/// Internal helper: partial sorting scan left to right 32s 1k block gather.
5083#[doc(hidden)]
5084pub fn partial_sorting_scan_left_to_right_32s_1k_block_gather(
5085    t: &[SaSint],
5086    sa: &mut [SaSint],
5087    cache: &mut [ThreadCache],
5088    omp_block_start: FastSint,
5089    omp_block_size: FastSint,
5090) {
5091    if omp_block_size <= 0 {
5092        return;
5093    }
5094    let start = usize::try_from(omp_block_start).expect("omp_block_start must be non-negative");
5095    let size = usize::try_from(omp_block_size).expect("omp_block_size must be non-negative");
5096    for offset in 0..size {
5097        let i = start + offset;
5098        let mut symbol = SAINT_MIN;
5099        let mut p = sa[i];
5100        if p > 0 {
5101            cache[offset].index = (p - 1)
5102                | ((usize::from(t[p as usize - 2] < t[p as usize - 1]) as SaSint)
5103                    << (SAINT_BIT - 1));
5104            symbol = t[p as usize - 1];
5105            p = 0;
5106        }
5107        cache[offset].symbol = symbol;
5108        sa[i] = p & SAINT_MAX;
5109    }
5110}
5111
5112/// Internal helper: partial sorting scan left to right 32s 6k block sort.
5113#[doc(hidden)]
5114pub fn partial_sorting_scan_left_to_right_32s_6k_block_sort(
5115    t: &[SaSint],
5116    buckets: &mut [SaSint],
5117    mut d: SaSint,
5118    cache: &mut [ThreadCache],
5119    omp_block_start: FastSint,
5120    omp_block_size: FastSint,
5121) -> SaSint {
5122    if omp_block_size <= 0 {
5123        return d;
5124    }
5125
5126    let start = usize::try_from(omp_block_start).expect("omp_block_start must be non-negative");
5127    let block_end =
5128        start + usize::try_from(omp_block_size).expect("omp_block_size must be non-negative");
5129
5130    let mut i = start;
5131    let mut j = block_end.saturating_sub(65);
5132    while i < j {
5133        let cache_i0 = i - start;
5134        let cache_i1 = cache_i0 + 1;
5135
5136        let v0 =
5137            usize::try_from(cache[cache_i0].symbol).expect("cache symbol must be non-negative");
5138        let p0 = cache[cache_i0].index;
5139        d += SaSint::from(p0 < 0);
5140        cache[cache_i0].symbol = buckets[v0];
5141        buckets[v0] += 1;
5142        cache[cache_i0].index =
5143            (p0 - 1) | ((SaSint::from(buckets[2 + v0] != d)) << (SAINT_BIT - 1));
5144        buckets[2 + v0] = d;
5145        if cache[cache_i0].symbol >= omp_block_start as SaSint
5146            && cache[cache_i0].symbol < block_end as SaSint
5147        {
5148            let s = usize::try_from(cache[cache_i0].symbol - omp_block_start as SaSint)
5149                .expect("cache slot must be non-negative");
5150            let q = cache[cache_i0].index & SAINT_MAX;
5151            cache[s].index = cache[cache_i0].index;
5152            let q_usize = usize::try_from(q).expect("suffix index must be non-negative");
5153            cache[s].symbol = buckets_index4(
5154                usize::try_from(t[q_usize - 1]).expect("bucket symbol must be non-negative"),
5155                usize::from(t[q_usize - 2] >= t[q_usize - 1]),
5156            ) as SaSint;
5157        }
5158
5159        let v1 =
5160            usize::try_from(cache[cache_i1].symbol).expect("cache symbol must be non-negative");
5161        let p1 = cache[cache_i1].index;
5162        d += SaSint::from(p1 < 0);
5163        cache[cache_i1].symbol = buckets[v1];
5164        buckets[v1] += 1;
5165        cache[cache_i1].index =
5166            (p1 - 1) | ((SaSint::from(buckets[2 + v1] != d)) << (SAINT_BIT - 1));
5167        buckets[2 + v1] = d;
5168        if cache[cache_i1].symbol >= omp_block_start as SaSint
5169            && cache[cache_i1].symbol < block_end as SaSint
5170        {
5171            let s = usize::try_from(cache[cache_i1].symbol - omp_block_start as SaSint)
5172                .expect("cache slot must be non-negative");
5173            let q = cache[cache_i1].index & SAINT_MAX;
5174            cache[s].index = cache[cache_i1].index;
5175            let q_usize = usize::try_from(q).expect("suffix index must be non-negative");
5176            cache[s].symbol = buckets_index4(
5177                usize::try_from(t[q_usize - 1]).expect("bucket symbol must be non-negative"),
5178                usize::from(t[q_usize - 2] >= t[q_usize - 1]),
5179            ) as SaSint;
5180        }
5181
5182        i += 2;
5183    }
5184
5185    j += 65;
5186    while i < j {
5187        let cache_i = i - start;
5188        let v = usize::try_from(cache[cache_i].symbol).expect("cache symbol must be non-negative");
5189        let p = cache[cache_i].index;
5190        d += SaSint::from(p < 0);
5191        cache[cache_i].symbol = buckets[v];
5192        buckets[v] += 1;
5193        cache[cache_i].index = (p - 1) | ((SaSint::from(buckets[2 + v] != d)) << (SAINT_BIT - 1));
5194        buckets[2 + v] = d;
5195        if cache[cache_i].symbol >= omp_block_start as SaSint
5196            && cache[cache_i].symbol < block_end as SaSint
5197        {
5198            let s = usize::try_from(cache[cache_i].symbol - omp_block_start as SaSint)
5199                .expect("cache slot must be non-negative");
5200            let q = cache[cache_i].index & SAINT_MAX;
5201            cache[s].index = cache[cache_i].index;
5202            let q_usize = usize::try_from(q).expect("suffix index must be non-negative");
5203            cache[s].symbol = buckets_index4(
5204                usize::try_from(t[q_usize - 1]).expect("bucket symbol must be non-negative"),
5205                usize::from(t[q_usize - 2] >= t[q_usize - 1]),
5206            ) as SaSint;
5207        }
5208        i += 1;
5209    }
5210
5211    d
5212}
5213
5214/// Internal helper: partial sorting scan left to right 32s 4k block sort.
5215#[doc(hidden)]
5216pub fn partial_sorting_scan_left_to_right_32s_4k_block_sort(
5217    t: &[SaSint],
5218    k: SaSint,
5219    buckets: &mut [SaSint],
5220    mut d: SaSint,
5221    cache: &mut [ThreadCache],
5222    omp_block_start: FastSint,
5223    omp_block_size: FastSint,
5224) -> SaSint {
5225    if omp_block_size <= 0 {
5226        return d;
5227    }
5228
5229    let k_usize = usize::try_from(k).expect("k must be non-negative");
5230    let (distinct_names, tail) = buckets.split_at_mut(2 * k_usize);
5231    let induction_bucket = &mut tail[..k_usize];
5232
5233    let start = usize::try_from(omp_block_start).expect("omp_block_start must be non-negative");
5234    let size = usize::try_from(omp_block_size).expect("omp_block_size must be non-negative");
5235    let block_end = start + size;
5236
5237    for offset in 0..size {
5238        let v = cache[offset].symbol;
5239        if v >= 0 {
5240            let p = cache[offset].index;
5241            d += p >> (SUFFIX_GROUP_BIT - 1);
5242
5243            let bucket_index = usize::try_from(v >> 1).expect("bucket index must be non-negative");
5244            let v_usize = usize::try_from(v).expect("cache symbol must be non-negative");
5245            let target = induction_bucket[bucket_index];
5246            induction_bucket[bucket_index] += 1;
5247
5248            cache[offset].symbol = target;
5249            cache[offset].index = (p - 1)
5250                | ((v & 1) << (SAINT_BIT - 1))
5251                | (((distinct_names[v_usize] != d) as SaSint) << (SUFFIX_GROUP_BIT - 1));
5252            distinct_names[v_usize] = d;
5253
5254            if target >= omp_block_start as SaSint && target < block_end as SaSint {
5255                let ni = usize::try_from(target - omp_block_start as SaSint)
5256                    .expect("cache slot must be non-negative");
5257                let mut np = cache[offset].index;
5258                if np > 0 {
5259                    cache[ni].index = np;
5260                    np &= !SUFFIX_GROUP_MARKER;
5261                    let np_usize = usize::try_from(np).expect("suffix index must be non-negative");
5262                    cache[ni].symbol = buckets_index2(
5263                        usize::try_from(t[np_usize - 1])
5264                            .expect("bucket symbol must be non-negative"),
5265                        usize::from(t[np_usize - 2] < t[np_usize - 1]),
5266                    ) as SaSint;
5267                    np = 0;
5268                }
5269                cache[offset].index = np & SAINT_MAX;
5270            }
5271        }
5272    }
5273
5274    d
5275}
5276
5277/// Internal helper: partial sorting scan left to right 32s 1k block sort.
5278#[doc(hidden)]
5279pub fn partial_sorting_scan_left_to_right_32s_1k_block_sort(
5280    t: &[SaSint],
5281    induction_bucket: &mut [SaSint],
5282    cache: &mut [ThreadCache],
5283    omp_block_start: FastSint,
5284    omp_block_size: FastSint,
5285) {
5286    if omp_block_size <= 0 {
5287        return;
5288    }
5289    let start = usize::try_from(omp_block_start).expect("omp_block_start must be non-negative");
5290    let size = usize::try_from(omp_block_size).expect("omp_block_size must be non-negative");
5291    let block_end = start + size;
5292
5293    for offset in 0..size {
5294        let v = cache[offset].symbol;
5295        if v >= 0 {
5296            let v_usize = v as usize;
5297            let target = induction_bucket[v_usize];
5298            cache[offset].symbol = target;
5299            induction_bucket[v_usize] += 1;
5300            if target >= omp_block_start as SaSint && target < block_end as SaSint {
5301                let ni = usize::try_from(target - omp_block_start as SaSint)
5302                    .expect("cache slot must be non-negative");
5303                let mut np = cache[offset].index;
5304                if np > 0 {
5305                    cache[ni].index = (np - 1)
5306                        | ((usize::from(t[np as usize - 2] < t[np as usize - 1]) as SaSint)
5307                            << (SAINT_BIT - 1));
5308                    cache[ni].symbol = t[np as usize - 1];
5309                    np = 0;
5310                }
5311                cache[offset].index = np & SAINT_MAX;
5312            }
5313        }
5314    }
5315}
5316
5317/// Internal helper: partial sorting scan left to right 32s 6k block (OpenMP variant).
5318#[doc(hidden)]
5319pub fn partial_sorting_scan_left_to_right_32s_6k_block_omp(
5320    t: &[SaSint],
5321    sa: &mut [SaSint],
5322    buckets: &mut [SaSint],
5323    d: SaSint,
5324    cache: &mut [ThreadCache],
5325    block_start: FastSint,
5326    block_size: FastSint,
5327    threads: SaSint,
5328) -> SaSint {
5329    if block_size <= 0 {
5330        return d;
5331    }
5332    if threads == 1 || block_size < 16_384 {
5333        return partial_sorting_scan_left_to_right_32s_6k(
5334            t,
5335            sa,
5336            buckets,
5337            d,
5338            block_start,
5339            block_size,
5340        );
5341    }
5342
5343    let threads_usize = usize::try_from(threads)
5344        .expect("threads must be non-negative")
5345        .max(1);
5346    let block_size_usize = usize::try_from(block_size).expect("block_size must be non-negative");
5347    let block_start_usize = usize::try_from(block_start).expect("block_start must be non-negative");
5348    let omp_num_threads = threads_usize.min(block_size_usize.max(1));
5349    let omp_block_stride = (block_size_usize / omp_num_threads) & !15usize;
5350
5351    for omp_thread_num in 0..omp_num_threads {
5352        let omp_block_size = if omp_thread_num + 1 < omp_num_threads {
5353            omp_block_stride
5354        } else {
5355            block_size_usize - omp_thread_num * omp_block_stride
5356        };
5357        let omp_block_start = block_start_usize + omp_thread_num * omp_block_stride;
5358        if omp_block_size > 0 {
5359            partial_sorting_scan_left_to_right_32s_6k_block_gather(
5360                t,
5361                sa,
5362                &mut cache[omp_thread_num * omp_block_stride
5363                    ..omp_thread_num * omp_block_stride + omp_block_size],
5364                omp_block_start as FastSint,
5365                omp_block_size as FastSint,
5366            );
5367        }
5368    }
5369
5370    let d = partial_sorting_scan_left_to_right_32s_6k_block_sort(
5371        t,
5372        buckets,
5373        d,
5374        &mut cache[..block_size_usize],
5375        block_start,
5376        block_size,
5377    );
5378
5379    for omp_thread_num in 0..omp_num_threads {
5380        let omp_block_size = if omp_thread_num + 1 < omp_num_threads {
5381            omp_block_stride
5382        } else {
5383            block_size_usize - omp_thread_num * omp_block_stride
5384        };
5385        if omp_block_size > 0 {
5386            place_cached_suffixes(
5387                sa,
5388                &cache[omp_thread_num * omp_block_stride..],
5389                0,
5390                omp_block_size as FastSint,
5391            );
5392        }
5393    }
5394    d
5395}
5396
5397/// Internal helper: partial sorting scan left to right 32s 4k block (OpenMP variant).
5398#[doc(hidden)]
5399pub fn partial_sorting_scan_left_to_right_32s_4k_block_omp(
5400    t: &[SaSint],
5401    sa: &mut [SaSint],
5402    k: SaSint,
5403    buckets: &mut [SaSint],
5404    d: SaSint,
5405    cache: &mut [ThreadCache],
5406    block_start: FastSint,
5407    block_size: FastSint,
5408    threads: SaSint,
5409) -> SaSint {
5410    if block_size <= 0 {
5411        return d;
5412    }
5413    if threads == 1 || block_size < 16_384 {
5414        return partial_sorting_scan_left_to_right_32s_4k(
5415            t,
5416            sa,
5417            k,
5418            buckets,
5419            d,
5420            block_start,
5421            block_size,
5422        );
5423    }
5424
5425    let threads_usize = usize::try_from(threads)
5426        .expect("threads must be non-negative")
5427        .max(1);
5428    let block_size_usize = usize::try_from(block_size).expect("block_size must be non-negative");
5429    let block_start_usize = usize::try_from(block_start).expect("block_start must be non-negative");
5430    let omp_num_threads = threads_usize.min(block_size_usize.max(1));
5431    let omp_block_stride = (block_size_usize / omp_num_threads) & !15usize;
5432
5433    for omp_thread_num in 0..omp_num_threads {
5434        let omp_block_size = if omp_thread_num + 1 < omp_num_threads {
5435            omp_block_stride
5436        } else {
5437            block_size_usize - omp_thread_num * omp_block_stride
5438        };
5439        let omp_block_start = block_start_usize + omp_thread_num * omp_block_stride;
5440        if omp_block_size > 0 {
5441            partial_sorting_scan_left_to_right_32s_4k_block_gather(
5442                t,
5443                sa,
5444                &mut cache[omp_thread_num * omp_block_stride
5445                    ..omp_thread_num * omp_block_stride + omp_block_size],
5446                omp_block_start as FastSint,
5447                omp_block_size as FastSint,
5448            );
5449        }
5450    }
5451
5452    let cache = &mut cache[..block_size_usize];
5453    let d = partial_sorting_scan_left_to_right_32s_4k_block_sort(
5454        t,
5455        k,
5456        buckets,
5457        d,
5458        cache,
5459        block_start,
5460        block_size,
5461    );
5462
5463    for omp_thread_num in 0..omp_num_threads {
5464        let omp_block_start = omp_thread_num * omp_block_stride;
5465        let omp_block_size = if omp_thread_num + 1 < omp_num_threads {
5466            omp_block_stride
5467        } else {
5468            block_size_usize - omp_block_start
5469        };
5470        if omp_block_size > 0 {
5471            compact_and_place_cached_suffixes(
5472                sa,
5473                &mut cache[omp_block_start..],
5474                0,
5475                omp_block_size as FastSint,
5476            );
5477        }
5478    }
5479
5480    d
5481}
5482
5483/// Internal helper: partial sorting scan left to right 32s 1k block (OpenMP variant).
5484#[doc(hidden)]
5485pub fn partial_sorting_scan_left_to_right_32s_1k_block_omp(
5486    t: &[SaSint],
5487    sa: &mut [SaSint],
5488    buckets: &mut [SaSint],
5489    cache: &mut [ThreadCache],
5490    block_start: FastSint,
5491    block_size: FastSint,
5492    threads: SaSint,
5493) {
5494    if block_size <= 0 {
5495        return;
5496    }
5497    if threads == 1 || block_size < 16_384 {
5498        partial_sorting_scan_left_to_right_32s_1k(t, sa, buckets, block_start, block_size);
5499        return;
5500    }
5501
5502    let threads_usize = usize::try_from(threads)
5503        .expect("threads must be non-negative")
5504        .max(1);
5505    let block_size_usize = usize::try_from(block_size).expect("block_size must be non-negative");
5506    let block_start_usize = usize::try_from(block_start).expect("block_start must be non-negative");
5507    let omp_num_threads = threads_usize.min(block_size_usize.max(1));
5508    let omp_block_stride = (block_size_usize / omp_num_threads) & !15usize;
5509
5510    for omp_thread_num in 0..omp_num_threads {
5511        let omp_block_size = if omp_thread_num + 1 < omp_num_threads {
5512            omp_block_stride
5513        } else {
5514            block_size_usize - omp_thread_num * omp_block_stride
5515        };
5516        let omp_block_start = block_start_usize + omp_thread_num * omp_block_stride;
5517        if omp_block_size > 0 {
5518            partial_sorting_scan_left_to_right_32s_1k_block_gather(
5519                t,
5520                sa,
5521                &mut cache[omp_thread_num * omp_block_stride
5522                    ..omp_thread_num * omp_block_stride + omp_block_size],
5523                omp_block_start as FastSint,
5524                omp_block_size as FastSint,
5525            );
5526        }
5527    }
5528
5529    let cache = &mut cache[..block_size_usize];
5530    partial_sorting_scan_left_to_right_32s_1k_block_sort(
5531        t,
5532        buckets,
5533        cache,
5534        block_start,
5535        block_size,
5536    );
5537    for omp_thread_num in 0..omp_num_threads {
5538        let omp_block_start = omp_thread_num * omp_block_stride;
5539        let omp_block_size = if omp_thread_num + 1 < omp_num_threads {
5540            omp_block_stride
5541        } else {
5542            block_size_usize - omp_block_start
5543        };
5544        if omp_block_size > 0 {
5545            compact_and_place_cached_suffixes(
5546                sa,
5547                &mut cache[omp_block_start..],
5548                0,
5549                omp_block_size as FastSint,
5550            );
5551        }
5552    }
5553}
5554
5555/// Internal helper: partial sorting scan right to left 32s 6k (OpenMP variant).
5556#[doc(hidden)]
5557pub fn partial_sorting_scan_right_to_left_32s_6k_omp(
5558    t: &[SaSint],
5559    sa: &mut [SaSint],
5560    n: SaSint,
5561    buckets: &mut [SaSint],
5562    first_lms_suffix: SaSint,
5563    left_suffixes_count: SaSint,
5564    mut d: SaSint,
5565    threads: SaSint,
5566    thread_state: &mut [ThreadState],
5567) -> SaSint {
5568    let scan_start = left_suffixes_count as FastSint + 1;
5569    let scan_end = n as FastSint - first_lms_suffix as FastSint;
5570    if threads == 1 || (scan_end - scan_start) < 65_536 {
5571        return partial_sorting_scan_right_to_left_32s_6k(
5572            t,
5573            sa,
5574            buckets,
5575            d,
5576            scan_start,
5577            scan_end - scan_start,
5578        );
5579    }
5580    if thread_state.is_empty() {
5581        return partial_sorting_scan_right_to_left_32s_6k(
5582            t,
5583            sa,
5584            buckets,
5585            d,
5586            scan_start,
5587            scan_end - scan_start,
5588        );
5589    }
5590
5591    let threads_usize = usize::try_from(threads)
5592        .expect("threads must be non-negative")
5593        .max(1);
5594    let mut cache = vec![ThreadCache::default(); threads_usize * LIBSAIS_PER_THREAD_CACHE_SIZE];
5595    let mut block_start = scan_end - 1;
5596    let block_span = FastSint::try_from(threads_usize * LIBSAIS_PER_THREAD_CACHE_SIZE)
5597        .expect("block span must fit FastSint");
5598    while block_start >= scan_start {
5599        let mut block_end = block_start - block_span;
5600        if block_end < scan_start {
5601            block_end = scan_start - 1;
5602        }
5603
5604        d = partial_sorting_scan_right_to_left_32s_6k_block_omp(
5605            t,
5606            sa,
5607            buckets,
5608            d,
5609            &mut cache,
5610            block_end + 1,
5611            block_start - block_end,
5612            threads,
5613        );
5614
5615        if block_end < scan_start {
5616            break;
5617        }
5618        block_start = block_end;
5619    }
5620
5621    d
5622}
5623
5624/// Internal helper: partial sorting scan right to left 32s 4k (OpenMP variant).
5625#[doc(hidden)]
5626pub fn partial_sorting_scan_right_to_left_32s_4k_omp(
5627    t: &[SaSint],
5628    sa: &mut [SaSint],
5629    n: SaSint,
5630    k: SaSint,
5631    buckets: &mut [SaSint],
5632    mut d: SaSint,
5633    threads: SaSint,
5634    thread_state: &mut [ThreadState],
5635) -> SaSint {
5636    if threads == 1 || n < 65_536 {
5637        return partial_sorting_scan_right_to_left_32s_4k(t, sa, k, buckets, d, 0, n as FastSint);
5638    }
5639    if thread_state.is_empty() {
5640        return partial_sorting_scan_right_to_left_32s_4k(t, sa, k, buckets, d, 0, n as FastSint);
5641    }
5642    let threads_usize = usize::try_from(threads)
5643        .expect("threads must be non-negative")
5644        .max(1);
5645    let mut cache = vec![ThreadCache::default(); threads_usize * LIBSAIS_PER_THREAD_CACHE_SIZE];
5646    let mut block_start = FastSint::try_from(n).expect("n must fit FastSint") - 1;
5647    let block_span = FastSint::try_from(threads_usize * LIBSAIS_PER_THREAD_CACHE_SIZE)
5648        .expect("block span must fit FastSint");
5649    while block_start >= 0 {
5650        let mut block_end = block_start - block_span;
5651        if block_end < 0 {
5652            block_end = -1;
5653        }
5654
5655        d = partial_sorting_scan_right_to_left_32s_4k_block_omp(
5656            t,
5657            sa,
5658            k,
5659            buckets,
5660            d,
5661            &mut cache,
5662            block_end + 1,
5663            block_start - block_end,
5664            threads,
5665        );
5666
5667        if block_end < 0 {
5668            break;
5669        }
5670        block_start = block_end;
5671    }
5672
5673    d
5674}
5675
5676/// Internal helper: partial sorting scan right to left 32s 1k (OpenMP variant).
5677#[doc(hidden)]
5678pub fn partial_sorting_scan_right_to_left_32s_1k_omp(
5679    t: &[SaSint],
5680    sa: &mut [SaSint],
5681    n: SaSint,
5682    buckets: &mut [SaSint],
5683    threads: SaSint,
5684    thread_state: &mut [ThreadState],
5685) {
5686    if threads == 1 || n < 65_536 {
5687        partial_sorting_scan_right_to_left_32s_1k(t, sa, buckets, 0, n as FastSint);
5688        return;
5689    }
5690    if thread_state.is_empty() {
5691        partial_sorting_scan_right_to_left_32s_1k(t, sa, buckets, 0, n as FastSint);
5692        return;
5693    }
5694
5695    let threads_usize = usize::try_from(threads)
5696        .expect("threads must be non-negative")
5697        .max(1);
5698    let mut cache = vec![ThreadCache::default(); threads_usize * LIBSAIS_PER_THREAD_CACHE_SIZE];
5699    let mut block_start = FastSint::try_from(n).expect("n must fit FastSint") - 1;
5700    let block_span = FastSint::try_from(threads_usize * LIBSAIS_PER_THREAD_CACHE_SIZE)
5701        .expect("block span must fit FastSint");
5702    while block_start >= 0 {
5703        let mut block_end = block_start - block_span;
5704        if block_end < 0 {
5705            block_end = -1;
5706        }
5707
5708        partial_sorting_scan_right_to_left_32s_1k_block_omp(
5709            t,
5710            sa,
5711            buckets,
5712            &mut cache,
5713            block_end + 1,
5714            block_start - block_end,
5715            threads,
5716        );
5717
5718        if block_end < 0 {
5719            break;
5720        }
5721        block_start = block_end;
5722    }
5723}
5724
5725/// Internal helper: partial sorting gather lms suffixes 32s 4k.
5726#[doc(hidden)]
5727pub fn partial_sorting_gather_lms_suffixes_32s_4k(
5728    sa: &mut [SaSint],
5729    omp_block_start: FastSint,
5730    omp_block_size: FastSint,
5731) -> FastSint {
5732    if omp_block_size <= 0 {
5733        return omp_block_start;
5734    }
5735
5736    let start = usize::try_from(omp_block_start).expect("omp_block_start must be non-negative");
5737    let size = usize::try_from(omp_block_size).expect("omp_block_size must be non-negative");
5738    let mut l = start;
5739
5740    for i in start..start + size {
5741        let s = sa[i] as SaUint;
5742        sa[l] = ((s.wrapping_sub(SUFFIX_GROUP_MARKER as SaUint)) & !(SUFFIX_GROUP_MARKER as SaUint))
5743            as SaSint;
5744        l += usize::from((s as SaSint) < 0);
5745    }
5746
5747    l as FastSint
5748}
5749
5750/// Internal helper: partial sorting gather lms suffixes 32s 1k.
5751#[doc(hidden)]
5752pub fn partial_sorting_gather_lms_suffixes_32s_1k(
5753    sa: &mut [SaSint],
5754    omp_block_start: FastSint,
5755    omp_block_size: FastSint,
5756) -> FastSint {
5757    if omp_block_size <= 0 {
5758        return omp_block_start;
5759    }
5760
5761    let start = usize::try_from(omp_block_start).expect("omp_block_start must be non-negative");
5762    let size = usize::try_from(omp_block_size).expect("omp_block_size must be non-negative");
5763    let mut l = start;
5764
5765    for i in start..start + size {
5766        let s = sa[i];
5767        sa[l] = s & SAINT_MAX;
5768        l += usize::from(s < 0);
5769    }
5770
5771    l as FastSint
5772}
5773
5774/// Internal helper: partial sorting gather lms suffixes 32s 4k (OpenMP variant).
5775#[doc(hidden)]
5776pub fn partial_sorting_gather_lms_suffixes_32s_4k_omp(
5777    sa: &mut [SaSint],
5778    n: SaSint,
5779    threads: SaSint,
5780    thread_state: &mut [ThreadState],
5781) {
5782    let n_usize = usize::try_from(n).expect("n must be non-negative");
5783    let omp_num_threads = if threads > 1 && n >= 65_536 {
5784        usize::try_from(threads)
5785            .expect("threads must be non-negative")
5786            .min(thread_state.len())
5787            .max(1)
5788    } else {
5789        1
5790    };
5791
5792    if omp_num_threads == 1 {
5793        let _ = partial_sorting_gather_lms_suffixes_32s_4k(sa, 0, n as FastSint);
5794        return;
5795    }
5796
5797    let omp_block_stride = (n_usize / omp_num_threads) & !15usize;
5798    for (thread_num, state) in thread_state.iter_mut().take(omp_num_threads).enumerate() {
5799        let block_start = thread_num * omp_block_stride;
5800        let block_size = if thread_num + 1 < omp_num_threads {
5801            omp_block_stride
5802        } else {
5803            n_usize - block_start
5804        };
5805        state.position = block_start as FastSint;
5806        state.count = partial_sorting_gather_lms_suffixes_32s_4k(
5807            sa,
5808            block_start as FastSint,
5809            block_size as FastSint,
5810        ) - block_start as FastSint;
5811    }
5812
5813    let mut position = 0usize;
5814    for (thread_num, state) in thread_state.iter().take(omp_num_threads).enumerate() {
5815        let count = usize::try_from(state.count).expect("count must be non-negative");
5816        let src = usize::try_from(state.position).expect("position must be non-negative");
5817        if thread_num > 0 && count > 0 {
5818            sa.copy_within(src..src + count, position);
5819        }
5820        position += count;
5821    }
5822}
5823
5824/// Internal helper: partial sorting gather lms suffixes 32s 1k (OpenMP variant).
5825#[doc(hidden)]
5826pub fn partial_sorting_gather_lms_suffixes_32s_1k_omp(
5827    sa: &mut [SaSint],
5828    n: SaSint,
5829    threads: SaSint,
5830    thread_state: &mut [ThreadState],
5831) {
5832    let n_usize = usize::try_from(n).expect("n must be non-negative");
5833    let omp_num_threads = if threads > 1 && n >= 65_536 {
5834        usize::try_from(threads)
5835            .expect("threads must be non-negative")
5836            .min(thread_state.len())
5837            .max(1)
5838    } else {
5839        1
5840    };
5841
5842    if omp_num_threads == 1 {
5843        let _ = partial_sorting_gather_lms_suffixes_32s_1k(sa, 0, n as FastSint);
5844        return;
5845    }
5846
5847    let omp_block_stride = (n_usize / omp_num_threads) & !15usize;
5848    for (thread_num, state) in thread_state.iter_mut().take(omp_num_threads).enumerate() {
5849        let block_start = thread_num * omp_block_stride;
5850        let block_size = if thread_num + 1 < omp_num_threads {
5851            omp_block_stride
5852        } else {
5853            n_usize - block_start
5854        };
5855        state.position = block_start as FastSint;
5856        state.count = partial_sorting_gather_lms_suffixes_32s_1k(
5857            sa,
5858            block_start as FastSint,
5859            block_size as FastSint,
5860        ) - block_start as FastSint;
5861    }
5862
5863    let mut position = 0usize;
5864    for (thread_num, state) in thread_state.iter().take(omp_num_threads).enumerate() {
5865        let count = usize::try_from(state.count).expect("count must be non-negative");
5866        let src = usize::try_from(state.position).expect("position must be non-negative");
5867        if thread_num > 0 && count > 0 {
5868            sa.copy_within(src..src + count, position);
5869        }
5870        position += count;
5871    }
5872}
5873
5874/// Internal helper: induce partial order 8u (OpenMP variant).
5875#[doc(hidden)]
5876pub fn induce_partial_order_8u_omp(
5877    t: &[u8],
5878    sa: &mut [SaSint],
5879    n: SaSint,
5880    k: SaSint,
5881    flags: SaSint,
5882    buckets: &mut [SaSint],
5883    first_lms_suffix: SaSint,
5884    left_suffixes_count: SaSint,
5885    threads: SaSint,
5886    thread_state: &mut [ThreadState],
5887) {
5888    buckets[2 * ALPHABET_SIZE..4 * ALPHABET_SIZE].fill(0);
5889
5890    if (flags & LIBSAIS_FLAGS_GSA) != 0 {
5891        let left = 4 * ALPHABET_SIZE + buckets_index2(0, 1);
5892        let right = 4 * ALPHABET_SIZE + buckets_index2(1, 1);
5893        buckets[left] = buckets[right] - 1;
5894        flip_suffix_markers_omp(sa, buckets[left], threads);
5895    }
5896
5897    let d = partial_sorting_scan_left_to_right_8u_omp(
5898        t,
5899        sa,
5900        n,
5901        k,
5902        buckets,
5903        left_suffixes_count,
5904        0,
5905        threads,
5906        thread_state,
5907    );
5908    partial_sorting_shift_markers_8u_omp(sa, n, buckets, threads);
5909
5910    if (flags & LIBSAIS_FLAGS_GSA) != 0 {
5911        partial_gsa_scan_right_to_left_8u_omp(
5912            t,
5913            sa,
5914            n,
5915            k,
5916            buckets,
5917            first_lms_suffix,
5918            left_suffixes_count,
5919            d,
5920            threads,
5921            thread_state,
5922        );
5923
5924        if t[usize::try_from(first_lms_suffix).expect("first_lms_suffix must be non-negative")] == 0
5925        {
5926            let count = usize::try_from(buckets[buckets_index2(1, 1)] - 1)
5927                .expect("count must be non-negative");
5928            sa.copy_within(0..count, 1);
5929            sa[0] = first_lms_suffix | SAINT_MIN;
5930        }
5931
5932        buckets[buckets_index2(0, 1)] = 0;
5933    } else {
5934        partial_sorting_scan_right_to_left_8u_omp(
5935            t,
5936            sa,
5937            n,
5938            k,
5939            buckets,
5940            first_lms_suffix,
5941            left_suffixes_count,
5942            d,
5943            threads,
5944            thread_state,
5945        );
5946    }
5947}
5948
5949/// Internal helper: induce partial order 32s 6k (OpenMP variant).
5950#[doc(hidden)]
5951pub fn induce_partial_order_32s_6k_omp(
5952    t: &[SaSint],
5953    sa: &mut [SaSint],
5954    n: SaSint,
5955    k: SaSint,
5956    buckets: &mut [SaSint],
5957    first_lms_suffix: SaSint,
5958    left_suffixes_count: SaSint,
5959    threads: SaSint,
5960    thread_state: &mut [ThreadState],
5961) {
5962    let d = partial_sorting_scan_left_to_right_32s_6k_omp(
5963        t,
5964        sa,
5965        n,
5966        buckets,
5967        left_suffixes_count,
5968        0,
5969        threads,
5970        thread_state,
5971    );
5972    partial_sorting_shift_markers_32s_6k_omp(sa, k, buckets, threads);
5973    partial_sorting_shift_buckets_32s_6k(k, buckets);
5974    let _ = partial_sorting_scan_right_to_left_32s_6k_omp(
5975        t,
5976        sa,
5977        n,
5978        buckets,
5979        first_lms_suffix,
5980        left_suffixes_count,
5981        d,
5982        threads,
5983        thread_state,
5984    );
5985}
5986
5987/// Internal helper: induce partial order 32s 4k (OpenMP variant).
5988#[doc(hidden)]
5989pub fn induce_partial_order_32s_4k_omp(
5990    t: &[SaSint],
5991    sa: &mut [SaSint],
5992    n: SaSint,
5993    k: SaSint,
5994    buckets: &mut [SaSint],
5995    threads: SaSint,
5996    thread_state: &mut [ThreadState],
5997) {
5998    let zero_len = 2 * usize::try_from(k).expect("k must be non-negative");
5999    buckets[..zero_len].fill(0);
6000
6001    let d = partial_sorting_scan_left_to_right_32s_4k_omp(
6002        t,
6003        sa,
6004        n,
6005        k,
6006        buckets,
6007        0,
6008        threads,
6009        thread_state,
6010    );
6011    partial_sorting_shift_markers_32s_4k(sa, n);
6012    let _ = partial_sorting_scan_right_to_left_32s_4k_omp(
6013        t,
6014        sa,
6015        n,
6016        k,
6017        buckets,
6018        d,
6019        threads,
6020        thread_state,
6021    );
6022    partial_sorting_gather_lms_suffixes_32s_4k_omp(sa, n, threads, thread_state);
6023}
6024
6025/// Internal helper: induce partial order 32s 2k (OpenMP variant).
6026#[doc(hidden)]
6027pub fn induce_partial_order_32s_2k_omp(
6028    t: &[SaSint],
6029    sa: &mut [SaSint],
6030    n: SaSint,
6031    k: SaSint,
6032    buckets: &mut [SaSint],
6033    threads: SaSint,
6034    thread_state: &mut [ThreadState],
6035) {
6036    let k_usize = usize::try_from(k).expect("k must be non-negative");
6037    let (left, right) = buckets.split_at_mut(k_usize);
6038    partial_sorting_scan_left_to_right_32s_1k_omp(t, sa, n, right, threads, thread_state);
6039    partial_sorting_scan_right_to_left_32s_1k_omp(t, sa, n, left, threads, thread_state);
6040    partial_sorting_gather_lms_suffixes_32s_1k_omp(sa, n, threads, thread_state);
6041}
6042
6043/// Internal helper: induce partial order 32s 1k (OpenMP variant).
6044#[doc(hidden)]
6045pub fn induce_partial_order_32s_1k_omp(
6046    t: &[SaSint],
6047    sa: &mut [SaSint],
6048    n: SaSint,
6049    k: SaSint,
6050    buckets: &mut [SaSint],
6051    threads: SaSint,
6052    thread_state: &mut [ThreadState],
6053) {
6054    count_suffixes_32s(t, n, k, buckets);
6055    initialize_buckets_start_32s_1k(k, buckets);
6056    partial_sorting_scan_left_to_right_32s_1k_omp(t, sa, n, buckets, threads, thread_state);
6057
6058    count_suffixes_32s(t, n, k, buckets);
6059    initialize_buckets_end_32s_1k(k, buckets);
6060    partial_sorting_scan_right_to_left_32s_1k_omp(t, sa, n, buckets, threads, thread_state);
6061
6062    partial_sorting_gather_lms_suffixes_32s_1k_omp(sa, n, threads, thread_state);
6063}
6064
6065/// Internal helper: renumber lms suffixes 8u.
6066#[doc(hidden)]
6067pub fn renumber_lms_suffixes_8u(
6068    sa: &mut [SaSint],
6069    m: SaSint,
6070    mut name: SaSint,
6071    omp_block_start: FastSint,
6072    omp_block_size: FastSint,
6073) -> SaSint {
6074    if omp_block_size <= 0 {
6075        return name;
6076    }
6077
6078    let m_usize = usize::try_from(m).expect("m must be non-negative");
6079    let (sa_head, sam) = sa.split_at_mut(m_usize);
6080    let mut i = omp_block_start;
6081    let mut j = omp_block_start + omp_block_size - 64 - 3;
6082
6083    while i < j {
6084        let i0 = i as usize;
6085        let p0 = sa_head[i0];
6086        let d0 = ((p0 & SAINT_MAX) >> 1) as usize;
6087        sam[d0] = name | SAINT_MIN;
6088        name += SaSint::from(p0 < 0);
6089
6090        let p1 = sa_head[i0 + 1];
6091        let d1 = ((p1 & SAINT_MAX) >> 1) as usize;
6092        sam[d1] = name | SAINT_MIN;
6093        name += SaSint::from(p1 < 0);
6094
6095        let p2 = sa_head[i0 + 2];
6096        let d2 = ((p2 & SAINT_MAX) >> 1) as usize;
6097        sam[d2] = name | SAINT_MIN;
6098        name += SaSint::from(p2 < 0);
6099
6100        let p3 = sa_head[i0 + 3];
6101        let d3 = ((p3 & SAINT_MAX) >> 1) as usize;
6102        sam[d3] = name | SAINT_MIN;
6103        name += SaSint::from(p3 < 0);
6104
6105        i += 4;
6106    }
6107
6108    j += 64 + 3;
6109    while i < j {
6110        let p = sa_head[i as usize];
6111        let d = ((p & SAINT_MAX) >> 1) as usize;
6112        sam[d] = name | SAINT_MIN;
6113        name += SaSint::from(p < 0);
6114        i += 1;
6115    }
6116
6117    name
6118}
6119
6120/// Internal helper: gather marked lms suffixes.
6121#[doc(hidden)]
6122pub fn gather_marked_lms_suffixes(
6123    sa: &mut [SaSint],
6124    m: SaSint,
6125    l: FastSint,
6126    omp_block_start: FastSint,
6127    omp_block_size: FastSint,
6128) -> FastSint {
6129    if omp_block_size <= 0 {
6130        return l;
6131    }
6132
6133    let mut l = l - 1;
6134    let mut i = m as FastSint + omp_block_start + omp_block_size - 1;
6135    let mut j = m as FastSint + omp_block_start + 3;
6136
6137    while i >= j {
6138        let i0 = i as usize;
6139        let s0 = sa[i0];
6140        sa[l as usize] = s0 & SAINT_MAX;
6141        l -= FastSint::from(s0 < 0);
6142
6143        let s1 = sa[i0 - 1];
6144        sa[l as usize] = s1 & SAINT_MAX;
6145        l -= FastSint::from(s1 < 0);
6146
6147        let s2 = sa[i0 - 2];
6148        sa[l as usize] = s2 & SAINT_MAX;
6149        l -= FastSint::from(s2 < 0);
6150
6151        let s3 = sa[i0 - 3];
6152        sa[l as usize] = s3 & SAINT_MAX;
6153        l -= FastSint::from(s3 < 0);
6154
6155        i -= 4;
6156    }
6157
6158    j -= 3;
6159    while i >= j {
6160        let s = sa[i as usize];
6161        sa[l as usize] = s & SAINT_MAX;
6162        l -= FastSint::from(s < 0);
6163        i -= 1;
6164    }
6165
6166    l + 1
6167}
6168
6169/// Internal helper: renumber lms suffixes 8u (OpenMP variant).
6170#[doc(hidden)]
6171pub fn renumber_lms_suffixes_8u_omp(
6172    sa: &mut [SaSint],
6173    m: SaSint,
6174    threads: SaSint,
6175    thread_state: &mut [ThreadState],
6176) -> SaSint {
6177    let mut name = 0;
6178    let omp_num_threads = if threads > 1 && m >= 65_536 {
6179        usize::try_from(threads)
6180            .expect("threads must be non-negative")
6181            .min(thread_state.len())
6182            .max(1)
6183    } else {
6184        1
6185    };
6186    let omp_block_stride = (m as FastSint / omp_num_threads as FastSint) & !15;
6187
6188    if omp_num_threads == 1 {
6189        name = renumber_lms_suffixes_8u(sa, m, 0, 0, m as FastSint);
6190    } else {
6191        for omp_thread_num in 0..omp_num_threads {
6192            let omp_block_start = omp_thread_num as FastSint * omp_block_stride;
6193            let omp_block_size = if omp_thread_num + 1 < omp_num_threads {
6194                omp_block_stride
6195            } else {
6196                m as FastSint - omp_block_start
6197            };
6198            thread_state[omp_thread_num].count =
6199                count_negative_marked_suffixes(sa, omp_block_start, omp_block_size) as FastSint;
6200        }
6201
6202        for omp_thread_num in 0..omp_num_threads {
6203            let omp_block_start = omp_thread_num as FastSint * omp_block_stride;
6204            let omp_block_size = if omp_thread_num + 1 < omp_num_threads {
6205                omp_block_stride
6206            } else {
6207                m as FastSint - omp_block_start
6208            };
6209
6210            let mut count: FastSint = 0;
6211            for t in 0..omp_thread_num {
6212                count += thread_state[t].count;
6213            }
6214
6215            if omp_thread_num + 1 == omp_num_threads {
6216                name = (count + thread_state[omp_thread_num].count) as SaSint;
6217            }
6218
6219            let _ =
6220                renumber_lms_suffixes_8u(sa, m, count as SaSint, omp_block_start, omp_block_size);
6221        }
6222    }
6223
6224    name
6225}
6226
6227/// Internal helper: gather marked lms suffixes (OpenMP variant).
6228#[doc(hidden)]
6229pub fn gather_marked_lms_suffixes_omp(
6230    sa: &mut [SaSint],
6231    n: SaSint,
6232    m: SaSint,
6233    fs: SaSint,
6234    threads: SaSint,
6235    thread_state: &mut [ThreadState],
6236) {
6237    let n_fast = n as FastSint;
6238    let m_fast = m as FastSint;
6239    let omp_num_threads = if threads > 1 && n >= 131_072 {
6240        usize::try_from(threads)
6241            .expect("threads must be non-negative")
6242            .min(thread_state.len())
6243            .max(1)
6244    } else {
6245        1
6246    };
6247    let omp_block_stride = ((n_fast >> 1) / omp_num_threads as FastSint) & !15;
6248
6249    if omp_num_threads == 1 {
6250        let _ = gather_marked_lms_suffixes(sa, m, n_fast + fs as FastSint, 0, n_fast >> 1);
6251    } else {
6252        for omp_thread_num in 0..omp_num_threads {
6253            let omp_block_start = omp_thread_num as FastSint * omp_block_stride;
6254            let omp_block_size = if omp_thread_num + 1 < omp_num_threads {
6255                omp_block_stride
6256            } else {
6257                (n_fast >> 1) - omp_block_start
6258            };
6259
6260            if omp_thread_num < omp_num_threads - 1 {
6261                thread_state[omp_thread_num].position = gather_marked_lms_suffixes(
6262                    sa,
6263                    m,
6264                    m_fast + omp_block_start + omp_block_size,
6265                    omp_block_start,
6266                    omp_block_size,
6267                );
6268                thread_state[omp_thread_num].count = m_fast + omp_block_start + omp_block_size
6269                    - thread_state[omp_thread_num].position;
6270            } else {
6271                thread_state[omp_thread_num].position = gather_marked_lms_suffixes(
6272                    sa,
6273                    m,
6274                    n_fast + fs as FastSint,
6275                    omp_block_start,
6276                    omp_block_size,
6277                );
6278                thread_state[omp_thread_num].count =
6279                    n_fast + fs as FastSint - thread_state[omp_thread_num].position;
6280            }
6281        }
6282
6283        let mut position = n_fast + fs as FastSint;
6284        for t in (0..omp_num_threads).rev() {
6285            position -= thread_state[t].count;
6286            if t + 1 != omp_num_threads && thread_state[t].count > 0 {
6287                let src = usize::try_from(thread_state[t].position)
6288                    .expect("position must be non-negative");
6289                let len =
6290                    usize::try_from(thread_state[t].count).expect("count must be non-negative");
6291                let dst = usize::try_from(position).expect("position must be non-negative");
6292                sa.copy_within(src..src + len, dst);
6293            }
6294        }
6295    }
6296}
6297
6298/// Internal helper: renumber and gather lms suffixes (OpenMP variant).
6299#[doc(hidden)]
6300pub fn renumber_and_gather_lms_suffixes_omp(
6301    sa: &mut [SaSint],
6302    n: SaSint,
6303    m: SaSint,
6304    fs: SaSint,
6305    threads: SaSint,
6306    thread_state: &mut [ThreadState],
6307) -> SaSint {
6308    let m_usize = usize::try_from(m).expect("m must be non-negative");
6309    let half_n = usize::try_from(n >> 1).expect("n must be non-negative");
6310    sa[m_usize..m_usize + half_n].fill(0);
6311
6312    let name = renumber_lms_suffixes_8u_omp(sa, m, threads, thread_state);
6313    if name < m {
6314        gather_marked_lms_suffixes_omp(sa, n, m, fs, threads, thread_state);
6315    } else {
6316        let mut i = 0;
6317        while i < m_usize {
6318            sa[i] &= SAINT_MAX;
6319            i += 1;
6320        }
6321    }
6322
6323    name
6324}
6325
6326/// Internal helper: renumber distinct lms suffixes 32s 4k.
6327#[doc(hidden)]
6328pub fn renumber_distinct_lms_suffixes_32s_4k(
6329    sa: &mut [SaSint],
6330    m: SaSint,
6331    mut name: SaSint,
6332    omp_block_start: FastSint,
6333    omp_block_size: FastSint,
6334) -> SaSint {
6335    if omp_block_size <= 0 {
6336        return name;
6337    }
6338
6339    let prefetch_distance = 64usize;
6340    let m_usize = usize::try_from(m).expect("m must be non-negative");
6341    let start = usize::try_from(omp_block_start).expect("omp_block_start must be non-negative");
6342    let size = usize::try_from(omp_block_size).expect("omp_block_size must be non-negative");
6343    let (sa_head, sam) = sa.split_at_mut(m_usize);
6344    let mut i = start;
6345    let mut j = start
6346        .saturating_add(size)
6347        .saturating_sub(prefetch_distance + 3);
6348    let mut p0;
6349    let mut p1;
6350    let mut p2;
6351    let mut p3 = 0;
6352
6353    while i < j {
6354        p0 = sa_head[i];
6355        sa_head[i] = p0 & SAINT_MAX;
6356        sam[(sa_head[i] >> 1) as usize] = name | (p0 & p3 & SAINT_MIN);
6357        name += SaSint::from(p0 < 0);
6358
6359        p1 = sa_head[i + 1];
6360        sa_head[i + 1] = p1 & SAINT_MAX;
6361        sam[(sa_head[i + 1] >> 1) as usize] = name | (p1 & p0 & SAINT_MIN);
6362        name += SaSint::from(p1 < 0);
6363
6364        p2 = sa_head[i + 2];
6365        sa_head[i + 2] = p2 & SAINT_MAX;
6366        sam[(sa_head[i + 2] >> 1) as usize] = name | (p2 & p1 & SAINT_MIN);
6367        name += SaSint::from(p2 < 0);
6368
6369        p3 = sa_head[i + 3];
6370        sa_head[i + 3] = p3 & SAINT_MAX;
6371        sam[(sa_head[i + 3] >> 1) as usize] = name | (p3 & p2 & SAINT_MIN);
6372        name += SaSint::from(p3 < 0);
6373
6374        i += 4;
6375    }
6376
6377    j = start + size;
6378    while i < j {
6379        p2 = p3;
6380        p3 = sa_head[i];
6381        sa_head[i] = p3 & SAINT_MAX;
6382        sam[(sa_head[i] >> 1) as usize] = name | (p3 & p2 & SAINT_MIN);
6383        name += SaSint::from(p3 < 0);
6384        i += 1;
6385    }
6386
6387    name
6388}
6389
6390/// Internal helper: mark distinct lms suffixes 32s.
6391#[doc(hidden)]
6392pub fn mark_distinct_lms_suffixes_32s(
6393    sa: &mut [SaSint],
6394    m: SaSint,
6395    omp_block_start: FastSint,
6396    omp_block_size: FastSint,
6397) {
6398    if omp_block_size <= 0 {
6399        return;
6400    }
6401
6402    let m_usize = usize::try_from(m).expect("m must be non-negative");
6403    let start = usize::try_from(omp_block_start).expect("omp_block_start must be non-negative");
6404    let size = usize::try_from(omp_block_size).expect("omp_block_size must be non-negative");
6405    let mut i = m_usize + start;
6406    let mut j = m_usize + start + size.saturating_sub(3);
6407    let mut p3 = 0;
6408
6409    while i < j {
6410        let mut p0 = sa[i];
6411        sa[i] = p0 & (p3 | SAINT_MAX);
6412        p0 = if p0 == 0 { p3 } else { p0 };
6413
6414        let mut p1 = sa[i + 1];
6415        sa[i + 1] = p1 & (p0 | SAINT_MAX);
6416        p1 = if p1 == 0 { p0 } else { p1 };
6417
6418        let mut p2 = sa[i + 2];
6419        sa[i + 2] = p2 & (p1 | SAINT_MAX);
6420        p2 = if p2 == 0 { p1 } else { p2 };
6421
6422        p3 = sa[i + 3];
6423        sa[i + 3] = p3 & (p2 | SAINT_MAX);
6424        p3 = if p3 == 0 { p2 } else { p3 };
6425
6426        i += 4;
6427    }
6428
6429    j = m_usize + start + size;
6430    while i < j {
6431        let p2 = p3;
6432        p3 = sa[i];
6433        sa[i] = p3 & (p2 | SAINT_MAX);
6434        p3 = if p3 == 0 { p2 } else { p3 };
6435        i += 1;
6436    }
6437}
6438
6439/// Internal helper: clamp lms suffixes length 32s.
6440#[doc(hidden)]
6441pub fn clamp_lms_suffixes_length_32s(
6442    sa: &mut [SaSint],
6443    m: SaSint,
6444    omp_block_start: FastSint,
6445    omp_block_size: FastSint,
6446) {
6447    if omp_block_size <= 0 {
6448        return;
6449    }
6450
6451    let m_usize = usize::try_from(m).expect("m must be non-negative");
6452    let start = usize::try_from(omp_block_start).expect("omp_block_start must be non-negative");
6453    let size = usize::try_from(omp_block_size).expect("omp_block_size must be non-negative");
6454    let mut i = m_usize + start;
6455    let mut j = m_usize + start + size.saturating_sub(3);
6456
6457    while i < j {
6458        let s0 = sa[i];
6459        sa[i] = if s0 < 0 { s0 } else { 0 } & SAINT_MAX;
6460
6461        let s1 = sa[i + 1];
6462        sa[i + 1] = if s1 < 0 { s1 } else { 0 } & SAINT_MAX;
6463
6464        let s2 = sa[i + 2];
6465        sa[i + 2] = if s2 < 0 { s2 } else { 0 } & SAINT_MAX;
6466
6467        let s3 = sa[i + 3];
6468        sa[i + 3] = if s3 < 0 { s3 } else { 0 } & SAINT_MAX;
6469
6470        i += 4;
6471    }
6472
6473    j = m_usize + start + size;
6474    while i < j {
6475        let s = sa[i];
6476        sa[i] = if s < 0 { s } else { 0 } & SAINT_MAX;
6477        i += 1;
6478    }
6479}
6480
6481/// Internal helper: renumber distinct lms suffixes 32s 4k (OpenMP variant).
6482#[doc(hidden)]
6483pub fn renumber_distinct_lms_suffixes_32s_4k_omp(
6484    sa: &mut [SaSint],
6485    m: SaSint,
6486    threads: SaSint,
6487    thread_state: &mut [ThreadState],
6488) -> SaSint {
6489    let mut name = 0;
6490    let m_usize = usize::try_from(m).expect("m must be non-negative");
6491    let omp_num_threads = if threads > 1 && m >= 65_536 {
6492        usize::try_from(threads)
6493            .expect("threads must be non-negative")
6494            .min(thread_state.len())
6495            .max(1)
6496    } else {
6497        1
6498    };
6499    let omp_block_stride = (m_usize / omp_num_threads) & !15usize;
6500
6501    if omp_num_threads == 1 {
6502        let omp_block_start = 0usize;
6503        let omp_block_size = m_usize - omp_block_start;
6504        name = renumber_distinct_lms_suffixes_32s_4k(
6505            sa,
6506            m,
6507            1,
6508            omp_block_start as FastSint,
6509            omp_block_size as FastSint,
6510        );
6511    } else {
6512        for omp_thread_num in 0..omp_num_threads {
6513            let omp_block_start = omp_thread_num * omp_block_stride;
6514            let omp_block_size = if omp_thread_num + 1 < omp_num_threads {
6515                omp_block_stride
6516            } else {
6517                m_usize - omp_block_start
6518            };
6519            thread_state[omp_thread_num].count = count_negative_marked_suffixes(
6520                sa,
6521                omp_block_start as FastSint,
6522                omp_block_size as FastSint,
6523            ) as FastSint;
6524        }
6525
6526        for omp_thread_num in 0..omp_num_threads {
6527            let omp_block_start = omp_thread_num * omp_block_stride;
6528            let omp_block_size = if omp_thread_num + 1 < omp_num_threads {
6529                omp_block_stride
6530            } else {
6531                m_usize - omp_block_start
6532            };
6533
6534            let mut count: FastSint = 1;
6535            for t in 0..omp_thread_num {
6536                count += thread_state[t].count;
6537            }
6538
6539            if omp_thread_num + 1 == omp_num_threads {
6540                name = (count + thread_state[omp_thread_num].count) as SaSint;
6541            }
6542
6543            let _ = renumber_distinct_lms_suffixes_32s_4k(
6544                sa,
6545                m,
6546                count as SaSint,
6547                omp_block_start as FastSint,
6548                omp_block_size as FastSint,
6549            );
6550        }
6551    }
6552
6553    name - 1
6554}
6555
6556/// Internal helper: mark distinct lms suffixes 32s (OpenMP variant).
6557#[doc(hidden)]
6558pub fn mark_distinct_lms_suffixes_32s_omp(
6559    sa: &mut [SaSint],
6560    n: SaSint,
6561    m: SaSint,
6562    threads: SaSint,
6563) {
6564    let half_n = usize::try_from(n >> 1).expect("n must be non-negative");
6565    let omp_num_threads = if threads > 1 && n >= 131_072 {
6566        usize::try_from(threads)
6567            .expect("threads must be non-negative")
6568            .max(1)
6569    } else {
6570        1
6571    };
6572    let omp_block_stride = (half_n / omp_num_threads) & !15usize;
6573
6574    for omp_thread_num in 0..omp_num_threads {
6575        let omp_block_start = omp_thread_num * omp_block_stride;
6576        let omp_block_size = if omp_thread_num + 1 < omp_num_threads {
6577            omp_block_stride
6578        } else {
6579            half_n - omp_block_start
6580        };
6581        mark_distinct_lms_suffixes_32s(
6582            sa,
6583            m,
6584            omp_block_start as FastSint,
6585            omp_block_size as FastSint,
6586        );
6587    }
6588}
6589
6590/// Internal helper: clamp lms suffixes length 32s (OpenMP variant).
6591#[doc(hidden)]
6592pub fn clamp_lms_suffixes_length_32s_omp(sa: &mut [SaSint], n: SaSint, m: SaSint, threads: SaSint) {
6593    let half_n = usize::try_from(n >> 1).expect("n must be non-negative");
6594    let omp_num_threads = if threads > 1 && n >= 131_072 {
6595        usize::try_from(threads)
6596            .expect("threads must be non-negative")
6597            .max(1)
6598    } else {
6599        1
6600    };
6601    let omp_block_stride = (half_n / omp_num_threads) & !15usize;
6602
6603    for omp_thread_num in 0..omp_num_threads {
6604        let omp_block_start = omp_thread_num * omp_block_stride;
6605        let omp_block_size = if omp_thread_num + 1 < omp_num_threads {
6606            omp_block_stride
6607        } else {
6608            half_n - omp_block_start
6609        };
6610        clamp_lms_suffixes_length_32s(
6611            sa,
6612            m,
6613            omp_block_start as FastSint,
6614            omp_block_size as FastSint,
6615        );
6616    }
6617}
6618
6619/// Internal helper: renumber and mark distinct lms suffixes 32s 4k (OpenMP variant).
6620#[doc(hidden)]
6621pub fn renumber_and_mark_distinct_lms_suffixes_32s_4k_omp(
6622    sa: &mut [SaSint],
6623    n: SaSint,
6624    m: SaSint,
6625    threads: SaSint,
6626    thread_state: &mut [ThreadState],
6627) -> SaSint {
6628    let m_usize = usize::try_from(m).expect("m must be non-negative");
6629    let half_n = usize::try_from(n >> 1).expect("n must be non-negative");
6630    sa[m_usize..m_usize + half_n].fill(0);
6631
6632    let name = renumber_distinct_lms_suffixes_32s_4k_omp(sa, m, threads, thread_state);
6633    if name < m {
6634        mark_distinct_lms_suffixes_32s_omp(sa, n, m, threads);
6635    }
6636
6637    name
6638}
6639
6640/// Internal helper: renumber and mark distinct lms suffixes 32s 1k (OpenMP variant).
6641#[doc(hidden)]
6642pub fn renumber_and_mark_distinct_lms_suffixes_32s_1k_omp(
6643    t: &[SaSint],
6644    sa: &mut [SaSint],
6645    n: SaSint,
6646    m: SaSint,
6647    threads: SaSint,
6648) -> SaSint {
6649    let m_usize = usize::try_from(m).expect("m must be non-negative");
6650    let n_usize = usize::try_from(n).expect("n must be non-negative");
6651
6652    let _ = gather_lms_suffixes_32s(t, sa, n);
6653
6654    let zero_len = n_usize
6655        .checked_sub(m_usize)
6656        .and_then(|v| v.checked_sub(m_usize))
6657        .expect("n must be at least 2*m");
6658    sa[m_usize..m_usize + zero_len].fill(0);
6659
6660    {
6661        let prefetch_distance: FastSint = 64;
6662        let mut i = n as FastSint - m as FastSint;
6663        let mut j = n as FastSint - 1 - prefetch_distance - 3;
6664
6665        while i < j {
6666            let iu = i as usize;
6667            let s0 = (sa[iu] as SaUint >> 1) as usize;
6668            let s1 = (sa[iu + 1] as SaUint >> 1) as usize;
6669            let s2 = (sa[iu + 2] as SaUint >> 1) as usize;
6670            let s3 = (sa[iu + 3] as SaUint >> 1) as usize;
6671
6672            sa[m_usize + s0] = sa[iu + 1] - sa[iu] + 1 + SAINT_MIN;
6673            sa[m_usize + s1] = sa[iu + 2] - sa[iu + 1] + 1 + SAINT_MIN;
6674            sa[m_usize + s2] = sa[iu + 3] - sa[iu + 2] + 1 + SAINT_MIN;
6675            sa[m_usize + s3] = sa[iu + 4] - sa[iu + 3] + 1 + SAINT_MIN;
6676            i += 4;
6677        }
6678
6679        j += prefetch_distance + 3;
6680        while i < j {
6681            let iu = i as usize;
6682            let s = (sa[iu] as SaUint >> 1) as usize;
6683            sa[m_usize + s] = sa[iu + 1] - sa[iu] + 1 + SAINT_MIN;
6684            i += 1;
6685        }
6686
6687        let tail = (sa[n_usize - 1] as SaUint >> 1) as usize;
6688        sa[m_usize + tail] = 1 + SAINT_MIN;
6689    }
6690
6691    clamp_lms_suffixes_length_32s_omp(sa, n, m, threads);
6692
6693    let mut name = 1;
6694    if m_usize > 0 {
6695        let (sa_head, sam) = sa.split_at_mut(m_usize);
6696        let mut i = 1usize;
6697        let prefetch_distance = 64usize;
6698        let mut j = m_usize.saturating_sub(prefetch_distance + 1);
6699        let mut p = usize::try_from(sa_head[0]).expect("suffix index must be non-negative");
6700        let mut plen = sam[p >> 1];
6701        let mut pdiff = SAINT_MIN;
6702
6703        while i < j {
6704            let q = usize::try_from(sa_head[i]).expect("suffix index must be non-negative");
6705            let qlen = sam[q >> 1];
6706            let mut qdiff = SAINT_MIN;
6707            if plen == qlen {
6708                let mut l = 0usize;
6709                while l < qlen as usize {
6710                    if t[p + l] != t[q + l] {
6711                        break;
6712                    }
6713                    l += 1;
6714                }
6715                qdiff = ((l as SaSint) - qlen) & SAINT_MIN;
6716            }
6717            sam[p >> 1] = name | (pdiff & qdiff);
6718            name += SaSint::from(qdiff < 0);
6719
6720            p = usize::try_from(sa_head[i + 1]).expect("suffix index must be non-negative");
6721            plen = sam[p >> 1];
6722            pdiff = SAINT_MIN;
6723            if qlen == plen {
6724                let mut l = 0usize;
6725                while l < plen as usize {
6726                    if t[q + l] != t[p + l] {
6727                        break;
6728                    }
6729                    l += 1;
6730                }
6731                pdiff = ((l as SaSint) - plen) & SAINT_MIN;
6732            }
6733            sam[q >> 1] = name | (qdiff & pdiff);
6734            name += SaSint::from(pdiff < 0);
6735            i += 2;
6736        }
6737
6738        j = m_usize;
6739        while i < j {
6740            let q = usize::try_from(sa_head[i]).expect("suffix index must be non-negative");
6741            let qlen = sam[q >> 1];
6742            let mut qdiff = SAINT_MIN;
6743            if plen == qlen {
6744                let mut l = 0usize;
6745                while l < plen as usize {
6746                    if t[p + l] != t[q + l] {
6747                        break;
6748                    }
6749                    l += 1;
6750                }
6751                qdiff = ((l as SaSint) - plen) & SAINT_MIN;
6752            }
6753            sam[p >> 1] = name | (pdiff & qdiff);
6754            name += SaSint::from(qdiff < 0);
6755
6756            p = q;
6757            plen = qlen;
6758            pdiff = qdiff;
6759            i += 1;
6760        }
6761
6762        sam[p >> 1] = name | pdiff;
6763        name += 1;
6764    }
6765
6766    if name <= m {
6767        mark_distinct_lms_suffixes_32s_omp(sa, n, m, threads);
6768    }
6769
6770    name - 1
6771}
6772
6773/// Internal helper: reconstruct lms suffixes.
6774#[doc(hidden)]
6775pub fn reconstruct_lms_suffixes(
6776    sa: &mut [SaSint],
6777    n: SaSint,
6778    m: SaSint,
6779    omp_block_start: FastSint,
6780    omp_block_size: FastSint,
6781) {
6782    if omp_block_size <= 0 {
6783        return;
6784    }
6785
6786    let prefetch_distance: FastSint = 64;
6787    let base = (n - m) as usize;
6788    let mut i = omp_block_start;
6789    let mut j = omp_block_start + omp_block_size - prefetch_distance - 3;
6790
6791    while i < j {
6792        let iu = i as usize;
6793        let s0 = sa[iu] as usize;
6794        let s1 = sa[iu + 1] as usize;
6795        let s2 = sa[iu + 2] as usize;
6796        let s3 = sa[iu + 3] as usize;
6797        sa[iu] = sa[base + s0];
6798        sa[iu + 1] = sa[base + s1];
6799        sa[iu + 2] = sa[base + s2];
6800        sa[iu + 3] = sa[base + s3];
6801        i += 4;
6802    }
6803
6804    j += prefetch_distance + 3;
6805    while i < j {
6806        let iu = i as usize;
6807        let s = sa[iu] as usize;
6808        sa[iu] = sa[base + s];
6809        i += 1;
6810    }
6811}
6812
6813/// Internal helper: reconstruct lms suffixes (OpenMP variant).
6814#[doc(hidden)]
6815pub fn reconstruct_lms_suffixes_omp(sa: &mut [SaSint], n: SaSint, m: SaSint, threads: SaSint) {
6816    let m_usize = usize::try_from(m).expect("m must be non-negative");
6817    let omp_num_threads = if threads > 1 && m >= 65_536 {
6818        usize::try_from(threads)
6819            .expect("threads must be non-negative")
6820            .max(1)
6821    } else {
6822        1
6823    };
6824    let omp_block_stride = (m_usize / omp_num_threads) & !15usize;
6825
6826    for omp_thread_num in 0..omp_num_threads {
6827        let omp_block_start = omp_thread_num * omp_block_stride;
6828        let omp_block_size = if omp_thread_num + 1 < omp_num_threads {
6829            omp_block_stride
6830        } else {
6831            m_usize - omp_block_start
6832        };
6833        reconstruct_lms_suffixes(
6834            sa,
6835            n,
6836            m,
6837            omp_block_start as FastSint,
6838            omp_block_size as FastSint,
6839        );
6840    }
6841}
6842
6843/// Internal helper: place lms suffixes interval 8u.
6844#[doc(hidden)]
6845pub fn place_lms_suffixes_interval_8u(
6846    sa: &mut [SaSint],
6847    n: SaSint,
6848    mut m: SaSint,
6849    flags: SaSint,
6850    buckets: &mut [SaSint],
6851) {
6852    let bucket_end_base = 7 * ALPHABET_SIZE;
6853    if (flags & LIBSAIS_FLAGS_GSA) != 0 {
6854        buckets[bucket_end_base] -= 1;
6855    }
6856
6857    let mut j = usize::try_from(n).expect("n must be non-negative");
6858    for c in (0..ALPHABET_SIZE - 1).rev() {
6859        let l = usize::try_from(
6860            buckets[buckets_index2(c, 1) + buckets_index2(1, 0)] - buckets[buckets_index2(c, 1)],
6861        )
6862        .expect("interval length must be non-negative");
6863        if l > 0 {
6864            let i = usize::try_from(buckets[bucket_end_base + c])
6865                .expect("bucket end must be non-negative");
6866            if j > i {
6867                sa[i..j].fill(0);
6868            }
6869
6870            let new_j = i - l;
6871            let src_end = usize::try_from(m).expect("m must be non-negative");
6872            let src_start = src_end - l;
6873            sa.copy_within(src_start..src_end, new_j);
6874            m -= l as SaSint;
6875            j = new_j;
6876        }
6877    }
6878
6879    sa[..j].fill(0);
6880
6881    if (flags & LIBSAIS_FLAGS_GSA) != 0 {
6882        buckets[bucket_end_base] += 1;
6883    }
6884}
6885
6886/// Internal helper: place lms suffixes interval 32s 4k.
6887#[doc(hidden)]
6888pub fn place_lms_suffixes_interval_32s_4k(
6889    sa: &mut [SaSint],
6890    n: SaSint,
6891    k: SaSint,
6892    mut m: SaSint,
6893    buckets: &[SaSint],
6894) {
6895    let k_usize = usize::try_from(k).expect("k must be non-negative");
6896    let bucket_end = &buckets[3 * k_usize..4 * k_usize];
6897
6898    let mut j = usize::try_from(n).expect("n must be non-negative");
6899    for c in (0..k_usize - 1).rev() {
6900        let l = usize::try_from(
6901            buckets[buckets_index2(c, 1) + buckets_index2(1, 0)] - buckets[buckets_index2(c, 1)],
6902        )
6903        .expect("interval length must be non-negative");
6904        if l > 0 {
6905            let i = usize::try_from(bucket_end[c]).expect("bucket end must be non-negative");
6906            if j > i {
6907                sa[i..j].fill(0);
6908            }
6909
6910            let new_j = i - l;
6911            let src_end = usize::try_from(m).expect("m must be non-negative");
6912            let src_start = src_end - l;
6913            sa.copy_within(src_start..src_end, new_j);
6914            m -= l as SaSint;
6915            j = new_j;
6916        }
6917    }
6918
6919    sa[..j].fill(0);
6920}
6921
6922/// Internal helper: place lms suffixes interval 32s 2k.
6923#[doc(hidden)]
6924pub fn place_lms_suffixes_interval_32s_2k(
6925    sa: &mut [SaSint],
6926    n: SaSint,
6927    k: SaSint,
6928    mut m: SaSint,
6929    buckets: &[SaSint],
6930) {
6931    let k_usize = usize::try_from(k).expect("k must be non-negative");
6932    let mut j = usize::try_from(n).expect("n must be non-negative");
6933
6934    if k_usize > 1 {
6935        let mut c = buckets_index2(k_usize - 2, 0) as isize;
6936        while c >= buckets_index2(0, 0) as isize {
6937            let c_usize = c as usize;
6938            let l = usize::try_from(
6939                buckets[c_usize + buckets_index2(1, 1)] - buckets[c_usize + buckets_index2(0, 1)],
6940            )
6941            .expect("interval length must be non-negative");
6942            if l > 0 {
6943                let i =
6944                    usize::try_from(buckets[c_usize]).expect("bucket start must be non-negative");
6945                if j > i {
6946                    sa[i..j].fill(0);
6947                }
6948
6949                let new_j = i - l;
6950                let src_end = usize::try_from(m).expect("m must be non-negative");
6951                let src_start = src_end - l;
6952                sa.copy_within(src_start..src_end, new_j);
6953                m -= l as SaSint;
6954                j = new_j;
6955            }
6956            c -= buckets_index2(1, 0) as isize;
6957        }
6958    }
6959
6960    sa[..j].fill(0);
6961}
6962
6963/// Internal helper: place lms suffixes interval 32s 1k.
6964#[doc(hidden)]
6965pub fn place_lms_suffixes_interval_32s_1k(
6966    t: &[SaSint],
6967    sa: &mut [SaSint],
6968    k: SaSint,
6969    m: SaSint,
6970    buckets: &[SaSint],
6971) {
6972    let mut c = k - 1;
6973    let c_usize = usize::try_from(c).expect("k must be positive");
6974    let mut l = usize::try_from(buckets[c_usize]).expect("bucket end must be non-negative");
6975
6976    let m_usize = usize::try_from(m).expect("m must be non-negative");
6977    for i in (0..m_usize).rev() {
6978        let p = usize::try_from(sa[i]).expect("suffix index must be non-negative");
6979        let tp = t[p];
6980        if tp != c {
6981            c = tp;
6982            let bucket = usize::try_from(c).expect("bucket index must be non-negative");
6983            let bucket_pos =
6984                usize::try_from(buckets[bucket]).expect("bucket end must be non-negative");
6985            if l > bucket_pos {
6986                sa[bucket_pos..l].fill(0);
6987            }
6988            l = bucket_pos;
6989        }
6990        l -= 1;
6991        sa[l] = p as SaSint;
6992    }
6993
6994    sa[..l].fill(0);
6995}
6996
6997/// Internal helper: place lms suffixes histogram 32s 6k.
6998#[doc(hidden)]
6999pub fn place_lms_suffixes_histogram_32s_6k(
7000    sa: &mut [SaSint],
7001    n: SaSint,
7002    k: SaSint,
7003    mut m: SaSint,
7004    buckets: &[SaSint],
7005) {
7006    let k_usize = usize::try_from(k).expect("k must be non-negative");
7007    let bucket_end = &buckets[5 * k_usize..6 * k_usize];
7008
7009    let mut j = usize::try_from(n).expect("n must be non-negative");
7010    for c in (0..k_usize - 1).rev() {
7011        let l = usize::try_from(buckets[buckets_index4(c, 1)])
7012            .expect("histogram length must be non-negative");
7013        if l > 0 {
7014            let i = usize::try_from(bucket_end[c]).expect("bucket end must be non-negative");
7015            if j > i {
7016                sa[i..j].fill(0);
7017            }
7018
7019            let new_j = i - l;
7020            let src_end = usize::try_from(m).expect("m must be non-negative");
7021            let src_start = src_end - l;
7022            sa.copy_within(src_start..src_end, new_j);
7023            m -= l as SaSint;
7024            j = new_j;
7025        }
7026    }
7027
7028    sa[..j].fill(0);
7029}
7030
7031/// Internal helper: place lms suffixes histogram 32s 4k.
7032#[doc(hidden)]
7033pub fn place_lms_suffixes_histogram_32s_4k(
7034    sa: &mut [SaSint],
7035    n: SaSint,
7036    k: SaSint,
7037    mut m: SaSint,
7038    buckets: &[SaSint],
7039) {
7040    let k_usize = usize::try_from(k).expect("k must be non-negative");
7041    let bucket_end = &buckets[3 * k_usize..4 * k_usize];
7042
7043    let mut j = usize::try_from(n).expect("n must be non-negative");
7044    for c in (0..k_usize - 1).rev() {
7045        let l = usize::try_from(buckets[buckets_index2(c, 1)])
7046            .expect("histogram length must be non-negative");
7047        if l > 0 {
7048            let i = usize::try_from(bucket_end[c]).expect("bucket end must be non-negative");
7049            if j > i {
7050                sa[i..j].fill(0);
7051            }
7052
7053            let new_j = i - l;
7054            let src_end = usize::try_from(m).expect("m must be non-negative");
7055            let src_start = src_end - l;
7056            sa.copy_within(src_start..src_end, new_j);
7057            m -= l as SaSint;
7058            j = new_j;
7059        }
7060    }
7061
7062    sa[..j].fill(0);
7063}
7064
7065/// Internal helper: place lms suffixes histogram 32s 2k.
7066#[doc(hidden)]
7067pub fn place_lms_suffixes_histogram_32s_2k(
7068    sa: &mut [SaSint],
7069    n: SaSint,
7070    k: SaSint,
7071    mut m: SaSint,
7072    buckets: &[SaSint],
7073) {
7074    let k_usize = usize::try_from(k).expect("k must be non-negative");
7075    let mut j = usize::try_from(n).expect("n must be non-negative");
7076
7077    if k_usize > 1 {
7078        let mut c = buckets_index2(k_usize - 2, 0) as isize;
7079        while c >= buckets_index2(0, 0) as isize {
7080            let c_usize = c as usize;
7081            let l = usize::try_from(buckets[c_usize + buckets_index2(0, 1)])
7082                .expect("histogram length must be non-negative");
7083            if l > 0 {
7084                let i =
7085                    usize::try_from(buckets[c_usize]).expect("bucket start must be non-negative");
7086                if j > i {
7087                    sa[i..j].fill(0);
7088                }
7089
7090                let new_j = i - l;
7091                let src_end = usize::try_from(m).expect("m must be non-negative");
7092                let src_start = src_end - l;
7093                sa.copy_within(src_start..src_end, new_j);
7094                m -= l as SaSint;
7095                j = new_j;
7096            }
7097            c -= buckets_index2(1, 0) as isize;
7098        }
7099    }
7100
7101    sa[..j].fill(0);
7102}
7103
7104/// Internal helper: final bwt scan left to right 8u.
7105#[doc(hidden)]
7106pub fn final_bwt_scan_left_to_right_8u(
7107    t: &[u8],
7108    sa: &mut [SaSint],
7109    induction_bucket: &mut [SaSint],
7110    omp_block_start: FastSint,
7111    omp_block_size: FastSint,
7112) {
7113    if omp_block_size <= 0 {
7114        return;
7115    }
7116
7117    let start = usize::try_from(omp_block_start).expect("omp_block_start must be non-negative");
7118    let size = usize::try_from(omp_block_size).expect("omp_block_size must be non-negative");
7119    for i in start..start + size {
7120        let mut p = sa[i];
7121        sa[i] = p & SAINT_MAX;
7122        if p > 0 {
7123            p -= 1;
7124            let p_usize = usize::try_from(p).expect("suffix index must be non-negative");
7125            sa[i] = t[p_usize] as SaSint | SAINT_MIN;
7126            let bucket = t[p_usize] as usize;
7127            let slot = usize::try_from(induction_bucket[bucket])
7128                .expect("bucket slot must be non-negative");
7129            sa[slot] = p
7130                | ((usize::from(t[p_usize - usize::from(p > 0)] < t[p_usize]) as SaSint)
7131                    << (SAINT_BIT - 1));
7132            induction_bucket[bucket] += 1;
7133        }
7134    }
7135}
7136
7137/// Internal helper: final bwt aux scan left to right 8u.
7138#[doc(hidden)]
7139pub fn final_bwt_aux_scan_left_to_right_8u(
7140    t: &[u8],
7141    sa: &mut [SaSint],
7142    rm: SaSint,
7143    i_out: &mut [SaSint],
7144    induction_bucket: &mut [SaSint],
7145    omp_block_start: FastSint,
7146    omp_block_size: FastSint,
7147) {
7148    if omp_block_size <= 0 {
7149        return;
7150    }
7151
7152    let start = usize::try_from(omp_block_start).expect("omp_block_start must be non-negative");
7153    let size = usize::try_from(omp_block_size).expect("omp_block_size must be non-negative");
7154    for i in start..start + size {
7155        let mut p = sa[i];
7156        sa[i] = p & SAINT_MAX;
7157        if p > 0 {
7158            p -= 1;
7159            let p_usize = usize::try_from(p).expect("suffix index must be non-negative");
7160            sa[i] = t[p_usize] as SaSint | SAINT_MIN;
7161            let bucket = t[p_usize] as usize;
7162            let slot = usize::try_from(induction_bucket[bucket])
7163                .expect("bucket slot must be non-negative");
7164            sa[slot] = p
7165                | ((usize::from(t[p_usize - usize::from(p > 0)] < t[p_usize]) as SaSint)
7166                    << (SAINT_BIT - 1));
7167            induction_bucket[bucket] += 1;
7168            if (p & rm) == 0 {
7169                let out_idx =
7170                    usize::try_from(p / (rm + 1)).expect("sample index must be non-negative");
7171                i_out[out_idx] = induction_bucket[bucket];
7172            }
7173        }
7174    }
7175}
7176
7177/// Internal helper: final sorting scan left to right 8u.
7178#[doc(hidden)]
7179pub fn final_sorting_scan_left_to_right_8u(
7180    t: &[u8],
7181    sa: &mut [SaSint],
7182    induction_bucket: &mut [SaSint],
7183    omp_block_start: FastSint,
7184    omp_block_size: FastSint,
7185) {
7186    if omp_block_size <= 0 {
7187        return;
7188    }
7189
7190    let prefetch_distance = 64usize;
7191    let start = usize::try_from(omp_block_start).expect("omp_block_start must be non-negative");
7192    let size = usize::try_from(omp_block_size).expect("omp_block_size must be non-negative");
7193
7194    let mut i = start;
7195    let mut j = if size > prefetch_distance + 1 {
7196        start + size - (prefetch_distance + 1)
7197    } else {
7198        start
7199    };
7200    while i < j {
7201        let mut p0 = sa[i];
7202        sa[i] = p0 ^ SAINT_MIN;
7203        if p0 > 0 {
7204            p0 -= 1;
7205            let p0_usize = p0 as usize;
7206            let bucket0 = t[p0_usize] as usize;
7207            let slot0 = induction_bucket[bucket0] as usize;
7208            sa[slot0] = p0
7209                | ((usize::from(t[p0_usize - usize::from(p0 > 0)] < t[p0_usize]) as SaSint)
7210                    << (SAINT_BIT - 1));
7211            induction_bucket[bucket0] += 1;
7212        }
7213
7214        let mut p1 = sa[i + 1];
7215        sa[i + 1] = p1 ^ SAINT_MIN;
7216        if p1 > 0 {
7217            p1 -= 1;
7218            let p1_usize = p1 as usize;
7219            let bucket1 = t[p1_usize] as usize;
7220            let slot1 = induction_bucket[bucket1] as usize;
7221            sa[slot1] = p1
7222                | ((usize::from(t[p1_usize - usize::from(p1 > 0)] < t[p1_usize]) as SaSint)
7223                    << (SAINT_BIT - 1));
7224            induction_bucket[bucket1] += 1;
7225        }
7226
7227        i += 2;
7228    }
7229
7230    j = start + size;
7231    while i < j {
7232        let mut p = sa[i];
7233        sa[i] = p ^ SAINT_MIN;
7234        if p > 0 {
7235            p -= 1;
7236            let p_usize = p as usize;
7237            let bucket = t[p_usize] as usize;
7238            let slot = induction_bucket[bucket] as usize;
7239            sa[slot] = p
7240                | ((usize::from(t[p_usize - usize::from(p > 0)] < t[p_usize]) as SaSint)
7241                    << (SAINT_BIT - 1));
7242            induction_bucket[bucket] += 1;
7243        }
7244        i += 1;
7245    }
7246}
7247
7248/// Internal helper: final sorting scan left to right 32s.
7249#[doc(hidden)]
7250pub fn final_sorting_scan_left_to_right_32s(
7251    t: &[SaSint],
7252    sa: &mut [SaSint],
7253    induction_bucket: &mut [SaSint],
7254    omp_block_start: FastSint,
7255    omp_block_size: FastSint,
7256) {
7257    if omp_block_size <= 0 {
7258        return;
7259    }
7260
7261    let prefetch_distance: FastSint = 64;
7262    let mut i = omp_block_start;
7263    let mut j = omp_block_start + omp_block_size - 2 * prefetch_distance - 1;
7264
7265    while i < j {
7266        let i0 = i as usize;
7267        let mut p0 = sa[i0];
7268        sa[i0] = p0 ^ SAINT_MIN;
7269        if p0 > 0 {
7270            p0 -= 1;
7271            let p0u = p0 as usize;
7272            let bucket0 = t[p0u] as usize;
7273            let slot0 = induction_bucket[bucket0] as usize;
7274            sa[slot0] = p0
7275                | ((usize::from(t[p0u - usize::from(p0 > 0)] < t[p0u]) as SaSint)
7276                    << (SAINT_BIT - 1));
7277            induction_bucket[bucket0] += 1;
7278        }
7279
7280        let i1 = (i + 1) as usize;
7281        let mut p1 = sa[i1];
7282        sa[i1] = p1 ^ SAINT_MIN;
7283        if p1 > 0 {
7284            p1 -= 1;
7285            let p1u = p1 as usize;
7286            let bucket1 = t[p1u] as usize;
7287            let slot1 = induction_bucket[bucket1] as usize;
7288            sa[slot1] = p1
7289                | ((usize::from(t[p1u - usize::from(p1 > 0)] < t[p1u]) as SaSint)
7290                    << (SAINT_BIT - 1));
7291            induction_bucket[bucket1] += 1;
7292        }
7293        i += 2;
7294    }
7295
7296    j += 2 * prefetch_distance + 1;
7297    while i < j {
7298        let iu = i as usize;
7299        let mut p = sa[iu];
7300        sa[iu] = p ^ SAINT_MIN;
7301        if p > 0 {
7302            p -= 1;
7303            let pu = p as usize;
7304            let bucket = t[pu] as usize;
7305            let slot = induction_bucket[bucket] as usize;
7306            sa[slot] = p
7307                | ((usize::from(t[pu - usize::from(p > 0)] < t[pu]) as SaSint) << (SAINT_BIT - 1));
7308            induction_bucket[bucket] += 1;
7309        }
7310        i += 1;
7311    }
7312}
7313
7314/// Internal helper: final bwt scan left to right 8u block prepare.
7315#[doc(hidden)]
7316pub fn final_bwt_scan_left_to_right_8u_block_prepare(
7317    t: &[u8],
7318    sa: &mut [SaSint],
7319    k: SaSint,
7320    buckets: &mut [SaSint],
7321    cache: &mut [ThreadCache],
7322    omp_block_start: FastSint,
7323    omp_block_size: FastSint,
7324) -> FastSint {
7325    if omp_block_size <= 0 {
7326        return 0;
7327    }
7328
7329    let k_usize = usize::try_from(k).expect("k must be non-negative");
7330    buckets[..k_usize].fill(0);
7331
7332    let start = usize::try_from(omp_block_start).expect("omp_block_start must be non-negative");
7333    let size = usize::try_from(omp_block_size).expect("omp_block_size must be non-negative");
7334    let mut count = 0usize;
7335    for i in start..start + size {
7336        let mut p = sa[i];
7337        sa[i] = p & SAINT_MAX;
7338        if p > 0 {
7339            p -= 1;
7340            let p_usize = usize::try_from(p).expect("suffix index must be non-negative");
7341            let symbol = t[p_usize] as usize;
7342            sa[i] = t[p_usize] as SaSint | SAINT_MIN;
7343            buckets[symbol] += 1;
7344            cache[count].symbol = symbol as SaSint;
7345            cache[count].index = p
7346                | ((usize::from(t[p_usize - usize::from(p > 0)] < t[p_usize]) as SaSint)
7347                    << (SAINT_BIT - 1));
7348            count += 1;
7349        }
7350    }
7351
7352    count as FastSint
7353}
7354
7355/// Internal helper: final sorting scan left to right 8u block prepare.
7356#[doc(hidden)]
7357pub fn final_sorting_scan_left_to_right_8u_block_prepare(
7358    t: &[u8],
7359    sa: &mut [SaSint],
7360    k: SaSint,
7361    buckets: &mut [SaSint],
7362    cache: &mut [ThreadCache],
7363    omp_block_start: FastSint,
7364    omp_block_size: FastSint,
7365) -> FastSint {
7366    if omp_block_size <= 0 {
7367        return 0;
7368    }
7369
7370    let k_usize = usize::try_from(k).expect("k must be non-negative");
7371    buckets[..k_usize].fill(0);
7372
7373    let start = usize::try_from(omp_block_start).expect("omp_block_start must be non-negative");
7374    let size = usize::try_from(omp_block_size).expect("omp_block_size must be non-negative");
7375    let mut count = 0usize;
7376    for i in start..start + size {
7377        let mut p = sa[i];
7378        sa[i] = p ^ SAINT_MIN;
7379        if p > 0 {
7380            p -= 1;
7381            let p_usize = usize::try_from(p).expect("suffix index must be non-negative");
7382            let symbol = t[p_usize] as usize;
7383            buckets[symbol] += 1;
7384            cache[count].symbol = symbol as SaSint;
7385            cache[count].index = p
7386                | ((usize::from(t[p_usize - usize::from(p > 0)] < t[p_usize]) as SaSint)
7387                    << (SAINT_BIT - 1));
7388            count += 1;
7389        }
7390    }
7391
7392    count as FastSint
7393}
7394
7395/// Internal helper: final order scan left to right 8u block place.
7396#[doc(hidden)]
7397pub fn final_order_scan_left_to_right_8u_block_place(
7398    sa: &mut [SaSint],
7399    buckets: &mut [SaSint],
7400    cache: &[ThreadCache],
7401    count: FastSint,
7402) {
7403    if count <= 0 {
7404        return;
7405    }
7406
7407    let count_usize = usize::try_from(count).expect("count must be non-negative");
7408    for entry in &cache[..count_usize] {
7409        let symbol = usize::try_from(entry.symbol).expect("cache symbol must be non-negative");
7410        let slot = usize::try_from(buckets[symbol]).expect("bucket slot must be non-negative");
7411        sa[slot] = entry.index;
7412        buckets[symbol] += 1;
7413    }
7414}
7415
7416/// Internal helper: final bwt aux scan left to right 8u block place.
7417#[doc(hidden)]
7418pub fn final_bwt_aux_scan_left_to_right_8u_block_place(
7419    sa: &mut [SaSint],
7420    rm: SaSint,
7421    i_out: &mut [SaSint],
7422    buckets: &mut [SaSint],
7423    cache: &[ThreadCache],
7424    count: FastSint,
7425) {
7426    if count <= 0 {
7427        return;
7428    }
7429
7430    let count_usize = usize::try_from(count).expect("count must be non-negative");
7431    for entry in &cache[..count_usize] {
7432        let symbol = usize::try_from(entry.symbol).expect("cache symbol must be non-negative");
7433        let slot = usize::try_from(buckets[symbol]).expect("bucket slot must be non-negative");
7434        sa[slot] = entry.index;
7435        buckets[symbol] += 1;
7436        if (entry.index & rm) == 0 {
7437            let sample_index = usize::try_from((entry.index & SAINT_MAX) / (rm + 1))
7438                .expect("sample index must be non-negative");
7439            i_out[sample_index] = buckets[symbol];
7440        }
7441    }
7442}
7443
7444/// Internal helper: final sorting scan left to right 32s block gather.
7445#[doc(hidden)]
7446pub fn final_sorting_scan_left_to_right_32s_block_gather(
7447    t: &[SaSint],
7448    sa: &mut [SaSint],
7449    cache: &mut [ThreadCache],
7450    omp_block_start: FastSint,
7451    omp_block_size: FastSint,
7452) {
7453    if omp_block_size <= 0 {
7454        return;
7455    }
7456    let start = usize::try_from(omp_block_start).expect("omp_block_start must be non-negative");
7457    let size = usize::try_from(omp_block_size).expect("omp_block_size must be non-negative");
7458    for offset in 0..size {
7459        let i = start + offset;
7460        let mut symbol = SAINT_MIN;
7461        let mut p = sa[i];
7462        sa[i] = p ^ SAINT_MIN;
7463        if p > 0 {
7464            p -= 1;
7465            let p_usize = p as usize;
7466            cache[offset].index = p
7467                | ((usize::from(t[p_usize - usize::from(p > 0)] < t[p_usize]) as SaSint)
7468                    << (SAINT_BIT - 1));
7469            symbol = t[p_usize];
7470        }
7471        cache[offset].symbol = symbol;
7472    }
7473}
7474
7475/// Internal helper: final sorting scan left to right 32s block sort.
7476#[doc(hidden)]
7477pub fn final_sorting_scan_left_to_right_32s_block_sort(
7478    t: &[SaSint],
7479    induction_bucket: &mut [SaSint],
7480    cache: &mut [ThreadCache],
7481    omp_block_start: FastSint,
7482    omp_block_size: FastSint,
7483) {
7484    if omp_block_size <= 0 {
7485        return;
7486    }
7487    let prefetch_distance = 64usize;
7488    let start = omp_block_start as usize;
7489    let block_end = start + omp_block_size as usize;
7490    let mut i = start;
7491    let mut j = start + (omp_block_size as usize).saturating_sub(prefetch_distance + 1);
7492
7493    while i < j {
7494        let ci = i - start;
7495        let v0 = cache[ci].symbol;
7496        if v0 >= 0 {
7497            let bucket_index0 = v0 as usize;
7498            cache[ci].symbol = induction_bucket[bucket_index0];
7499            induction_bucket[bucket_index0] += 1;
7500            if cache[ci].symbol < block_end as SaSint {
7501                let ni = cache[ci].symbol as usize;
7502                let cni = ni - start;
7503                let mut np = cache[ci].index;
7504                cache[ci].index = np ^ SAINT_MIN;
7505                if np > 0 {
7506                    np -= 1;
7507                    let np_usize = np as usize;
7508                    cache[cni].index = np
7509                        | ((usize::from(t[np_usize - usize::from(np > 0)] < t[np_usize])
7510                            as SaSint)
7511                            << (SAINT_BIT - 1));
7512                    cache[cni].symbol = t[np_usize];
7513                }
7514            }
7515        }
7516
7517        let i1 = i + 1;
7518        let ci1 = i1 - start;
7519        let v1 = cache[ci1].symbol;
7520        if v1 >= 0 {
7521            let bucket_index1 = v1 as usize;
7522            cache[ci1].symbol = induction_bucket[bucket_index1];
7523            induction_bucket[bucket_index1] += 1;
7524            if cache[ci1].symbol < block_end as SaSint {
7525                let ni = cache[ci1].symbol as usize;
7526                let cni = ni - start;
7527                let mut np = cache[ci1].index;
7528                cache[ci1].index = np ^ SAINT_MIN;
7529                if np > 0 {
7530                    np -= 1;
7531                    let np_usize = np as usize;
7532                    cache[cni].index = np
7533                        | ((usize::from(t[np_usize - usize::from(np > 0)] < t[np_usize])
7534                            as SaSint)
7535                            << (SAINT_BIT - 1));
7536                    cache[cni].symbol = t[np_usize];
7537                }
7538            }
7539        }
7540
7541        i += 2;
7542    }
7543
7544    j = block_end;
7545    while i < j {
7546        let ci = i - start;
7547        let v = cache[ci].symbol;
7548        if v >= 0 {
7549            let bucket_index = v as usize;
7550            cache[ci].symbol = induction_bucket[bucket_index];
7551            induction_bucket[bucket_index] += 1;
7552            if cache[ci].symbol < block_end as SaSint {
7553                let ni = cache[ci].symbol as usize;
7554                let cni = ni - start;
7555                let mut np = cache[ci].index;
7556                cache[ci].index = np ^ SAINT_MIN;
7557                if np > 0 {
7558                    np -= 1;
7559                    let np_usize = np as usize;
7560                    cache[cni].index = np
7561                        | ((usize::from(t[np_usize - usize::from(np > 0)] < t[np_usize])
7562                            as SaSint)
7563                            << (SAINT_BIT - 1));
7564                    cache[cni].symbol = t[np_usize];
7565                }
7566            }
7567        }
7568        i += 1;
7569    }
7570}
7571
7572/// Internal helper: final bwt scan left to right 8u block (OpenMP variant).
7573#[doc(hidden)]
7574pub fn final_bwt_scan_left_to_right_8u_block_omp(
7575    t: &[u8],
7576    sa: &mut [SaSint],
7577    k: SaSint,
7578    induction_bucket: &mut [SaSint],
7579    block_start: FastSint,
7580    block_size: FastSint,
7581    threads: SaSint,
7582    thread_state: &mut [ThreadState],
7583) {
7584    if block_size <= 0 {
7585        return;
7586    }
7587
7588    let k_usize = usize::try_from(k).expect("k must be non-negative");
7589    let block_size_usize = usize::try_from(block_size).expect("block_size must be non-negative");
7590    let omp_num_threads = if threads > 1 && block_size_usize >= 64 * k_usize.max(256) {
7591        usize::try_from(threads)
7592            .expect("threads must be non-negative")
7593            .min(thread_state.len())
7594            .max(1)
7595    } else {
7596        1
7597    };
7598
7599    if omp_num_threads == 1 {
7600        final_bwt_scan_left_to_right_8u(t, sa, induction_bucket, block_start, block_size);
7601        return;
7602    }
7603
7604    let block_start_usize = usize::try_from(block_start).expect("block_start must be non-negative");
7605    let omp_block_stride = (block_size_usize / omp_num_threads) & !15usize;
7606    for (thread_num, state) in thread_state.iter_mut().take(omp_num_threads).enumerate() {
7607        let relative_start = thread_num * omp_block_stride;
7608        let size = if thread_num + 1 < omp_num_threads {
7609            omp_block_stride
7610        } else {
7611            block_size_usize - relative_start
7612        };
7613        state.count = final_bwt_scan_left_to_right_8u_block_prepare(
7614            t,
7615            sa,
7616            k,
7617            &mut state.buckets,
7618            &mut state.cache,
7619            (block_start_usize + relative_start) as FastSint,
7620            size as FastSint,
7621        );
7622    }
7623
7624    for state in thread_state.iter_mut().take(omp_num_threads) {
7625        for (c, bucket) in induction_bucket.iter_mut().take(k_usize).enumerate() {
7626            let a = *bucket;
7627            let b = state.buckets[c];
7628            *bucket = a + b;
7629            state.buckets[c] = a;
7630        }
7631    }
7632
7633    for state in thread_state.iter_mut().take(omp_num_threads) {
7634        final_order_scan_left_to_right_8u_block_place(
7635            sa,
7636            &mut state.buckets,
7637            &state.cache,
7638            state.count,
7639        );
7640    }
7641}
7642
7643/// Internal helper: final bwt aux scan left to right 8u block (OpenMP variant).
7644#[doc(hidden)]
7645pub fn final_bwt_aux_scan_left_to_right_8u_block_omp(
7646    t: &[u8],
7647    sa: &mut [SaSint],
7648    k: SaSint,
7649    rm: SaSint,
7650    i_out: &mut [SaSint],
7651    induction_bucket: &mut [SaSint],
7652    block_start: FastSint,
7653    block_size: FastSint,
7654    threads: SaSint,
7655    thread_state: &mut [ThreadState],
7656) {
7657    if block_size <= 0 {
7658        return;
7659    }
7660
7661    let k_usize = usize::try_from(k).expect("k must be non-negative");
7662    let block_size_usize = usize::try_from(block_size).expect("block_size must be non-negative");
7663    let omp_num_threads = if threads > 1 && block_size_usize >= 64 * k_usize.max(256) {
7664        usize::try_from(threads)
7665            .expect("threads must be non-negative")
7666            .min(thread_state.len())
7667            .max(1)
7668    } else {
7669        1
7670    };
7671
7672    if omp_num_threads == 1 {
7673        final_bwt_aux_scan_left_to_right_8u(
7674            t,
7675            sa,
7676            rm,
7677            i_out,
7678            induction_bucket,
7679            block_start,
7680            block_size,
7681        );
7682        return;
7683    }
7684
7685    let block_start_usize = usize::try_from(block_start).expect("block_start must be non-negative");
7686    let omp_block_stride = (block_size_usize / omp_num_threads) & !15usize;
7687    for (thread_num, state) in thread_state.iter_mut().take(omp_num_threads).enumerate() {
7688        let relative_start = thread_num * omp_block_stride;
7689        let size = if thread_num + 1 < omp_num_threads {
7690            omp_block_stride
7691        } else {
7692            block_size_usize - relative_start
7693        };
7694        state.count = final_bwt_scan_left_to_right_8u_block_prepare(
7695            t,
7696            sa,
7697            k,
7698            &mut state.buckets,
7699            &mut state.cache,
7700            (block_start_usize + relative_start) as FastSint,
7701            size as FastSint,
7702        );
7703    }
7704
7705    for state in thread_state.iter_mut().take(omp_num_threads) {
7706        for (c, bucket) in induction_bucket.iter_mut().take(k_usize).enumerate() {
7707            let a = *bucket;
7708            let b = state.buckets[c];
7709            *bucket = a + b;
7710            state.buckets[c] = a;
7711        }
7712    }
7713
7714    for state in thread_state.iter_mut().take(omp_num_threads) {
7715        final_bwt_aux_scan_left_to_right_8u_block_place(
7716            sa,
7717            rm,
7718            i_out,
7719            &mut state.buckets,
7720            &state.cache,
7721            state.count,
7722        );
7723    }
7724}
7725
7726/// Internal helper: final sorting scan left to right 8u block (OpenMP variant).
7727#[doc(hidden)]
7728pub fn final_sorting_scan_left_to_right_8u_block_omp(
7729    t: &[u8],
7730    sa: &mut [SaSint],
7731    k: SaSint,
7732    induction_bucket: &mut [SaSint],
7733    block_start: FastSint,
7734    block_size: FastSint,
7735    threads: SaSint,
7736    thread_state: &mut [ThreadState],
7737) {
7738    if block_size <= 0 {
7739        return;
7740    }
7741
7742    let k_usize = usize::try_from(k).expect("k must be non-negative");
7743    let block_size_usize = usize::try_from(block_size).expect("block_size must be non-negative");
7744    let omp_num_threads = if threads > 1 && block_size_usize >= 64 * k_usize.max(256) {
7745        usize::try_from(threads)
7746            .expect("threads must be non-negative")
7747            .min(thread_state.len())
7748            .max(1)
7749    } else {
7750        1
7751    };
7752
7753    if omp_num_threads == 1 {
7754        final_sorting_scan_left_to_right_8u(t, sa, induction_bucket, block_start, block_size);
7755        return;
7756    }
7757
7758    let block_start_usize = usize::try_from(block_start).expect("block_start must be non-negative");
7759    let omp_block_stride = (block_size_usize / omp_num_threads) & !15usize;
7760    for (thread_num, state) in thread_state.iter_mut().take(omp_num_threads).enumerate() {
7761        let relative_start = thread_num * omp_block_stride;
7762        let size = if thread_num + 1 < omp_num_threads {
7763            omp_block_stride
7764        } else {
7765            block_size_usize - relative_start
7766        };
7767        state.count = final_sorting_scan_left_to_right_8u_block_prepare(
7768            t,
7769            sa,
7770            k,
7771            &mut state.buckets,
7772            &mut state.cache,
7773            (block_start_usize + relative_start) as FastSint,
7774            size as FastSint,
7775        );
7776    }
7777
7778    for state in thread_state.iter_mut().take(omp_num_threads) {
7779        for (c, bucket) in induction_bucket.iter_mut().take(k_usize).enumerate() {
7780            let a = *bucket;
7781            let b = state.buckets[c];
7782            *bucket = a + b;
7783            state.buckets[c] = a;
7784        }
7785    }
7786
7787    for state in thread_state.iter_mut().take(omp_num_threads) {
7788        final_order_scan_left_to_right_8u_block_place(
7789            sa,
7790            &mut state.buckets,
7791            &state.cache,
7792            state.count,
7793        );
7794    }
7795}
7796
7797/// Internal helper: final sorting scan left to right 32s block (OpenMP variant).
7798#[doc(hidden)]
7799pub fn final_sorting_scan_left_to_right_32s_block_omp(
7800    t: &[SaSint],
7801    sa: &mut [SaSint],
7802    buckets: &mut [SaSint],
7803    cache: &mut [ThreadCache],
7804    block_start: FastSint,
7805    block_size: FastSint,
7806    threads: SaSint,
7807) {
7808    if threads <= 1 || block_size < 16_384 {
7809        final_sorting_scan_left_to_right_32s(t, sa, buckets, block_start, block_size);
7810        return;
7811    }
7812
7813    final_sorting_scan_left_to_right_32s_block_gather(t, sa, cache, block_start, block_size);
7814    final_sorting_scan_left_to_right_32s_block_sort(t, buckets, cache, block_start, block_size);
7815    let block_size_usize = usize::try_from(block_size).expect("block_size must be non-negative");
7816    let threads_usize = usize::try_from(threads.max(1)).expect("threads must be positive");
7817    let omp_num_threads = threads_usize.min(block_size_usize);
7818    let omp_block_stride = (block_size_usize / omp_num_threads) & !15usize;
7819    for omp_thread_num in 0..omp_num_threads {
7820        let omp_block_start = omp_thread_num * omp_block_stride;
7821        let omp_block_size = if omp_thread_num + 1 < omp_num_threads {
7822            omp_block_stride
7823        } else {
7824            block_size_usize - omp_block_start
7825        };
7826        compact_and_place_cached_suffixes(
7827            sa,
7828            cache,
7829            omp_block_start as FastSint,
7830            omp_block_size as FastSint,
7831        );
7832    }
7833}
7834
7835/// Internal helper: final bwt scan left to right 8u (OpenMP variant).
7836#[doc(hidden)]
7837pub fn final_bwt_scan_left_to_right_8u_omp(
7838    t: &[u8],
7839    sa: &mut [SaSint],
7840    n: FastSint,
7841    k: SaSint,
7842    induction_bucket: &mut [SaSint],
7843    threads: SaSint,
7844    thread_state: &mut [ThreadState],
7845) {
7846    let n_usize = usize::try_from(n).expect("n must be non-negative");
7847    let last = n_usize - 1;
7848    let bucket = t[last] as usize;
7849    let slot = usize::try_from(induction_bucket[bucket]).expect("bucket slot must be non-negative");
7850    sa[slot] =
7851        (n as SaSint - 1) | ((usize::from(t[last - 1] < t[last]) as SaSint) << (SAINT_BIT - 1));
7852    induction_bucket[bucket] += 1;
7853
7854    if threads == 1 || n < 65_536 {
7855        final_bwt_scan_left_to_right_8u(t, sa, induction_bucket, 0, n);
7856        return;
7857    }
7858
7859    let mut block_start = 0usize;
7860    while block_start < n_usize {
7861        if sa[block_start] == 0 {
7862            block_start += 1;
7863        } else {
7864            let threads_usize = usize::try_from(threads)
7865                .expect("threads must be non-negative")
7866                .min(thread_state.len())
7867                .max(1);
7868            let max_span = threads_usize * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * threads_usize);
7869            let block_max_end = (block_start + max_span).min(n_usize);
7870            let mut block_end = block_start + 1;
7871            while block_end < block_max_end && sa[block_end] != 0 {
7872                block_end += 1;
7873            }
7874            let size = block_end - block_start;
7875
7876            if size < 32 {
7877                final_bwt_scan_left_to_right_8u(
7878                    t,
7879                    sa,
7880                    induction_bucket,
7881                    block_start as FastSint,
7882                    size as FastSint,
7883                );
7884            } else {
7885                final_bwt_scan_left_to_right_8u_block_omp(
7886                    t,
7887                    sa,
7888                    k,
7889                    induction_bucket,
7890                    block_start as FastSint,
7891                    size as FastSint,
7892                    threads,
7893                    thread_state,
7894                );
7895            }
7896            block_start = block_end;
7897        }
7898    }
7899}
7900
7901/// Internal helper: final bwt aux scan left to right 8u (OpenMP variant).
7902#[doc(hidden)]
7903pub fn final_bwt_aux_scan_left_to_right_8u_omp(
7904    t: &[u8],
7905    sa: &mut [SaSint],
7906    n: FastSint,
7907    k: SaSint,
7908    rm: SaSint,
7909    i_out: &mut [SaSint],
7910    induction_bucket: &mut [SaSint],
7911    threads: SaSint,
7912    thread_state: &mut [ThreadState],
7913) {
7914    let n_usize = usize::try_from(n).expect("n must be non-negative");
7915    let last = n_usize - 1;
7916    let bucket = t[last] as usize;
7917    let slot = usize::try_from(induction_bucket[bucket]).expect("bucket slot must be non-negative");
7918    sa[slot] =
7919        (n as SaSint - 1) | ((usize::from(t[last - 1] < t[last]) as SaSint) << (SAINT_BIT - 1));
7920    induction_bucket[bucket] += 1;
7921    if (((n as SaSint) - 1) & rm) == 0 {
7922        i_out[last / usize::try_from(rm + 1).expect("rm must allow positive step")] =
7923            induction_bucket[bucket];
7924    }
7925
7926    if threads == 1 || n < 65_536 {
7927        final_bwt_aux_scan_left_to_right_8u(t, sa, rm, i_out, induction_bucket, 0, n);
7928        return;
7929    }
7930
7931    let mut block_start = 0usize;
7932    while block_start < n_usize {
7933        if sa[block_start] == 0 {
7934            block_start += 1;
7935        } else {
7936            let threads_usize = usize::try_from(threads)
7937                .expect("threads must be non-negative")
7938                .min(thread_state.len())
7939                .max(1);
7940            let max_span = threads_usize * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * threads_usize);
7941            let block_max_end = (block_start + max_span).min(n_usize);
7942            let mut block_end = block_start + 1;
7943            while block_end < block_max_end && sa[block_end] != 0 {
7944                block_end += 1;
7945            }
7946            let size = block_end - block_start;
7947
7948            if size < 32 {
7949                final_bwt_aux_scan_left_to_right_8u(
7950                    t,
7951                    sa,
7952                    rm,
7953                    i_out,
7954                    induction_bucket,
7955                    block_start as FastSint,
7956                    size as FastSint,
7957                );
7958            } else {
7959                final_bwt_aux_scan_left_to_right_8u_block_omp(
7960                    t,
7961                    sa,
7962                    k,
7963                    rm,
7964                    i_out,
7965                    induction_bucket,
7966                    block_start as FastSint,
7967                    size as FastSint,
7968                    threads,
7969                    thread_state,
7970                );
7971            }
7972            block_start = block_end;
7973        }
7974    }
7975}
7976
7977/// Internal helper: final sorting scan left to right 8u (OpenMP variant).
7978#[doc(hidden)]
7979pub fn final_sorting_scan_left_to_right_8u_omp(
7980    t: &[u8],
7981    sa: &mut [SaSint],
7982    n: FastSint,
7983    k: SaSint,
7984    induction_bucket: &mut [SaSint],
7985    threads: SaSint,
7986    thread_state: &mut [ThreadState],
7987) {
7988    let n_usize = usize::try_from(n).expect("n must be non-negative");
7989    let last = n_usize - 1;
7990    let bucket = t[last] as usize;
7991    let slot = usize::try_from(induction_bucket[bucket]).expect("bucket slot must be non-negative");
7992    sa[slot] =
7993        (n as SaSint - 1) | ((usize::from(t[last - 1] < t[last]) as SaSint) << (SAINT_BIT - 1));
7994    induction_bucket[bucket] += 1;
7995
7996    if threads == 1 || n < 65_536 {
7997        final_sorting_scan_left_to_right_8u(t, sa, induction_bucket, 0, n);
7998        return;
7999    }
8000
8001    let mut block_start = 0usize;
8002    while block_start < n_usize {
8003        if sa[block_start] == 0 {
8004            block_start += 1;
8005        } else {
8006            let threads_usize = usize::try_from(threads)
8007                .expect("threads must be non-negative")
8008                .min(thread_state.len())
8009                .max(1);
8010            let max_span = threads_usize * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * threads_usize);
8011            let block_max_end = (block_start + max_span).min(n_usize);
8012            let mut block_end = block_start + 1;
8013            while block_end < block_max_end && sa[block_end] != 0 {
8014                block_end += 1;
8015            }
8016            let size = block_end - block_start;
8017
8018            if size < 32 {
8019                final_sorting_scan_left_to_right_8u(
8020                    t,
8021                    sa,
8022                    induction_bucket,
8023                    block_start as FastSint,
8024                    size as FastSint,
8025                );
8026            } else {
8027                final_sorting_scan_left_to_right_8u_block_omp(
8028                    t,
8029                    sa,
8030                    k,
8031                    induction_bucket,
8032                    block_start as FastSint,
8033                    size as FastSint,
8034                    threads,
8035                    thread_state,
8036                );
8037            }
8038            block_start = block_end;
8039        }
8040    }
8041}
8042
8043/// Internal helper: final sorting scan left to right 32s (OpenMP variant).
8044#[doc(hidden)]
8045pub fn final_sorting_scan_left_to_right_32s_omp(
8046    t: &[SaSint],
8047    sa: &mut [SaSint],
8048    n: SaSint,
8049    induction_bucket: &mut [SaSint],
8050    threads: SaSint,
8051    thread_state: &mut [ThreadState],
8052) {
8053    let n_usize = usize::try_from(n).expect("n must be non-negative");
8054    let last = n_usize - 1;
8055    let bucket = usize::try_from(t[last]).expect("bucket symbol must be non-negative");
8056    let slot = usize::try_from(induction_bucket[bucket]).expect("bucket slot must be non-negative");
8057    sa[slot] = (n - 1) | ((usize::from(t[last - 1] < t[last]) as SaSint) << (SAINT_BIT - 1));
8058    induction_bucket[bucket] += 1;
8059
8060    if threads == 1 || n < 65_536 {
8061        final_sorting_scan_left_to_right_32s(t, sa, induction_bucket, 0, n as FastSint);
8062        return;
8063    }
8064
8065    if thread_state.is_empty() {
8066        final_sorting_scan_left_to_right_32s(t, sa, induction_bucket, 0, n as FastSint);
8067        return;
8068    }
8069
8070    let threads_usize = usize::try_from(threads)
8071        .expect("threads must be non-negative")
8072        .max(1);
8073    let mut cache = vec![ThreadCache::default(); threads_usize * LIBSAIS_PER_THREAD_CACHE_SIZE];
8074    let mut block_start = 0usize;
8075    while block_start < n_usize {
8076        let block_end = (block_start + threads_usize * LIBSAIS_PER_THREAD_CACHE_SIZE).min(n_usize);
8077        final_sorting_scan_left_to_right_32s_block_omp(
8078            t,
8079            sa,
8080            induction_bucket,
8081            &mut cache,
8082            block_start as FastSint,
8083            (block_end - block_start) as FastSint,
8084            threads,
8085        );
8086        block_start = block_end;
8087    }
8088}
8089
8090/// Internal helper: final bwt scan right to left 8u.
8091#[doc(hidden)]
8092pub fn final_bwt_scan_right_to_left_8u(
8093    t: &[u8],
8094    sa: &mut [SaSint],
8095    induction_bucket: &mut [SaSint],
8096    omp_block_start: FastSint,
8097    omp_block_size: FastSint,
8098) -> SaSint {
8099    if omp_block_size <= 0 {
8100        return -1;
8101    }
8102
8103    let mut index = -1;
8104
8105    let start =
8106        usize::try_from(omp_block_start).expect("omp_block_start must be non-negative") as FastSint;
8107    let mut i = omp_block_start + omp_block_size - 1;
8108    let mut j = start + 1;
8109    while i >= j {
8110        let i0 = usize::try_from(i).expect("loop index must be non-negative");
8111        let i1 = usize::try_from(i - 1).expect("loop index must be non-negative");
8112
8113        let mut p0 = sa[i0];
8114        if p0 == 0 {
8115            index = i0 as SaSint;
8116        }
8117        sa[i0] = p0 & SAINT_MAX;
8118        if p0 > 0 {
8119            p0 -= 1;
8120            let p0_usize = usize::try_from(p0).expect("suffix index must be non-negative");
8121            let c0 = t[p0_usize - usize::from(p0 > 0)] as SaSint;
8122            let c1 = t[p0_usize] as SaSint;
8123            sa[i0] = c1;
8124            induction_bucket[c1 as usize] -= 1;
8125            let slot = usize::try_from(induction_bucket[c1 as usize])
8126                .expect("bucket slot must be non-negative");
8127            let marked = c0 | SAINT_MIN;
8128            sa[slot] = if c0 <= c1 { p0 } else { marked };
8129        }
8130
8131        let mut p1 = sa[i1];
8132        if p1 == 0 {
8133            index = i1 as SaSint;
8134        }
8135        sa[i1] = p1 & SAINT_MAX;
8136        if p1 > 0 {
8137            p1 -= 1;
8138            let p1_usize = usize::try_from(p1).expect("suffix index must be non-negative");
8139            let c0 = t[p1_usize - usize::from(p1 > 0)] as SaSint;
8140            let c1 = t[p1_usize] as SaSint;
8141            sa[i1] = c1;
8142            induction_bucket[c1 as usize] -= 1;
8143            let slot = usize::try_from(induction_bucket[c1 as usize])
8144                .expect("bucket slot must be non-negative");
8145            let marked = c0 | SAINT_MIN;
8146            sa[slot] = if c0 <= c1 { p1 } else { marked };
8147        }
8148
8149        i -= 2;
8150    }
8151
8152    j -= 1;
8153    while i >= j {
8154        let idx = usize::try_from(i).expect("loop index must be non-negative");
8155        let mut p = sa[idx];
8156        if p == 0 {
8157            index = idx as SaSint;
8158        }
8159        sa[idx] = p & SAINT_MAX;
8160        if p > 0 {
8161            p -= 1;
8162            let p_usize = usize::try_from(p).expect("suffix index must be non-negative");
8163            let c0 = t[p_usize - usize::from(p > 0)] as SaSint;
8164            let c1 = t[p_usize] as SaSint;
8165            sa[idx] = c1;
8166            induction_bucket[c1 as usize] -= 1;
8167            let slot = usize::try_from(induction_bucket[c1 as usize])
8168                .expect("bucket slot must be non-negative");
8169            let marked = c0 | SAINT_MIN;
8170            sa[slot] = if c0 <= c1 { p } else { marked };
8171        }
8172
8173        i -= 1;
8174    }
8175
8176    index
8177}
8178
8179/// Internal helper: final bwt aux scan right to left 8u.
8180#[doc(hidden)]
8181pub fn final_bwt_aux_scan_right_to_left_8u(
8182    t: &[u8],
8183    sa: &mut [SaSint],
8184    rm: SaSint,
8185    i_out: &mut [SaSint],
8186    induction_bucket: &mut [SaSint],
8187    omp_block_start: FastSint,
8188    omp_block_size: FastSint,
8189) {
8190    if omp_block_size <= 0 {
8191        return;
8192    }
8193
8194    let start =
8195        usize::try_from(omp_block_start).expect("omp_block_start must be non-negative") as FastSint;
8196    let mut i = omp_block_start + omp_block_size - 1;
8197    let mut j = start + 1;
8198    while i >= j {
8199        let i0 = usize::try_from(i).expect("loop index must be non-negative");
8200        let i1 = usize::try_from(i - 1).expect("loop index must be non-negative");
8201
8202        let mut p0 = sa[i0];
8203        sa[i0] = p0 & SAINT_MAX;
8204        if p0 > 0 {
8205            p0 -= 1;
8206            let p0_usize = usize::try_from(p0).expect("suffix index must be non-negative");
8207            let c0 = t[p0_usize - usize::from(p0 > 0)] as SaSint;
8208            let c1 = t[p0_usize] as SaSint;
8209            sa[i0] = c1;
8210            induction_bucket[c1 as usize] -= 1;
8211            let slot = usize::try_from(induction_bucket[c1 as usize])
8212                .expect("bucket slot must be non-negative");
8213            let marked = c0 | SAINT_MIN;
8214            sa[slot] = if c0 <= c1 { p0 } else { marked };
8215            if (p0 & rm) == 0 {
8216                let out_idx =
8217                    usize::try_from(p0 / (rm + 1)).expect("sample index must be non-negative");
8218                i_out[out_idx] = induction_bucket[t[p0_usize] as usize] + 1;
8219            }
8220        }
8221
8222        let mut p1 = sa[i1];
8223        sa[i1] = p1 & SAINT_MAX;
8224        if p1 > 0 {
8225            p1 -= 1;
8226            let p1_usize = usize::try_from(p1).expect("suffix index must be non-negative");
8227            let c0 = t[p1_usize - usize::from(p1 > 0)] as SaSint;
8228            let c1 = t[p1_usize] as SaSint;
8229            sa[i1] = c1;
8230            induction_bucket[c1 as usize] -= 1;
8231            let slot = usize::try_from(induction_bucket[c1 as usize])
8232                .expect("bucket slot must be non-negative");
8233            let marked = c0 | SAINT_MIN;
8234            sa[slot] = if c0 <= c1 { p1 } else { marked };
8235            if (p1 & rm) == 0 {
8236                let out_idx =
8237                    usize::try_from(p1 / (rm + 1)).expect("sample index must be non-negative");
8238                i_out[out_idx] = induction_bucket[t[p1_usize] as usize] + 1;
8239            }
8240        }
8241
8242        i -= 2;
8243    }
8244
8245    j -= 1;
8246    while i >= j {
8247        let idx = usize::try_from(i).expect("loop index must be non-negative");
8248        let mut p = sa[idx];
8249        sa[idx] = p & SAINT_MAX;
8250        if p > 0 {
8251            p -= 1;
8252            let p_usize = usize::try_from(p).expect("suffix index must be non-negative");
8253            let c0 = t[p_usize - usize::from(p > 0)] as SaSint;
8254            let c1 = t[p_usize] as SaSint;
8255            sa[idx] = c1;
8256            induction_bucket[c1 as usize] -= 1;
8257            let slot = usize::try_from(induction_bucket[c1 as usize])
8258                .expect("bucket slot must be non-negative");
8259            let marked = c0 | SAINT_MIN;
8260            sa[slot] = if c0 <= c1 { p } else { marked };
8261            if (p & rm) == 0 {
8262                let out_idx =
8263                    usize::try_from(p / (rm + 1)).expect("sample index must be non-negative");
8264                i_out[out_idx] = induction_bucket[t[p_usize] as usize] + 1;
8265            }
8266        }
8267
8268        i -= 1;
8269    }
8270}
8271
8272/// Internal helper: final sorting scan right to left 8u.
8273#[doc(hidden)]
8274pub fn final_sorting_scan_right_to_left_8u(
8275    t: &[u8],
8276    sa: &mut [SaSint],
8277    induction_bucket: &mut [SaSint],
8278    omp_block_start: FastSint,
8279    omp_block_size: FastSint,
8280) {
8281    if omp_block_size <= 0 {
8282        return;
8283    }
8284
8285    let prefetch_distance = 64usize;
8286    let start = usize::try_from(omp_block_start).expect("omp_block_start must be non-negative");
8287    let size = usize::try_from(omp_block_size).expect("omp_block_size must be non-negative");
8288    let mut i = start + size - 1;
8289    let mut j = start + prefetch_distance + 1;
8290
8291    while i >= j {
8292        let mut p0 = sa[i];
8293        sa[i] = p0 & SAINT_MAX;
8294        if p0 > 0 {
8295            p0 -= 1;
8296            let p0_usize = p0 as usize;
8297            let bucket0 = t[p0_usize] as usize;
8298            induction_bucket[bucket0] -= 1;
8299            let slot0 = induction_bucket[bucket0] as usize;
8300            sa[slot0] = p0
8301                | ((usize::from(t[p0_usize - usize::from(p0 > 0)] > t[p0_usize]) as SaSint)
8302                    << (SAINT_BIT - 1));
8303        }
8304
8305        let mut p1 = sa[i - 1];
8306        sa[i - 1] = p1 & SAINT_MAX;
8307        if p1 > 0 {
8308            p1 -= 1;
8309            let p1_usize = p1 as usize;
8310            let bucket1 = t[p1_usize] as usize;
8311            induction_bucket[bucket1] -= 1;
8312            let slot1 = induction_bucket[bucket1] as usize;
8313            sa[slot1] = p1
8314                | ((usize::from(t[p1_usize - usize::from(p1 > 0)] > t[p1_usize]) as SaSint)
8315                    << (SAINT_BIT - 1));
8316        }
8317
8318        i -= 2;
8319    }
8320
8321    j -= prefetch_distance + 1;
8322    while i >= j {
8323        let mut p = sa[i];
8324        sa[i] = p & SAINT_MAX;
8325        if p > 0 {
8326            p -= 1;
8327            let p_usize = p as usize;
8328            let bucket = t[p_usize] as usize;
8329            induction_bucket[bucket] -= 1;
8330            let slot = induction_bucket[bucket] as usize;
8331            sa[slot] = p
8332                | ((usize::from(t[p_usize - usize::from(p > 0)] > t[p_usize]) as SaSint)
8333                    << (SAINT_BIT - 1));
8334        }
8335
8336        if i == 0 {
8337            break;
8338        }
8339        i -= 1;
8340    }
8341}
8342
8343/// Internal helper: final gsa scan right to left 8u.
8344#[doc(hidden)]
8345pub fn final_gsa_scan_right_to_left_8u(
8346    t: &[u8],
8347    sa: &mut [SaSint],
8348    induction_bucket: &mut [SaSint],
8349    omp_block_start: FastSint,
8350    omp_block_size: FastSint,
8351) {
8352    if omp_block_size <= 0 {
8353        return;
8354    }
8355
8356    let start = usize::try_from(omp_block_start).expect("omp_block_start must be non-negative");
8357    let size = usize::try_from(omp_block_size).expect("omp_block_size must be non-negative");
8358    let mut i = start + size;
8359    while i > start {
8360        i -= 1;
8361        let mut p = sa[i];
8362        sa[i] = p & SAINT_MAX;
8363        if p > 0 {
8364            let p_usize = usize::try_from(p).expect("suffix index must be non-negative");
8365            if t[p_usize - 1] > 0 {
8366                p -= 1;
8367                let bucket =
8368                    t[usize::try_from(p).expect("suffix index must be non-negative")] as usize;
8369                induction_bucket[bucket] -= 1;
8370                let slot = usize::try_from(induction_bucket[bucket])
8371                    .expect("bucket slot must be non-negative");
8372                let p_usize = usize::try_from(p).expect("suffix index must be non-negative");
8373                sa[slot] = p
8374                    | ((usize::from(t[p_usize - usize::from(p > 0)] > t[p_usize]) as SaSint)
8375                        << (SAINT_BIT - 1));
8376            }
8377        }
8378    }
8379}
8380
8381/// Internal helper: final sorting scan right to left 32s.
8382#[doc(hidden)]
8383pub fn final_sorting_scan_right_to_left_32s(
8384    t: &[SaSint],
8385    sa: &mut [SaSint],
8386    induction_bucket: &mut [SaSint],
8387    omp_block_start: FastSint,
8388    omp_block_size: FastSint,
8389) {
8390    if omp_block_size <= 0 {
8391        return;
8392    }
8393
8394    let prefetch_distance: FastSint = 64;
8395    let mut i = omp_block_start + omp_block_size - 1;
8396    let mut j = omp_block_start + 2 * prefetch_distance + 1;
8397
8398    while i >= j {
8399        let i0 = i as usize;
8400        let mut p0 = sa[i0];
8401        sa[i0] = p0 & SAINT_MAX;
8402        if p0 > 0 {
8403            p0 -= 1;
8404            let p0u = p0 as usize;
8405            let bucket0 = t[p0u] as usize;
8406            induction_bucket[bucket0] -= 1;
8407            let slot0 = induction_bucket[bucket0] as usize;
8408            sa[slot0] = p0
8409                | ((usize::from(t[p0u - usize::from(p0 > 0)] > t[p0u]) as SaSint)
8410                    << (SAINT_BIT - 1));
8411        }
8412
8413        let i1 = (i - 1) as usize;
8414        let mut p1 = sa[i1];
8415        sa[i1] = p1 & SAINT_MAX;
8416        if p1 > 0 {
8417            p1 -= 1;
8418            let p1u = p1 as usize;
8419            let bucket1 = t[p1u] as usize;
8420            induction_bucket[bucket1] -= 1;
8421            let slot1 = induction_bucket[bucket1] as usize;
8422            sa[slot1] = p1
8423                | ((usize::from(t[p1u - usize::from(p1 > 0)] > t[p1u]) as SaSint)
8424                    << (SAINT_BIT - 1));
8425        }
8426        i -= 2;
8427    }
8428
8429    j -= 2 * prefetch_distance + 1;
8430    while i >= j {
8431        let iu = i as usize;
8432        let mut p = sa[iu];
8433        sa[iu] = p & SAINT_MAX;
8434        if p > 0 {
8435            p -= 1;
8436            let pu = p as usize;
8437            let bucket = t[pu] as usize;
8438            induction_bucket[bucket] -= 1;
8439            let slot = induction_bucket[bucket] as usize;
8440            sa[slot] = p
8441                | ((usize::from(t[pu - usize::from(p > 0)] > t[pu]) as SaSint) << (SAINT_BIT - 1));
8442        }
8443        i -= 1;
8444    }
8445}
8446
8447/// Internal helper: final bwt scan right to left 8u block prepare.
8448#[doc(hidden)]
8449pub fn final_bwt_scan_right_to_left_8u_block_prepare(
8450    t: &[u8],
8451    sa: &mut [SaSint],
8452    k: SaSint,
8453    buckets: &mut [SaSint],
8454    cache: &mut [ThreadCache],
8455    omp_block_start: FastSint,
8456    omp_block_size: FastSint,
8457) -> FastSint {
8458    if omp_block_size <= 0 {
8459        return 0;
8460    }
8461    let k_usize = usize::try_from(k).expect("k must be non-negative");
8462    buckets[..k_usize].fill(0);
8463    let start = usize::try_from(omp_block_start).expect("omp_block_start must be non-negative");
8464    let size = usize::try_from(omp_block_size).expect("omp_block_size must be non-negative");
8465    let mut count = 0usize;
8466    let mut i = start + size;
8467    while i > start {
8468        i -= 1;
8469        let mut p = sa[i];
8470        sa[i] = p & SAINT_MAX;
8471        if p > 0 {
8472            p -= 1;
8473            let p_usize = usize::try_from(p).expect("suffix index must be non-negative");
8474            let c0 = t[p_usize - usize::from(p > 0)] as SaSint;
8475            let c1 = t[p_usize] as SaSint;
8476            sa[i] = c1;
8477            buckets[c1 as usize] += 1;
8478            cache[count].symbol = c1;
8479            cache[count].index = if c0 <= c1 { p } else { c0 | SAINT_MIN };
8480            count += 1;
8481        }
8482    }
8483    count as FastSint
8484}
8485
8486/// Internal helper: final bwt aux scan right to left 8u block prepare.
8487#[doc(hidden)]
8488pub fn final_bwt_aux_scan_right_to_left_8u_block_prepare(
8489    t: &[u8],
8490    sa: &mut [SaSint],
8491    k: SaSint,
8492    buckets: &mut [SaSint],
8493    cache: &mut [ThreadCache],
8494    omp_block_start: FastSint,
8495    omp_block_size: FastSint,
8496) -> FastSint {
8497    if omp_block_size <= 0 {
8498        return 0;
8499    }
8500    let k_usize = usize::try_from(k).expect("k must be non-negative");
8501    buckets[..k_usize].fill(0);
8502    let start = usize::try_from(omp_block_start).expect("omp_block_start must be non-negative");
8503    let size = usize::try_from(omp_block_size).expect("omp_block_size must be non-negative");
8504    let mut count = 0usize;
8505    let mut i = start + size;
8506    while i > start {
8507        i -= 1;
8508        let mut p = sa[i];
8509        sa[i] = p & SAINT_MAX;
8510        if p > 0 {
8511            p -= 1;
8512            let p_usize = usize::try_from(p).expect("suffix index must be non-negative");
8513            let c0 = t[p_usize - usize::from(p > 0)] as SaSint;
8514            let c1 = t[p_usize] as SaSint;
8515            sa[i] = c1;
8516            buckets[c1 as usize] += 1;
8517            cache[count].symbol = c1;
8518            cache[count].index = if c0 <= c1 { p } else { c0 | SAINT_MIN };
8519            cache[count + 1].index = p;
8520            count += 2;
8521        }
8522    }
8523    count as FastSint
8524}
8525
8526/// Internal helper: final sorting scan right to left 8u block prepare.
8527#[doc(hidden)]
8528pub fn final_sorting_scan_right_to_left_8u_block_prepare(
8529    t: &[u8],
8530    sa: &mut [SaSint],
8531    k: SaSint,
8532    buckets: &mut [SaSint],
8533    cache: &mut [ThreadCache],
8534    omp_block_start: FastSint,
8535    omp_block_size: FastSint,
8536) -> FastSint {
8537    if omp_block_size <= 0 {
8538        return 0;
8539    }
8540
8541    let k_usize = usize::try_from(k).expect("k must be non-negative");
8542    buckets[..k_usize].fill(0);
8543
8544    let start =
8545        usize::try_from(omp_block_start).expect("omp_block_start must be non-negative") as FastSint;
8546    let mut i = omp_block_start + omp_block_size - 1;
8547    let mut j = start + 1;
8548    let mut count = 0usize;
8549
8550    while i >= j {
8551        let i0 = usize::try_from(i).expect("loop index must be non-negative");
8552        let i1 = usize::try_from(i - 1).expect("loop index must be non-negative");
8553
8554        let mut p0 = sa[i0];
8555        sa[i0] = p0 & SAINT_MAX;
8556        if p0 > 0 {
8557            p0 -= 1;
8558            let p0_usize = usize::try_from(p0).expect("suffix index must be non-negative");
8559            let c0 = t[p0_usize] as SaSint;
8560            buckets[c0 as usize] += 1;
8561            cache[count].symbol = c0;
8562            cache[count].index = p0
8563                | ((usize::from(t[p0_usize - usize::from(p0 > 0)] > t[p0_usize]) as SaSint)
8564                    << (SAINT_BIT - 1));
8565            count += 1;
8566        }
8567
8568        let mut p1 = sa[i1];
8569        sa[i1] = p1 & SAINT_MAX;
8570        if p1 > 0 {
8571            p1 -= 1;
8572            let p1_usize = usize::try_from(p1).expect("suffix index must be non-negative");
8573            let c1 = t[p1_usize] as SaSint;
8574            buckets[c1 as usize] += 1;
8575            cache[count].symbol = c1;
8576            cache[count].index = p1
8577                | ((usize::from(t[p1_usize - usize::from(p1 > 0)] > t[p1_usize]) as SaSint)
8578                    << (SAINT_BIT - 1));
8579            count += 1;
8580        }
8581
8582        i -= 2;
8583    }
8584
8585    j -= 1;
8586    while i >= j {
8587        let idx = usize::try_from(i).expect("loop index must be non-negative");
8588        let mut p = sa[idx];
8589        sa[idx] = p & SAINT_MAX;
8590        if p > 0 {
8591            p -= 1;
8592            let p_usize = usize::try_from(p).expect("suffix index must be non-negative");
8593            let c = t[p_usize] as SaSint;
8594            buckets[c as usize] += 1;
8595            cache[count].symbol = c;
8596            cache[count].index = p
8597                | ((usize::from(t[p_usize - usize::from(p > 0)] > t[p_usize]) as SaSint)
8598                    << (SAINT_BIT - 1));
8599            count += 1;
8600        }
8601
8602        i -= 1;
8603    }
8604
8605    count as FastSint
8606}
8607
8608/// Internal helper: final order scan right to left 8u block place.
8609#[doc(hidden)]
8610pub fn final_order_scan_right_to_left_8u_block_place(
8611    sa: &mut [SaSint],
8612    buckets: &mut [SaSint],
8613    cache: &[ThreadCache],
8614    count: FastSint,
8615) {
8616    if count <= 0 {
8617        return;
8618    }
8619    let count_usize = usize::try_from(count).expect("count must be non-negative");
8620    for entry in &cache[..count_usize] {
8621        let symbol = usize::try_from(entry.symbol).expect("cache symbol must be non-negative");
8622        buckets[symbol] -= 1;
8623        let slot = usize::try_from(buckets[symbol]).expect("bucket slot must be non-negative");
8624        sa[slot] = entry.index;
8625    }
8626}
8627
8628/// Internal helper: final gsa scan right to left 8u block place.
8629#[doc(hidden)]
8630pub fn final_gsa_scan_right_to_left_8u_block_place(
8631    sa: &mut [SaSint],
8632    buckets: &mut [SaSint],
8633    cache: &[ThreadCache],
8634    count: FastSint,
8635) {
8636    if count <= 0 {
8637        return;
8638    }
8639    let count_usize = usize::try_from(count).expect("count must be non-negative");
8640    for entry in &cache[..count_usize] {
8641        if entry.symbol > 0 {
8642            let symbol = usize::try_from(entry.symbol).expect("cache symbol must be non-negative");
8643            buckets[symbol] -= 1;
8644            let slot = usize::try_from(buckets[symbol]).expect("bucket slot must be non-negative");
8645            sa[slot] = entry.index;
8646        }
8647    }
8648}
8649
8650/// Internal helper: final bwt aux scan right to left 8u block place.
8651#[doc(hidden)]
8652pub fn final_bwt_aux_scan_right_to_left_8u_block_place(
8653    sa: &mut [SaSint],
8654    rm: SaSint,
8655    i_out: &mut [SaSint],
8656    buckets: &mut [SaSint],
8657    cache: &[ThreadCache],
8658    count: FastSint,
8659) {
8660    if count <= 0 {
8661        return;
8662    }
8663    let count_usize = usize::try_from(count).expect("count must be non-negative");
8664    let mut i = 0usize;
8665    while i < count_usize {
8666        let symbol = usize::try_from(cache[i].symbol).expect("cache symbol must be non-negative");
8667        buckets[symbol] -= 1;
8668        let slot = usize::try_from(buckets[symbol]).expect("bucket slot must be non-negative");
8669        sa[slot] = cache[i].index;
8670        if (cache[i + 1].index & rm) == 0 {
8671            let sample_index = usize::try_from((cache[i + 1].index & SAINT_MAX) / (rm + 1))
8672                .expect("sample index must be non-negative");
8673            i_out[sample_index] = buckets[symbol] + 1;
8674        }
8675        i += 2;
8676    }
8677}
8678
8679/// Internal helper: final sorting scan right to left 32s block gather.
8680#[doc(hidden)]
8681pub fn final_sorting_scan_right_to_left_32s_block_gather(
8682    t: &[SaSint],
8683    sa: &mut [SaSint],
8684    cache: &mut [ThreadCache],
8685    omp_block_start: FastSint,
8686    omp_block_size: FastSint,
8687) {
8688    if omp_block_size <= 0 {
8689        return;
8690    }
8691    let prefetch_distance = 64usize;
8692    let start = omp_block_start as usize;
8693    let block_end = start + omp_block_size as usize;
8694    let mut i = start;
8695    let mut j = block_end.saturating_sub(prefetch_distance + 1);
8696
8697    while i < j {
8698        let ci = i - start;
8699        let mut symbol0 = SAINT_MIN;
8700        let mut p0 = sa[i];
8701        sa[i] = p0 & SAINT_MAX;
8702        if p0 > 0 {
8703            p0 -= 1;
8704            let p0_usize = p0 as usize;
8705            cache[ci].index = p0
8706                | ((usize::from(t[p0_usize - usize::from(p0 > 0)] > t[p0_usize]) as SaSint)
8707                    << (SAINT_BIT - 1));
8708            symbol0 = t[p0_usize];
8709        }
8710        cache[ci].symbol = symbol0;
8711
8712        let i1 = i + 1;
8713        let ci1 = i1 - start;
8714        let mut symbol1 = SAINT_MIN;
8715        let mut p1 = sa[i1];
8716        sa[i1] = p1 & SAINT_MAX;
8717        if p1 > 0 {
8718            p1 -= 1;
8719            let p1_usize = p1 as usize;
8720            cache[ci1].index = p1
8721                | ((usize::from(t[p1_usize - usize::from(p1 > 0)] > t[p1_usize]) as SaSint)
8722                    << (SAINT_BIT - 1));
8723            symbol1 = t[p1_usize];
8724        }
8725        cache[ci1].symbol = symbol1;
8726
8727        i += 2;
8728    }
8729
8730    j = block_end;
8731    while i < j {
8732        let ci = i - start;
8733        let mut symbol = SAINT_MIN;
8734        let mut p = sa[i];
8735        sa[i] = p & SAINT_MAX;
8736        if p > 0 {
8737            p -= 1;
8738            let p_usize = p as usize;
8739            cache[ci].index = p
8740                | ((usize::from(t[p_usize - usize::from(p > 0)] > t[p_usize]) as SaSint)
8741                    << (SAINT_BIT - 1));
8742            symbol = t[p_usize];
8743        }
8744        cache[ci].symbol = symbol;
8745        i += 1;
8746    }
8747}
8748
8749/// Internal helper: final sorting scan right to left 32s block sort.
8750#[doc(hidden)]
8751pub fn final_sorting_scan_right_to_left_32s_block_sort(
8752    t: &[SaSint],
8753    induction_bucket: &mut [SaSint],
8754    cache: &mut [ThreadCache],
8755    omp_block_start: FastSint,
8756    omp_block_size: FastSint,
8757) {
8758    if omp_block_size <= 0 {
8759        return;
8760    }
8761    let prefetch_distance = 64usize;
8762    let start = omp_block_start as usize;
8763    let mut i = start + omp_block_size as usize - 1;
8764    let mut j = start + prefetch_distance + 1;
8765
8766    while i >= j {
8767        let ci = i - start;
8768        let v0 = cache[ci].symbol;
8769        if v0 >= 0 {
8770            let bucket_index0 = v0 as usize;
8771            induction_bucket[bucket_index0] -= 1;
8772            cache[ci].symbol = induction_bucket[bucket_index0];
8773            if cache[ci].symbol >= omp_block_start as SaSint {
8774                let ni = cache[ci].symbol as usize;
8775                let cni = ni - start;
8776                let mut np = cache[ci].index;
8777                cache[ci].index = np & SAINT_MAX;
8778                if np > 0 {
8779                    np -= 1;
8780                    let np_usize = np as usize;
8781                    cache[cni].index = np
8782                        | ((usize::from(t[np_usize - usize::from(np > 0)] > t[np_usize])
8783                            as SaSint)
8784                            << (SAINT_BIT - 1));
8785                    cache[cni].symbol = t[np_usize];
8786                }
8787            }
8788        }
8789
8790        let i1 = i - 1;
8791        let ci1 = i1 - start;
8792        let v1 = cache[ci1].symbol;
8793        if v1 >= 0 {
8794            let bucket_index1 = v1 as usize;
8795            induction_bucket[bucket_index1] -= 1;
8796            cache[ci1].symbol = induction_bucket[bucket_index1];
8797            if cache[ci1].symbol >= omp_block_start as SaSint {
8798                let ni = cache[ci1].symbol as usize;
8799                let cni = ni - start;
8800                let mut np = cache[ci1].index;
8801                cache[ci1].index = np & SAINT_MAX;
8802                if np > 0 {
8803                    np -= 1;
8804                    let np_usize = np as usize;
8805                    cache[cni].index = np
8806                        | ((usize::from(t[np_usize - usize::from(np > 0)] > t[np_usize])
8807                            as SaSint)
8808                            << (SAINT_BIT - 1));
8809                    cache[cni].symbol = t[np_usize];
8810                }
8811            }
8812        }
8813
8814        i -= 2;
8815    }
8816
8817    j -= prefetch_distance + 1;
8818    while i >= j {
8819        let ci = i - start;
8820        let v = cache[ci].symbol;
8821        if v >= 0 {
8822            let bucket_index = v as usize;
8823            induction_bucket[bucket_index] -= 1;
8824            cache[ci].symbol = induction_bucket[bucket_index];
8825            if cache[ci].symbol >= omp_block_start as SaSint {
8826                let ni = cache[ci].symbol as usize;
8827                let cni = ni - start;
8828                let mut np = cache[ci].index;
8829                cache[ci].index = np & SAINT_MAX;
8830                if np > 0 {
8831                    np -= 1;
8832                    let np_usize = np as usize;
8833                    cache[cni].index = np
8834                        | ((usize::from(t[np_usize - usize::from(np > 0)] > t[np_usize])
8835                            as SaSint)
8836                            << (SAINT_BIT - 1));
8837                    cache[cni].symbol = t[np_usize];
8838                }
8839            }
8840        }
8841
8842        if i == 0 {
8843            break;
8844        }
8845        i -= 1;
8846    }
8847}
8848
8849/// Internal helper: final bwt scan right to left 8u block (OpenMP variant).
8850#[doc(hidden)]
8851pub fn final_bwt_scan_right_to_left_8u_block_omp(
8852    t: &[u8],
8853    sa: &mut [SaSint],
8854    k: SaSint,
8855    induction_bucket: &mut [SaSint],
8856    block_start: FastSint,
8857    block_size: FastSint,
8858    threads: SaSint,
8859    thread_state: &mut [ThreadState],
8860) {
8861    if block_size <= 0 {
8862        return;
8863    }
8864    let k_usize = usize::try_from(k).expect("k must be non-negative");
8865    let block_size_usize = usize::try_from(block_size).expect("block_size must be non-negative");
8866    let threads_usize = usize::try_from(threads.max(1)).expect("threads must be positive");
8867    let omp_num_threads = threads_usize.min(thread_state.len()).min(block_size_usize);
8868    if omp_num_threads <= 1 || block_size < 64 * k.max(256) as FastSint {
8869        let _ = final_bwt_scan_right_to_left_8u(t, sa, induction_bucket, block_start, block_size);
8870        return;
8871    }
8872
8873    let omp_block_stride = (block_size_usize / omp_num_threads) & !15usize;
8874    for (omp_thread_num, state) in thread_state.iter_mut().take(omp_num_threads).enumerate() {
8875        let omp_block_start = omp_thread_num * omp_block_stride;
8876        let omp_block_size = if omp_thread_num + 1 < omp_num_threads {
8877            omp_block_stride
8878        } else {
8879            block_size_usize - omp_block_start
8880        };
8881        state.count = final_bwt_scan_right_to_left_8u_block_prepare(
8882            t,
8883            sa,
8884            k,
8885            &mut state.buckets,
8886            &mut state.cache,
8887            block_start + omp_block_start as FastSint,
8888            omp_block_size as FastSint,
8889        );
8890    }
8891    for state in thread_state.iter_mut().take(omp_num_threads).rev() {
8892        for c in 0..k_usize {
8893            let a = induction_bucket[c];
8894            let b = state.buckets[c];
8895            induction_bucket[c] = a - b;
8896            state.buckets[c] = a;
8897        }
8898    }
8899    for state in thread_state.iter_mut().take(omp_num_threads) {
8900        final_order_scan_right_to_left_8u_block_place(
8901            sa,
8902            &mut state.buckets,
8903            &state.cache,
8904            state.count,
8905        );
8906    }
8907}
8908
8909/// Internal helper: final bwt aux scan right to left 8u block (OpenMP variant).
8910#[doc(hidden)]
8911pub fn final_bwt_aux_scan_right_to_left_8u_block_omp(
8912    t: &[u8],
8913    sa: &mut [SaSint],
8914    k: SaSint,
8915    rm: SaSint,
8916    i_out: &mut [SaSint],
8917    induction_bucket: &mut [SaSint],
8918    block_start: FastSint,
8919    block_size: FastSint,
8920    threads: SaSint,
8921    thread_state: &mut [ThreadState],
8922) {
8923    if block_size <= 0 {
8924        return;
8925    }
8926    let k_usize = usize::try_from(k).expect("k must be non-negative");
8927    let block_size_usize = usize::try_from(block_size).expect("block_size must be non-negative");
8928    let threads_usize = usize::try_from(threads.max(1)).expect("threads must be positive");
8929    let omp_num_threads = threads_usize.min(thread_state.len()).min(block_size_usize);
8930    if omp_num_threads <= 1 || block_size < 64 * k.max(256) as FastSint {
8931        final_bwt_aux_scan_right_to_left_8u(
8932            t,
8933            sa,
8934            rm,
8935            i_out,
8936            induction_bucket,
8937            block_start,
8938            block_size,
8939        );
8940        return;
8941    }
8942
8943    let omp_block_stride = (block_size_usize / omp_num_threads) & !15usize;
8944    for (omp_thread_num, state) in thread_state.iter_mut().take(omp_num_threads).enumerate() {
8945        let omp_block_start = omp_thread_num * omp_block_stride;
8946        let omp_block_size = if omp_thread_num + 1 < omp_num_threads {
8947            omp_block_stride
8948        } else {
8949            block_size_usize - omp_block_start
8950        };
8951        state.count = final_bwt_aux_scan_right_to_left_8u_block_prepare(
8952            t,
8953            sa,
8954            k,
8955            &mut state.buckets,
8956            &mut state.cache,
8957            block_start + omp_block_start as FastSint,
8958            omp_block_size as FastSint,
8959        );
8960    }
8961    for state in thread_state.iter_mut().take(omp_num_threads).rev() {
8962        for c in 0..k_usize {
8963            let a = induction_bucket[c];
8964            let b = state.buckets[c];
8965            induction_bucket[c] = a - b;
8966            state.buckets[c] = a;
8967        }
8968    }
8969    for state in thread_state.iter_mut().take(omp_num_threads) {
8970        final_bwt_aux_scan_right_to_left_8u_block_place(
8971            sa,
8972            rm,
8973            i_out,
8974            &mut state.buckets,
8975            &state.cache,
8976            state.count,
8977        );
8978    }
8979}
8980
8981/// Internal helper: final sorting scan right to left 8u block (OpenMP variant).
8982#[doc(hidden)]
8983pub fn final_sorting_scan_right_to_left_8u_block_omp(
8984    t: &[u8],
8985    sa: &mut [SaSint],
8986    k: SaSint,
8987    induction_bucket: &mut [SaSint],
8988    block_start: FastSint,
8989    block_size: FastSint,
8990    threads: SaSint,
8991    thread_state: &mut [ThreadState],
8992) {
8993    if block_size <= 0 {
8994        return;
8995    }
8996    let k_usize = usize::try_from(k).expect("k must be non-negative");
8997    let block_size_usize = usize::try_from(block_size).expect("block_size must be non-negative");
8998    let threads_usize = usize::try_from(threads.max(1)).expect("threads must be positive");
8999    let omp_num_threads = threads_usize.min(thread_state.len()).min(block_size_usize);
9000    if omp_num_threads <= 1 || block_size < 64 * k.max(256) as FastSint {
9001        final_sorting_scan_right_to_left_8u(t, sa, induction_bucket, block_start, block_size);
9002        return;
9003    }
9004
9005    let omp_block_stride = (block_size_usize / omp_num_threads) & !15usize;
9006    for (omp_thread_num, state) in thread_state.iter_mut().take(omp_num_threads).enumerate() {
9007        let omp_block_start = omp_thread_num * omp_block_stride;
9008        let omp_block_size = if omp_thread_num + 1 < omp_num_threads {
9009            omp_block_stride
9010        } else {
9011            block_size_usize - omp_block_start
9012        };
9013        state.count = final_sorting_scan_right_to_left_8u_block_prepare(
9014            t,
9015            sa,
9016            k,
9017            &mut state.buckets,
9018            &mut state.cache,
9019            block_start + omp_block_start as FastSint,
9020            omp_block_size as FastSint,
9021        );
9022    }
9023    for state in thread_state.iter_mut().take(omp_num_threads).rev() {
9024        for c in 0..k_usize {
9025            let a = induction_bucket[c];
9026            let b = state.buckets[c];
9027            induction_bucket[c] = a - b;
9028            state.buckets[c] = a;
9029        }
9030    }
9031    for state in thread_state.iter_mut().take(omp_num_threads) {
9032        final_order_scan_right_to_left_8u_block_place(
9033            sa,
9034            &mut state.buckets,
9035            &state.cache,
9036            state.count,
9037        );
9038    }
9039}
9040
9041/// Internal helper: final gsa scan right to left 8u block (OpenMP variant).
9042#[doc(hidden)]
9043pub fn final_gsa_scan_right_to_left_8u_block_omp(
9044    t: &[u8],
9045    sa: &mut [SaSint],
9046    k: SaSint,
9047    induction_bucket: &mut [SaSint],
9048    block_start: FastSint,
9049    block_size: FastSint,
9050    threads: SaSint,
9051    thread_state: &mut [ThreadState],
9052) {
9053    if block_size <= 0 {
9054        return;
9055    }
9056    let k_usize = usize::try_from(k).expect("k must be non-negative");
9057    let block_size_usize = usize::try_from(block_size).expect("block_size must be non-negative");
9058    let threads_usize = usize::try_from(threads.max(1)).expect("threads must be positive");
9059    let omp_num_threads = threads_usize.min(thread_state.len()).min(block_size_usize);
9060    if omp_num_threads <= 1 || block_size < 64 * k.max(256) as FastSint {
9061        final_gsa_scan_right_to_left_8u(t, sa, induction_bucket, block_start, block_size);
9062        return;
9063    }
9064
9065    let omp_block_stride = (block_size_usize / omp_num_threads) & !15usize;
9066    for (omp_thread_num, state) in thread_state.iter_mut().take(omp_num_threads).enumerate() {
9067        let omp_block_start = omp_thread_num * omp_block_stride;
9068        let omp_block_size = if omp_thread_num + 1 < omp_num_threads {
9069            omp_block_stride
9070        } else {
9071            block_size_usize - omp_block_start
9072        };
9073        state.count = final_sorting_scan_right_to_left_8u_block_prepare(
9074            t,
9075            sa,
9076            k,
9077            &mut state.buckets,
9078            &mut state.cache,
9079            block_start + omp_block_start as FastSint,
9080            omp_block_size as FastSint,
9081        );
9082    }
9083    for state in thread_state.iter_mut().take(omp_num_threads).rev() {
9084        for c in 0..k_usize {
9085            let a = induction_bucket[c];
9086            let b = state.buckets[c];
9087            induction_bucket[c] = a - b;
9088            state.buckets[c] = a;
9089        }
9090    }
9091    for state in thread_state.iter_mut().take(omp_num_threads) {
9092        final_gsa_scan_right_to_left_8u_block_place(
9093            sa,
9094            &mut state.buckets,
9095            &state.cache,
9096            state.count,
9097        );
9098    }
9099}
9100
9101/// Internal helper: final sorting scan right to left 32s block (OpenMP variant).
9102#[doc(hidden)]
9103pub fn final_sorting_scan_right_to_left_32s_block_omp(
9104    t: &[SaSint],
9105    sa: &mut [SaSint],
9106    buckets: &mut [SaSint],
9107    cache: &mut [ThreadCache],
9108    block_start: FastSint,
9109    block_size: FastSint,
9110    threads: SaSint,
9111) {
9112    if threads <= 1 || block_size < 16_384 {
9113        final_sorting_scan_right_to_left_32s(t, sa, buckets, block_start, block_size);
9114        return;
9115    }
9116
9117    final_sorting_scan_right_to_left_32s_block_gather(t, sa, cache, block_start, block_size);
9118    final_sorting_scan_right_to_left_32s_block_sort(t, buckets, cache, block_start, block_size);
9119    let block_size_usize = usize::try_from(block_size).expect("block_size must be non-negative");
9120    let threads_usize = usize::try_from(threads.max(1)).expect("threads must be positive");
9121    let omp_num_threads = threads_usize.min(block_size_usize);
9122    let omp_block_stride = (block_size_usize / omp_num_threads) & !15usize;
9123    for omp_thread_num in 0..omp_num_threads {
9124        let omp_block_start = omp_thread_num * omp_block_stride;
9125        let omp_block_size = if omp_thread_num + 1 < omp_num_threads {
9126            omp_block_stride
9127        } else {
9128            block_size_usize - omp_block_start
9129        };
9130        compact_and_place_cached_suffixes(
9131            sa,
9132            cache,
9133            omp_block_start as FastSint,
9134            omp_block_size as FastSint,
9135        );
9136    }
9137}
9138
9139/// Internal helper: final bwt scan right to left 8u (OpenMP variant).
9140#[doc(hidden)]
9141pub fn final_bwt_scan_right_to_left_8u_omp(
9142    t: &[u8],
9143    sa: &mut [SaSint],
9144    n: SaSint,
9145    k: SaSint,
9146    induction_bucket: &mut [SaSint],
9147    threads: SaSint,
9148    thread_state: &mut [ThreadState],
9149) -> SaSint {
9150    if threads == 1 || n < 65_536 {
9151        return final_bwt_scan_right_to_left_8u(t, sa, induction_bucket, 0, n as FastSint);
9152    }
9153    let mut index = -1;
9154    let mut block_start = usize::try_from(n).expect("n must be non-negative");
9155    while block_start > 0 {
9156        block_start -= 1;
9157        if sa[block_start] == 0 {
9158            index = block_start as SaSint;
9159        } else {
9160            let threads_usize = usize::try_from(threads)
9161                .expect("threads must be non-negative")
9162                .min(thread_state.len())
9163                .max(1);
9164            let max_back =
9165                threads_usize * LIBSAIS_PER_THREAD_CACHE_SIZE.saturating_sub(16 * threads_usize);
9166            let block_max_end = block_start.saturating_sub(max_back);
9167            let mut block_end = block_start;
9168            while block_end > block_max_end && sa[block_end - 1] != 0 {
9169                block_end -= 1;
9170            }
9171            let size = block_start - block_end + 1;
9172            if size < 32 {
9173                let res = final_bwt_scan_right_to_left_8u(
9174                    t,
9175                    sa,
9176                    induction_bucket,
9177                    block_end as FastSint,
9178                    size as FastSint,
9179                );
9180                if res >= 0 {
9181                    index = res;
9182                }
9183            } else {
9184                final_bwt_scan_right_to_left_8u_block_omp(
9185                    t,
9186                    sa,
9187                    k,
9188                    induction_bucket,
9189                    block_end as FastSint,
9190                    size as FastSint,
9191                    threads,
9192                    thread_state,
9193                );
9194            }
9195            block_start = block_end;
9196        }
9197    }
9198    index
9199}
9200
9201/// Internal helper: final bwt aux scan right to left 8u (OpenMP variant).
9202#[doc(hidden)]
9203pub fn final_bwt_aux_scan_right_to_left_8u_omp(
9204    t: &[u8],
9205    sa: &mut [SaSint],
9206    n: SaSint,
9207    k: SaSint,
9208    rm: SaSint,
9209    i_out: &mut [SaSint],
9210    induction_bucket: &mut [SaSint],
9211    threads: SaSint,
9212    thread_state: &mut [ThreadState],
9213) {
9214    if threads == 1 || n < 65_536 {
9215        final_bwt_aux_scan_right_to_left_8u(t, sa, rm, i_out, induction_bucket, 0, n as FastSint);
9216        return;
9217    }
9218    let mut block_start = usize::try_from(n).expect("n must be non-negative");
9219    while block_start > 0 {
9220        block_start -= 1;
9221        if sa[block_start] != 0 {
9222            let threads_usize = usize::try_from(threads)
9223                .expect("threads must be non-negative")
9224                .min(thread_state.len())
9225                .max(1);
9226            let max_back = threads_usize
9227                * (LIBSAIS_PER_THREAD_CACHE_SIZE.saturating_sub(16 * threads_usize) / 2);
9228            let block_max_end = block_start.saturating_sub(max_back);
9229            let mut block_end = block_start;
9230            while block_end > block_max_end && sa[block_end - 1] != 0 {
9231                block_end -= 1;
9232            }
9233            let size = block_start - block_end + 1;
9234            if size < 32 {
9235                final_bwt_aux_scan_right_to_left_8u(
9236                    t,
9237                    sa,
9238                    rm,
9239                    i_out,
9240                    induction_bucket,
9241                    block_end as FastSint,
9242                    size as FastSint,
9243                );
9244            } else {
9245                final_bwt_aux_scan_right_to_left_8u_block_omp(
9246                    t,
9247                    sa,
9248                    k,
9249                    rm,
9250                    i_out,
9251                    induction_bucket,
9252                    block_end as FastSint,
9253                    size as FastSint,
9254                    threads,
9255                    thread_state,
9256                );
9257            }
9258            block_start = block_end;
9259        }
9260    }
9261}
9262
9263/// Internal helper: final sorting scan right to left 8u (OpenMP variant).
9264#[doc(hidden)]
9265pub fn final_sorting_scan_right_to_left_8u_omp(
9266    t: &[u8],
9267    sa: &mut [SaSint],
9268    omp_block_start: FastSint,
9269    omp_block_size: FastSint,
9270    k: SaSint,
9271    induction_bucket: &mut [SaSint],
9272    threads: SaSint,
9273    thread_state: &mut [ThreadState],
9274) {
9275    if threads == 1 || omp_block_size < 65_536 {
9276        final_sorting_scan_right_to_left_8u(
9277            t,
9278            sa,
9279            induction_bucket,
9280            omp_block_start,
9281            omp_block_size,
9282        );
9283        return;
9284    }
9285    let start = usize::try_from(omp_block_start).expect("omp_block_start must be non-negative");
9286    let size = usize::try_from(omp_block_size).expect("omp_block_size must be non-negative");
9287    let mut block_start = start + size;
9288    while block_start > start {
9289        block_start -= 1;
9290        if sa[block_start] != 0 {
9291            let threads_usize = usize::try_from(threads)
9292                .expect("threads must be non-negative")
9293                .min(thread_state.len())
9294                .max(1);
9295            let max_back =
9296                threads_usize * LIBSAIS_PER_THREAD_CACHE_SIZE.saturating_sub(16 * threads_usize);
9297            let block_max_end = block_start.saturating_sub(max_back).max(start);
9298            let mut block_end = block_start;
9299            while block_end > block_max_end && sa[block_end - 1] != 0 {
9300                block_end -= 1;
9301            }
9302            let span = block_start - block_end + 1;
9303            if span < 32 {
9304                final_sorting_scan_right_to_left_8u(
9305                    t,
9306                    sa,
9307                    induction_bucket,
9308                    block_end as FastSint,
9309                    span as FastSint,
9310                );
9311            } else {
9312                final_sorting_scan_right_to_left_8u_block_omp(
9313                    t,
9314                    sa,
9315                    k,
9316                    induction_bucket,
9317                    block_end as FastSint,
9318                    span as FastSint,
9319                    threads,
9320                    thread_state,
9321                );
9322            }
9323            block_start = block_end;
9324        }
9325    }
9326}
9327
9328/// Internal helper: final gsa scan right to left 8u (OpenMP variant).
9329#[doc(hidden)]
9330pub fn final_gsa_scan_right_to_left_8u_omp(
9331    t: &[u8],
9332    sa: &mut [SaSint],
9333    omp_block_start: FastSint,
9334    omp_block_size: FastSint,
9335    k: SaSint,
9336    induction_bucket: &mut [SaSint],
9337    threads: SaSint,
9338    thread_state: &mut [ThreadState],
9339) {
9340    if threads == 1 || omp_block_size < 65_536 {
9341        final_gsa_scan_right_to_left_8u(t, sa, induction_bucket, omp_block_start, omp_block_size);
9342        return;
9343    }
9344    let start = usize::try_from(omp_block_start).expect("omp_block_start must be non-negative");
9345    let size = usize::try_from(omp_block_size).expect("omp_block_size must be non-negative");
9346    let mut block_start = start + size;
9347    while block_start > start {
9348        block_start -= 1;
9349        if sa[block_start] != 0 {
9350            let threads_usize = usize::try_from(threads)
9351                .expect("threads must be non-negative")
9352                .min(thread_state.len())
9353                .max(1);
9354            let max_back =
9355                threads_usize * LIBSAIS_PER_THREAD_CACHE_SIZE.saturating_sub(16 * threads_usize);
9356            let block_max_end = block_start.saturating_sub(max_back).max(start);
9357            let mut block_end = block_start;
9358            while block_end > block_max_end && sa[block_end - 1] != 0 {
9359                block_end -= 1;
9360            }
9361            let span = block_start - block_end + 1;
9362            if span < 32 {
9363                final_gsa_scan_right_to_left_8u(
9364                    t,
9365                    sa,
9366                    induction_bucket,
9367                    block_end as FastSint,
9368                    span as FastSint,
9369                );
9370            } else {
9371                final_gsa_scan_right_to_left_8u_block_omp(
9372                    t,
9373                    sa,
9374                    k,
9375                    induction_bucket,
9376                    block_end as FastSint,
9377                    span as FastSint,
9378                    threads,
9379                    thread_state,
9380                );
9381            }
9382            block_start = block_end;
9383        }
9384    }
9385}
9386
9387/// Internal helper: final sorting scan right to left 32s (OpenMP variant).
9388#[doc(hidden)]
9389pub fn final_sorting_scan_right_to_left_32s_omp(
9390    t: &[SaSint],
9391    sa: &mut [SaSint],
9392    n: SaSint,
9393    induction_bucket: &mut [SaSint],
9394    threads: SaSint,
9395    thread_state: &mut [ThreadState],
9396) {
9397    if threads == 1 || n < 65_536 {
9398        final_sorting_scan_right_to_left_32s(t, sa, induction_bucket, 0, n as FastSint);
9399        return;
9400    }
9401    if thread_state.is_empty() {
9402        final_sorting_scan_right_to_left_32s(t, sa, induction_bucket, 0, n as FastSint);
9403        return;
9404    }
9405    let threads_usize = usize::try_from(threads)
9406        .expect("threads must be non-negative")
9407        .max(1);
9408    let mut cache = vec![ThreadCache::default(); threads_usize * LIBSAIS_PER_THREAD_CACHE_SIZE];
9409    let mut block_start = isize::try_from(n).expect("n must fit isize") - 1;
9410    while block_start >= 0 {
9411        let block_end = (block_start
9412            - isize::try_from(threads_usize * LIBSAIS_PER_THREAD_CACHE_SIZE)
9413                .expect("block span must fit isize"))
9414        .max(-1);
9415        final_sorting_scan_right_to_left_32s_block_omp(
9416            t,
9417            sa,
9418            induction_bucket,
9419            &mut cache,
9420            (block_end + 1) as FastSint,
9421            (block_start - block_end) as FastSint,
9422            threads,
9423        );
9424        block_start = block_end;
9425    }
9426}
9427
9428/// Internal helper: clear lms suffixes (OpenMP variant).
9429#[doc(hidden)]
9430pub fn clear_lms_suffixes_omp(
9431    sa: &mut [SaSint],
9432    n: SaSint,
9433    k: SaSint,
9434    bucket_start: &[SaSint],
9435    bucket_end: &[SaSint],
9436    threads: SaSint,
9437) {
9438    let k_usize = usize::try_from(k).expect("k must be non-negative");
9439    let thread_count = if threads > 1 && n >= 65536 {
9440        usize::try_from(threads).expect("threads must be positive")
9441    } else {
9442        1
9443    };
9444    for t in 0..thread_count {
9445        let mut c = t;
9446        while c < k_usize {
9447            if bucket_end[c] > bucket_start[c] {
9448                let start =
9449                    usize::try_from(bucket_start[c]).expect("bucket start must be non-negative");
9450                let end = usize::try_from(bucket_end[c]).expect("bucket end must be non-negative");
9451                sa[start..end].fill(0);
9452            }
9453            c += thread_count;
9454        }
9455    }
9456}
9457
9458/// Internal helper: induce final order 8u (OpenMP variant).
9459#[doc(hidden)]
9460pub fn induce_final_order_8u_omp(
9461    t: &[u8],
9462    sa: &mut [SaSint],
9463    n: SaSint,
9464    k: SaSint,
9465    flags: SaSint,
9466    r: SaSint,
9467    i_out: Option<&mut [SaSint]>,
9468    buckets: &mut [SaSint],
9469    threads: SaSint,
9470    thread_state: &mut [ThreadState],
9471) -> SaSint {
9472    if (flags & LIBSAIS_FLAGS_BWT) == 0 {
9473        if (flags & LIBSAIS_FLAGS_GSA) != 0 {
9474            buckets[6 * ALPHABET_SIZE] = buckets[7 * ALPHABET_SIZE] - 1;
9475        }
9476
9477        let (left_buckets, right_tail) = buckets.split_at_mut(7 * ALPHABET_SIZE);
9478        let bucket_start = &mut left_buckets[6 * ALPHABET_SIZE..7 * ALPHABET_SIZE];
9479        let bucket_end = &mut right_tail[..ALPHABET_SIZE];
9480
9481        final_sorting_scan_left_to_right_8u_omp(
9482            t,
9483            sa,
9484            n as FastSint,
9485            k,
9486            bucket_start,
9487            threads,
9488            thread_state,
9489        );
9490        if threads > 1 && n >= 65_536 {
9491            clear_lms_suffixes_omp(
9492                sa,
9493                n,
9494                ALPHABET_SIZE as SaSint,
9495                bucket_start,
9496                bucket_end,
9497                threads,
9498            );
9499        }
9500
9501        if (flags & LIBSAIS_FLAGS_GSA) != 0 {
9502            flip_suffix_markers_omp(sa, bucket_end[0], threads);
9503            final_gsa_scan_right_to_left_8u_omp(
9504                t,
9505                sa,
9506                bucket_end[0] as FastSint,
9507                n as FastSint - bucket_end[0] as FastSint,
9508                k,
9509                bucket_end,
9510                1,
9511                thread_state,
9512            );
9513        } else {
9514            final_sorting_scan_right_to_left_8u_omp(
9515                t,
9516                sa,
9517                0,
9518                n as FastSint,
9519                k,
9520                bucket_end,
9521                threads,
9522                thread_state,
9523            );
9524        }
9525
9526        0
9527    } else if let Some(i_out) = i_out {
9528        let (left_buckets, right_tail) = buckets.split_at_mut(7 * ALPHABET_SIZE);
9529        let bucket_start = &mut left_buckets[6 * ALPHABET_SIZE..7 * ALPHABET_SIZE];
9530        let bucket_end = &mut right_tail[..ALPHABET_SIZE];
9531
9532        final_bwt_aux_scan_left_to_right_8u_omp(
9533            t,
9534            sa,
9535            n as FastSint,
9536            k,
9537            r - 1,
9538            i_out,
9539            bucket_start,
9540            threads,
9541            thread_state,
9542        );
9543        if threads > 1 && n >= 65_536 {
9544            clear_lms_suffixes_omp(
9545                sa,
9546                n,
9547                ALPHABET_SIZE as SaSint,
9548                bucket_start,
9549                bucket_end,
9550                threads,
9551            );
9552        }
9553        final_bwt_aux_scan_right_to_left_8u_omp(
9554            t,
9555            sa,
9556            n,
9557            k,
9558            r - 1,
9559            i_out,
9560            bucket_end,
9561            threads,
9562            thread_state,
9563        );
9564        0
9565    } else {
9566        let (left_buckets, right_tail) = buckets.split_at_mut(7 * ALPHABET_SIZE);
9567        let bucket_start = &mut left_buckets[6 * ALPHABET_SIZE..7 * ALPHABET_SIZE];
9568        let bucket_end = &mut right_tail[..ALPHABET_SIZE];
9569
9570        final_bwt_scan_left_to_right_8u_omp(
9571            t,
9572            sa,
9573            n as FastSint,
9574            k,
9575            bucket_start,
9576            threads,
9577            thread_state,
9578        );
9579        if threads > 1 && n >= 65_536 {
9580            clear_lms_suffixes_omp(
9581                sa,
9582                n,
9583                ALPHABET_SIZE as SaSint,
9584                bucket_start,
9585                bucket_end,
9586                threads,
9587            );
9588        }
9589        final_bwt_scan_right_to_left_8u_omp(t, sa, n, k, bucket_end, threads, thread_state)
9590    }
9591}
9592
9593/// Internal helper: induce final order 32s 6k.
9594#[doc(hidden)]
9595pub fn induce_final_order_32s_6k(
9596    t: &[SaSint],
9597    sa: &mut [SaSint],
9598    n: SaSint,
9599    k: SaSint,
9600    buckets: &mut [SaSint],
9601    threads: SaSint,
9602    thread_state: &mut [ThreadState],
9603) {
9604    let k_usize = usize::try_from(k).expect("k must be non-negative");
9605    let (_head, tail) = buckets.split_at_mut(4 * k_usize);
9606    let (left, right) = tail.split_at_mut(k_usize);
9607    final_sorting_scan_left_to_right_32s_omp(t, sa, n, left, threads, thread_state);
9608    final_sorting_scan_right_to_left_32s_omp(t, sa, n, right, threads, thread_state);
9609}
9610
9611/// Internal helper: induce final order 32s 4k.
9612#[doc(hidden)]
9613pub fn induce_final_order_32s_4k(
9614    t: &[SaSint],
9615    sa: &mut [SaSint],
9616    n: SaSint,
9617    k: SaSint,
9618    buckets: &mut [SaSint],
9619    threads: SaSint,
9620    thread_state: &mut [ThreadState],
9621) {
9622    let k_usize = usize::try_from(k).expect("k must be non-negative");
9623    let (_head, tail) = buckets.split_at_mut(2 * k_usize);
9624    let (left, right) = tail.split_at_mut(k_usize);
9625    final_sorting_scan_left_to_right_32s_omp(t, sa, n, left, threads, thread_state);
9626    final_sorting_scan_right_to_left_32s_omp(t, sa, n, right, threads, thread_state);
9627}
9628
9629/// Internal helper: induce final order 32s 2k.
9630#[doc(hidden)]
9631pub fn induce_final_order_32s_2k(
9632    t: &[SaSint],
9633    sa: &mut [SaSint],
9634    n: SaSint,
9635    k: SaSint,
9636    buckets: &mut [SaSint],
9637    threads: SaSint,
9638    thread_state: &mut [ThreadState],
9639) {
9640    let k_usize = usize::try_from(k).expect("k must be non-negative");
9641    let (right, left) = buckets.split_at_mut(k_usize);
9642    final_sorting_scan_left_to_right_32s_omp(t, sa, n, left, threads, thread_state);
9643    final_sorting_scan_right_to_left_32s_omp(t, sa, n, right, threads, thread_state);
9644}
9645
9646/// Internal helper: induce final order 32s 1k.
9647#[doc(hidden)]
9648pub fn induce_final_order_32s_1k(
9649    t: &[SaSint],
9650    sa: &mut [SaSint],
9651    n: SaSint,
9652    k: SaSint,
9653    buckets: &mut [SaSint],
9654    threads: SaSint,
9655    thread_state: &mut [ThreadState],
9656) {
9657    count_suffixes_32s(t, n, k, buckets);
9658    initialize_buckets_start_32s_1k(k, buckets);
9659    final_sorting_scan_left_to_right_32s_omp(t, sa, n, buckets, threads, thread_state);
9660
9661    count_suffixes_32s(t, n, k, buckets);
9662    initialize_buckets_end_32s_1k(k, buckets);
9663    final_sorting_scan_right_to_left_32s_omp(t, sa, n, buckets, threads, thread_state);
9664}
9665
9666/// Internal helper: renumber unique and nonunique lms suffixes 32s.
9667#[doc(hidden)]
9668pub fn renumber_unique_and_nonunique_lms_suffixes_32s(
9669    t: &mut [SaSint],
9670    sa: &mut [SaSint],
9671    m: SaSint,
9672    mut f: SaSint,
9673    omp_block_start: FastSint,
9674    omp_block_size: FastSint,
9675) -> SaSint {
9676    if omp_block_size <= 0 {
9677        return f;
9678    }
9679
9680    let prefetch_distance = 64 as SaSint;
9681    let m_usize = usize::try_from(m).expect("m must be non-negative");
9682    let (sa_head, sam) = sa.split_at_mut(m_usize);
9683    let mut i = omp_block_start as SaSint;
9684    let mut j = omp_block_start as SaSint + omp_block_size as SaSint - 2 * prefetch_distance - 3;
9685
9686    while i < j {
9687        let p0 = sa_head[i as usize] as SaUint;
9688        let p0_half = (p0 >> 1) as usize;
9689        let mut s0 = sam[p0_half];
9690        if s0 < 0 {
9691            t[p0 as usize] |= SAINT_MIN;
9692            f += 1;
9693            s0 = i + SAINT_MIN + f;
9694        }
9695        sam[p0_half] = s0 - f;
9696
9697        let p1 = sa_head[(i + 1) as usize] as SaUint;
9698        let p1_half = (p1 >> 1) as usize;
9699        let mut s1 = sam[p1_half];
9700        if s1 < 0 {
9701            t[p1 as usize] |= SAINT_MIN;
9702            f += 1;
9703            s1 = i + 1 + SAINT_MIN + f;
9704        }
9705        sam[p1_half] = s1 - f;
9706
9707        let p2 = sa_head[(i + 2) as usize] as SaUint;
9708        let p2_half = (p2 >> 1) as usize;
9709        let mut s2 = sam[p2_half];
9710        if s2 < 0 {
9711            t[p2 as usize] |= SAINT_MIN;
9712            f += 1;
9713            s2 = i + 2 + SAINT_MIN + f;
9714        }
9715        sam[p2_half] = s2 - f;
9716
9717        let p3 = sa_head[(i + 3) as usize] as SaUint;
9718        let p3_half = (p3 >> 1) as usize;
9719        let mut s3 = sam[p3_half];
9720        if s3 < 0 {
9721            t[p3 as usize] |= SAINT_MIN;
9722            f += 1;
9723            s3 = i + 3 + SAINT_MIN + f;
9724        }
9725        sam[p3_half] = s3 - f;
9726
9727        i += 4;
9728    }
9729
9730    j += 2 * prefetch_distance + 3;
9731    while i < j {
9732        let p = sa_head[i as usize] as SaUint;
9733        let p_half = (p >> 1) as usize;
9734        let mut s = sam[p_half];
9735        if s < 0 {
9736            t[p as usize] |= SAINT_MIN;
9737            f += 1;
9738            s = i + SAINT_MIN + f;
9739        }
9740        sam[p_half] = s - f;
9741        i += 1;
9742    }
9743
9744    f
9745}
9746
9747/// Internal helper: compact unique and nonunique lms suffixes 32s.
9748#[doc(hidden)]
9749pub fn compact_unique_and_nonunique_lms_suffixes_32s(
9750    sa: &mut [SaSint],
9751    m: SaSint,
9752    pl: &mut FastSint,
9753    pr: &mut FastSint,
9754    omp_block_start: FastSint,
9755    omp_block_size: FastSint,
9756) {
9757    if omp_block_size <= 0 {
9758        return;
9759    }
9760
9761    let m_usize = usize::try_from(m).expect("m must be non-negative");
9762    let start = usize::try_from(omp_block_start).expect("omp_block_start must be non-negative");
9763    let size = usize::try_from(omp_block_size).expect("omp_block_size must be non-negative");
9764
9765    let source: Vec<SaSint> = sa[m_usize + start..m_usize + start + size].to_vec();
9766    let mut l = usize::try_from(*pl - 1).expect("left position must be positive");
9767    let mut r = usize::try_from(*pr - 1).expect("right position must be positive");
9768
9769    for &p in source.iter().rev() {
9770        let pu = p as SaUint;
9771        sa[l] = (pu & SAINT_MAX as SaUint) as SaSint;
9772        l = l.saturating_sub(usize::from((pu as SaSint) < 0));
9773
9774        sa[r] = pu.wrapping_sub(1) as SaSint;
9775        r = r.saturating_sub(usize::from((pu as SaSint) > 0));
9776    }
9777
9778    *pl = l as FastSint + 1;
9779    *pr = r as FastSint + 1;
9780}
9781
9782/// Internal helper: count unique suffixes.
9783#[doc(hidden)]
9784pub fn count_unique_suffixes(
9785    sa: &[SaSint],
9786    m: SaSint,
9787    omp_block_start: FastSint,
9788    omp_block_size: FastSint,
9789) -> SaSint {
9790    if omp_block_size <= 0 {
9791        return 0;
9792    }
9793
9794    let m_usize = usize::try_from(m).expect("m must be non-negative");
9795    let sam = &sa[m_usize..];
9796    let mut i = usize::try_from(omp_block_start).expect("omp_block_start must be non-negative");
9797    let block_end =
9798        i + usize::try_from(omp_block_size).expect("omp_block_size must be non-negative");
9799    let j = block_end.saturating_sub(67);
9800    let mut f0 = 0;
9801    let mut f1 = 0;
9802    let mut f2 = 0;
9803    let mut f3 = 0;
9804
9805    while i < j {
9806        f0 += SaSint::from(
9807            sam[usize::try_from((sa[i] as SaUint) >> 1).expect("name slot must fit usize")] < 0,
9808        );
9809        f1 += SaSint::from(
9810            sam[usize::try_from((sa[i + 1] as SaUint) >> 1).expect("name slot must fit usize")] < 0,
9811        );
9812        f2 += SaSint::from(
9813            sam[usize::try_from((sa[i + 2] as SaUint) >> 1).expect("name slot must fit usize")] < 0,
9814        );
9815        f3 += SaSint::from(
9816            sam[usize::try_from((sa[i + 3] as SaUint) >> 1).expect("name slot must fit usize")] < 0,
9817        );
9818        i += 4;
9819    }
9820
9821    while i < block_end {
9822        f0 += SaSint::from(
9823            sam[usize::try_from((sa[i] as SaUint) >> 1).expect("name slot must fit usize")] < 0,
9824        );
9825        i += 1;
9826    }
9827
9828    f0 + f1 + f2 + f3
9829}
9830
9831/// Internal helper: renumber unique and nonunique lms suffixes 32s (OpenMP variant).
9832#[doc(hidden)]
9833pub fn renumber_unique_and_nonunique_lms_suffixes_32s_omp(
9834    t: &mut [SaSint],
9835    sa: &mut [SaSint],
9836    m: SaSint,
9837    threads: SaSint,
9838    thread_state: &mut [ThreadState],
9839) -> SaSint {
9840    let mut f = 0;
9841    if threads == 1 || m < 65_536 {
9842        f = renumber_unique_and_nonunique_lms_suffixes_32s(t, sa, m, 0, 0, m as FastSint);
9843    } else {
9844        let threads_usize = usize::try_from(threads)
9845            .expect("threads must be non-negative")
9846            .max(1);
9847        let m_usize = usize::try_from(m).expect("m must be non-negative");
9848        let omp_num_threads = threads_usize.min(m_usize.max(1));
9849        let omp_block_stride = (m_usize / omp_num_threads) & !15usize;
9850
9851        for omp_thread_num in 0..omp_num_threads {
9852            let omp_block_start = omp_thread_num * omp_block_stride;
9853            let omp_block_size = if omp_thread_num + 1 < omp_num_threads {
9854                omp_block_stride
9855            } else {
9856                m_usize - omp_block_start
9857            };
9858
9859            thread_state[omp_thread_num].count = count_unique_suffixes(
9860                sa,
9861                m,
9862                omp_block_start as FastSint,
9863                omp_block_size as FastSint,
9864            ) as FastSint;
9865        }
9866
9867        let mut count = 0 as FastSint;
9868        for omp_thread_num in 0..omp_num_threads {
9869            let omp_block_start = omp_thread_num * omp_block_stride;
9870            let omp_block_size = if omp_thread_num + 1 < omp_num_threads {
9871                omp_block_stride
9872            } else {
9873                m_usize - omp_block_start
9874            };
9875
9876            if omp_thread_num + 1 == omp_num_threads {
9877                f = (count + thread_state[omp_thread_num].count) as SaSint;
9878            }
9879
9880            renumber_unique_and_nonunique_lms_suffixes_32s(
9881                t,
9882                sa,
9883                m,
9884                count as SaSint,
9885                omp_block_start as FastSint,
9886                omp_block_size as FastSint,
9887            );
9888            count += thread_state[omp_thread_num].count;
9889        }
9890    }
9891
9892    f
9893}
9894
9895/// Internal helper: compact unique and nonunique lms suffixes 32s (OpenMP variant).
9896#[doc(hidden)]
9897pub fn compact_unique_and_nonunique_lms_suffixes_32s_omp(
9898    sa: &mut [SaSint],
9899    n: SaSint,
9900    m: SaSint,
9901    fs: SaSint,
9902    f: SaSint,
9903    threads: SaSint,
9904    thread_state: &mut [ThreadState],
9905) {
9906    let half_n = (n as FastSint) >> 1;
9907    if threads == 1 || n < 131_072 || m >= fs {
9908        let mut l = m as FastSint;
9909        let mut r = n as FastSint + fs as FastSint;
9910        compact_unique_and_nonunique_lms_suffixes_32s(sa, m, &mut l, &mut r, 0, half_n);
9911    } else {
9912        let threads_usize = usize::try_from(threads)
9913            .expect("threads must be non-negative")
9914            .max(1);
9915        let half_n_usize = usize::try_from(half_n).expect("half_n must be non-negative");
9916        let omp_num_threads = threads_usize.min(half_n_usize.max(1));
9917        let omp_block_stride = (half_n_usize / omp_num_threads) & !15usize;
9918
9919        for omp_thread_num in 0..omp_num_threads {
9920            let omp_block_start = omp_thread_num * omp_block_stride;
9921            let omp_block_size = if omp_thread_num + 1 < omp_num_threads {
9922                omp_block_stride
9923            } else {
9924                half_n_usize - omp_block_start
9925            };
9926
9927            thread_state[omp_thread_num].position =
9928                m as FastSint + half_n + omp_block_start as FastSint + omp_block_size as FastSint;
9929            thread_state[omp_thread_num].count =
9930                m as FastSint + omp_block_start as FastSint + omp_block_size as FastSint;
9931
9932            let mut position = thread_state[omp_thread_num].position;
9933            let mut count = thread_state[omp_thread_num].count;
9934            compact_unique_and_nonunique_lms_suffixes_32s(
9935                sa,
9936                m,
9937                &mut position,
9938                &mut count,
9939                omp_block_start as FastSint,
9940                omp_block_size as FastSint,
9941            );
9942            thread_state[omp_thread_num].position = position;
9943            thread_state[omp_thread_num].count = count;
9944        }
9945
9946        let mut position = m as FastSint;
9947        for t in (0..omp_num_threads).rev() {
9948            let omp_block_end = if t + 1 < omp_num_threads {
9949                omp_block_stride * (t + 1)
9950            } else {
9951                half_n_usize
9952            };
9953            let count =
9954                m as FastSint + half_n + omp_block_end as FastSint - thread_state[t].position;
9955            if count > 0 {
9956                position -= count;
9957                let dst = usize::try_from(position).expect("destination must be non-negative");
9958                let src =
9959                    usize::try_from(thread_state[t].position).expect("source must be non-negative");
9960                let len = usize::try_from(count).expect("length must be non-negative");
9961                sa.copy_within(src..src + len, dst);
9962            }
9963        }
9964
9965        let mut position = n as FastSint + fs as FastSint;
9966        for t in (0..omp_num_threads).rev() {
9967            let omp_block_end = if t + 1 < omp_num_threads {
9968                omp_block_stride * (t + 1)
9969            } else {
9970                half_n_usize
9971            };
9972            let count = m as FastSint + omp_block_end as FastSint - thread_state[t].count;
9973            if count > 0 {
9974                position -= count;
9975                let dst = usize::try_from(position).expect("destination must be non-negative");
9976                let src =
9977                    usize::try_from(thread_state[t].count).expect("source must be non-negative");
9978                let len = usize::try_from(count).expect("length must be non-negative");
9979                sa.copy_within(src..src + len, dst);
9980            }
9981        }
9982    }
9983
9984    let copy_dst = usize::try_from(n + fs - m).expect("copy destination must be non-negative");
9985    let copy_src = usize::try_from(m - f).expect("copy source must be non-negative");
9986    let copy_len = usize::try_from(f).expect("copy length must be non-negative");
9987    sa.copy_within(copy_src..copy_src + copy_len, copy_dst);
9988}
9989
/// Internal helper: compact lms suffixes 32s (OpenMP variant).
///
/// Runs the two compaction steps back to back: first
/// `renumber_unique_and_nonunique_lms_suffixes_32s_omp`, whose result `f` is
/// then handed to `compact_unique_and_nonunique_lms_suffixes_32s_omp`.
///
/// Returns `f` exactly as produced by the renumbering step.
#[doc(hidden)]
pub fn compact_lms_suffixes_32s_omp(
    t: &mut [SaSint],
    sa: &mut [SaSint],
    n: SaSint,
    m: SaSint,
    fs: SaSint,
    threads: SaSint,
    thread_state: &mut [ThreadState],
) -> SaSint {
    // The renumbering result feeds the compaction step and is also the return value.
    let f = renumber_unique_and_nonunique_lms_suffixes_32s_omp(t, sa, m, threads, thread_state);
    compact_unique_and_nonunique_lms_suffixes_32s_omp(sa, n, m, fs, f, threads, thread_state);
    f
}
10005
10006/// Internal helper: merge unique lms suffixes 32s.
10007#[doc(hidden)]
10008pub fn merge_unique_lms_suffixes_32s(
10009    t: &mut [SaSint],
10010    sa: &mut [SaSint],
10011    n: SaSint,
10012    m: SaSint,
10013    l: FastSint,
10014    omp_block_start: FastSint,
10015    omp_block_size: FastSint,
10016) {
10017    if omp_block_size <= 0 {
10018        return;
10019    }
10020
10021    let n_usize = usize::try_from(n).expect("n must be non-negative");
10022    let m_usize = usize::try_from(m).expect("m must be non-negative");
10023    let mut src_index = n_usize - m_usize - 1 + usize::try_from(l).expect("l must be non-negative");
10024    let mut tmp = sa[src_index] as FastSint;
10025    src_index += 1;
10026
10027    let mut i = usize::try_from(omp_block_start).expect("omp_block_start must be non-negative");
10028    let block_end =
10029        i + usize::try_from(omp_block_size).expect("omp_block_size must be non-negative");
10030    let j = block_end.saturating_sub(6);
10031    while i < j {
10032        let c0 = t[i];
10033        if c0 < 0 {
10034            t[i] = c0 & SAINT_MAX;
10035            sa[usize::try_from(tmp).expect("target slot must be non-negative")] = i as SaSint;
10036            i += 1;
10037            tmp = sa[src_index] as FastSint;
10038            src_index += 1;
10039        }
10040
10041        let c1 = t[i + 1];
10042        if c1 < 0 {
10043            t[i + 1] = c1 & SAINT_MAX;
10044            sa[usize::try_from(tmp).expect("target slot must be non-negative")] = i as SaSint + 1;
10045            i += 1;
10046            tmp = sa[src_index] as FastSint;
10047            src_index += 1;
10048        }
10049
10050        let c2 = t[i + 2];
10051        if c2 < 0 {
10052            t[i + 2] = c2 & SAINT_MAX;
10053            sa[usize::try_from(tmp).expect("target slot must be non-negative")] = i as SaSint + 2;
10054            i += 1;
10055            tmp = sa[src_index] as FastSint;
10056            src_index += 1;
10057        }
10058
10059        let c3 = t[i + 3];
10060        if c3 < 0 {
10061            t[i + 3] = c3 & SAINT_MAX;
10062            sa[usize::try_from(tmp).expect("target slot must be non-negative")] = i as SaSint + 3;
10063            i += 1;
10064            tmp = sa[src_index] as FastSint;
10065            src_index += 1;
10066        }
10067
10068        i += 4;
10069    }
10070
10071    while i < block_end {
10072        let c = t[i];
10073        if c < 0 {
10074            t[i] = c & SAINT_MAX;
10075            sa[usize::try_from(tmp).expect("target slot must be non-negative")] = i as SaSint;
10076            i += 1;
10077            tmp = sa[src_index] as FastSint;
10078            src_index += 1;
10079        }
10080        i += 1;
10081    }
10082}
10083
10084/// Internal helper: merge nonunique lms suffixes 32s.
10085#[doc(hidden)]
10086pub fn merge_nonunique_lms_suffixes_32s(
10087    sa: &mut [SaSint],
10088    n: SaSint,
10089    m: SaSint,
10090    l: FastSint,
10091    omp_block_start: FastSint,
10092    omp_block_size: FastSint,
10093) {
10094    if omp_block_size <= 0 {
10095        return;
10096    }
10097
10098    let n_usize = usize::try_from(n).expect("n must be non-negative");
10099    let m_usize = usize::try_from(m).expect("m must be non-negative");
10100    let mut src_index = n_usize - m_usize - 1 + usize::try_from(l).expect("l must be non-negative");
10101    let mut tmp = sa[src_index];
10102    src_index += 1;
10103
10104    let mut i = usize::try_from(omp_block_start).expect("omp_block_start must be non-negative");
10105    let block_end =
10106        i + usize::try_from(omp_block_size).expect("omp_block_size must be non-negative");
10107    let j = block_end.saturating_sub(3);
10108    while i < j {
10109        if sa[i] == 0 {
10110            sa[i] = tmp;
10111            tmp = sa[src_index];
10112            src_index += 1;
10113        }
10114        if sa[i + 1] == 0 {
10115            sa[i + 1] = tmp;
10116            tmp = sa[src_index];
10117            src_index += 1;
10118        }
10119        if sa[i + 2] == 0 {
10120            sa[i + 2] = tmp;
10121            tmp = sa[src_index];
10122            src_index += 1;
10123        }
10124        if sa[i + 3] == 0 {
10125            sa[i + 3] = tmp;
10126            tmp = sa[src_index];
10127            src_index += 1;
10128        }
10129        i += 4;
10130    }
10131
10132    while i < block_end {
10133        if sa[i] == 0 {
10134            sa[i] = tmp;
10135            tmp = sa[src_index];
10136            src_index += 1;
10137        }
10138        i += 1;
10139    }
10140}
10141
10142/// Internal helper: merge unique lms suffixes 32s (OpenMP variant).
10143#[doc(hidden)]
10144pub fn merge_unique_lms_suffixes_32s_omp(
10145    t: &mut [SaSint],
10146    sa: &mut [SaSint],
10147    n: SaSint,
10148    m: SaSint,
10149    threads: SaSint,
10150    thread_state: &mut [ThreadState],
10151) {
10152    if threads == 1 || n < 65_536 {
10153        merge_unique_lms_suffixes_32s(t, sa, n, m, 0, 0, n as FastSint);
10154        return;
10155    }
10156
10157    let threads_usize = usize::try_from(threads)
10158        .expect("threads must be non-negative")
10159        .max(1);
10160    let n_usize = usize::try_from(n).expect("n must be non-negative");
10161    let omp_num_threads = threads_usize.min(n_usize.max(1));
10162    let omp_block_stride = (n_usize / omp_num_threads) & !15usize;
10163
10164    for omp_thread_num in 0..omp_num_threads {
10165        let omp_block_start = omp_thread_num * omp_block_stride;
10166        let omp_block_size = if omp_thread_num + 1 < omp_num_threads {
10167            omp_block_stride
10168        } else {
10169            n_usize - omp_block_start
10170        };
10171
10172        thread_state[omp_thread_num].count = count_negative_marked_suffixes(
10173            t,
10174            omp_block_start as FastSint,
10175            omp_block_size as FastSint,
10176        ) as FastSint;
10177    }
10178
10179    let mut count = 0 as FastSint;
10180    for omp_thread_num in 0..omp_num_threads {
10181        let omp_block_start = omp_thread_num * omp_block_stride;
10182        let omp_block_size = if omp_thread_num + 1 < omp_num_threads {
10183            omp_block_stride
10184        } else {
10185            n_usize - omp_block_start
10186        };
10187
10188        merge_unique_lms_suffixes_32s(
10189            t,
10190            sa,
10191            n,
10192            m,
10193            count,
10194            omp_block_start as FastSint,
10195            omp_block_size as FastSint,
10196        );
10197        count += thread_state[omp_thread_num].count;
10198    }
10199}
10200
10201/// Internal helper: merge nonunique lms suffixes 32s (OpenMP variant).
10202#[doc(hidden)]
10203pub fn merge_nonunique_lms_suffixes_32s_omp(
10204    sa: &mut [SaSint],
10205    n: SaSint,
10206    m: SaSint,
10207    f: SaSint,
10208    threads: SaSint,
10209    thread_state: &mut [ThreadState],
10210) {
10211    if threads == 1 || m < 65_536 {
10212        merge_nonunique_lms_suffixes_32s(sa, n, m, f as FastSint, 0, m as FastSint);
10213        return;
10214    }
10215
10216    let threads_usize = usize::try_from(threads)
10217        .expect("threads must be non-negative")
10218        .max(1);
10219    let m_usize = usize::try_from(m).expect("m must be non-negative");
10220    let omp_num_threads = threads_usize.min(m_usize.max(1));
10221    let omp_block_stride = (m_usize / omp_num_threads) & !15usize;
10222
10223    for omp_thread_num in 0..omp_num_threads {
10224        let omp_block_start = omp_thread_num * omp_block_stride;
10225        let omp_block_size = if omp_thread_num + 1 < omp_num_threads {
10226            omp_block_stride
10227        } else {
10228            m_usize - omp_block_start
10229        };
10230
10231        thread_state[omp_thread_num].count =
10232            count_zero_marked_suffixes(sa, omp_block_start as FastSint, omp_block_size as FastSint)
10233                as FastSint;
10234    }
10235
10236    let mut count = f as FastSint;
10237    for omp_thread_num in 0..omp_num_threads {
10238        let omp_block_start = omp_thread_num * omp_block_stride;
10239        let omp_block_size = if omp_thread_num + 1 < omp_num_threads {
10240            omp_block_stride
10241        } else {
10242            m_usize - omp_block_start
10243        };
10244
10245        merge_nonunique_lms_suffixes_32s(
10246            sa,
10247            n,
10248            m,
10249            count,
10250            omp_block_start as FastSint,
10251            omp_block_size as FastSint,
10252        );
10253        count += thread_state[omp_thread_num].count;
10254    }
10255}
10256
/// Internal helper: merge compacted lms suffixes 32s (OpenMP variant).
///
/// Calls `merge_unique_lms_suffixes_32s_omp` and then
/// `merge_nonunique_lms_suffixes_32s_omp` on the same `sa`. The unique merge
/// additionally receives `t`; the non-unique merge additionally receives `f`.
/// `n`, `m`, `threads` and `thread_state` are forwarded to both.
#[doc(hidden)]
pub fn merge_compacted_lms_suffixes_32s_omp(
    t: &mut [SaSint],
    sa: &mut [SaSint],
    n: SaSint,
    m: SaSint,
    f: SaSint,
    threads: SaSint,
    thread_state: &mut [ThreadState],
) {
    // Unique suffixes first, then the non-unique ones.
    merge_unique_lms_suffixes_32s_omp(t, sa, n, m, threads, thread_state);
    merge_nonunique_lms_suffixes_32s_omp(sa, n, m, f, threads, thread_state);
}
10271
10272/// Internal helper: reconstruct compacted lms suffixes 32s 2k (OpenMP variant).
10273#[doc(hidden)]
10274pub fn reconstruct_compacted_lms_suffixes_32s_2k_omp(
10275    t: &mut [SaSint],
10276    sa: &mut [SaSint],
10277    n: SaSint,
10278    k: SaSint,
10279    m: SaSint,
10280    fs: SaSint,
10281    f: SaSint,
10282    buckets: &mut [SaSint],
10283    local_buckets: SaSint,
10284    threads: SaSint,
10285    thread_state: &mut [ThreadState],
10286) {
10287    if f > 0 {
10288        let dst = usize::try_from(n - m - 1).expect("destination must be non-negative");
10289        let src = usize::try_from(n + fs - m).expect("source must be non-negative");
10290        let len = usize::try_from(f).expect("length must be non-negative");
10291        sa.copy_within(src..src + len, dst);
10292
10293        let _ = count_and_gather_compacted_lms_suffixes_32s_2k_omp(
10294            t,
10295            sa,
10296            n,
10297            k,
10298            buckets,
10299            local_buckets,
10300            threads,
10301            thread_state,
10302        );
10303        reconstruct_lms_suffixes_omp(sa, n, m - f, threads);
10304
10305        let src_copy = 0usize;
10306        let dst_copy = usize::try_from(n - m - 1 + f).expect("destination must be non-negative");
10307        let copy_len = usize::try_from(m - f).expect("copy length must be non-negative");
10308        sa.copy_within(src_copy..src_copy + copy_len, dst_copy);
10309        sa[..usize::try_from(m).expect("m must be non-negative")].fill(0);
10310
10311        merge_compacted_lms_suffixes_32s_omp(t, sa, n, m, f, threads, thread_state);
10312    } else {
10313        let _ = count_and_gather_lms_suffixes_32s_2k(t, sa, n, k, buckets, 0, n as FastSint);
10314        reconstruct_lms_suffixes_omp(sa, n, m, threads);
10315    }
10316}
10317
10318/// Internal helper: reconstruct compacted lms suffixes 32s 1k (OpenMP variant).
10319#[doc(hidden)]
10320pub fn reconstruct_compacted_lms_suffixes_32s_1k_omp(
10321    t: &mut [SaSint],
10322    sa: &mut [SaSint],
10323    n: SaSint,
10324    m: SaSint,
10325    fs: SaSint,
10326    f: SaSint,
10327    threads: SaSint,
10328    thread_state: &mut [ThreadState],
10329) {
10330    if f > 0 {
10331        let dst = usize::try_from(n - m - 1).expect("destination must be non-negative");
10332        let src = usize::try_from(n + fs - m).expect("source must be non-negative");
10333        let len = usize::try_from(f).expect("length must be non-negative");
10334        sa.copy_within(src..src + len, dst);
10335
10336        let _ = gather_compacted_lms_suffixes_32s(t, sa, n);
10337        reconstruct_lms_suffixes_omp(sa, n, m - f, threads);
10338
10339        let dst_copy = usize::try_from(n - m - 1 + f).expect("destination must be non-negative");
10340        let copy_len = usize::try_from(m - f).expect("copy length must be non-negative");
10341        sa.copy_within(0..copy_len, dst_copy);
10342        sa[..usize::try_from(m).expect("m must be non-negative")].fill(0);
10343
10344        merge_compacted_lms_suffixes_32s_omp(t, sa, n, m, f, threads, thread_state);
10345    } else {
10346        let _ = gather_lms_suffixes_32s(t, sa, n);
10347        reconstruct_lms_suffixes_omp(sa, n, m, threads);
10348    }
10349}
10350
10351fn normalize_omp_threads(threads: SaSint) -> SaSint {
10352    if threads > 0 {
10353        threads
10354    } else {
10355        std::thread::available_parallelism()
10356            .map(|value| value.get() as SaSint)
10357            .unwrap_or(1)
10358            .max(1)
10359    }
10360}
10361
10362fn libsais_main_32s_recursion(
10363    t_ptr: *mut SaSint,
10364    sa_ptr: *mut SaSint,
10365    sa_capacity: usize,
10366    n: SaSint,
10367    k: SaSint,
10368    fs: SaSint,
10369    threads: SaSint,
10370    thread_state: &mut [ThreadState],
10371    _local_buffer: &mut [SaSint],
10372) -> SaSint {
10373    let fs = fs.min(SAINT_MAX - n);
10374    let local_buffer_size = SaSint::try_from(LIBSAIS_LOCAL_BUFFER_SIZE).expect("fits");
10375    let n_usize = usize::try_from(n).expect("n must be non-negative");
10376    let fs_usize = usize::try_from(fs).expect("fs must be non-negative");
10377    let total_len = n_usize + fs_usize;
10378    assert!(total_len <= sa_capacity);
10379
10380    if k > 0 && ((fs / k) >= 6 || (local_buffer_size / k) >= 6) {
10381        let k_usize = usize::try_from(k).expect("k must be non-negative");
10382        let alignment = if fs >= 1024 && ((fs - 1024) / k) >= 6 {
10383            1024usize
10384        } else {
10385            16usize
10386        };
10387        let need = 6 * k_usize;
10388        let use_local_buffer = local_buffer_size > fs;
10389        let mut bucket_free_space = SaSint::from(use_local_buffer);
10390        let buckets_ptr = if use_local_buffer {
10391            _local_buffer.as_mut_ptr()
10392        } else {
10393            unsafe {
10394                let sa = std::slice::from_raw_parts_mut(sa_ptr, total_len);
10395                let start =
10396                    if fs_usize >= need + alignment && ((fs_usize - alignment) / k_usize) >= 6 {
10397                        let byte_ptr = sa[total_len - need - alignment..].as_mut_ptr() as usize;
10398                        let aligned = align_up(byte_ptr, alignment * mem::size_of::<SaSint>());
10399                        (aligned - sa_ptr as usize) / mem::size_of::<SaSint>()
10400                    } else {
10401                        total_len - need
10402                    };
10403                bucket_free_space =
10404                    SaSint::try_from(start - n_usize).expect("bucket free space must fit SaSint");
10405                sa[start..].as_mut_ptr()
10406            }
10407        };
10408
10409        let m = unsafe {
10410            let t = std::slice::from_raw_parts_mut(t_ptr, n_usize);
10411            let sa = std::slice::from_raw_parts_mut(sa_ptr, total_len);
10412            let buckets = std::slice::from_raw_parts_mut(buckets_ptr, need);
10413            count_and_gather_lms_suffixes_32s_4k_omp(
10414                t,
10415                sa,
10416                n,
10417                k,
10418                buckets,
10419                bucket_free_space,
10420                threads,
10421                thread_state,
10422            )
10423        };
10424        if m > 1 {
10425            let m_usize = usize::try_from(m).expect("m must be non-negative");
10426            unsafe {
10427                let sa = std::slice::from_raw_parts_mut(sa_ptr, total_len);
10428                sa[..n_usize - m_usize].fill(0);
10429            }
10430
10431            let first_lms_suffix = unsafe {
10432                let sa = std::slice::from_raw_parts_mut(sa_ptr, total_len);
10433                sa[n_usize - m_usize]
10434            };
10435            let left_suffixes_count = unsafe {
10436                let buckets = std::slice::from_raw_parts_mut(buckets_ptr, need);
10437                initialize_buckets_for_lms_suffixes_radix_sort_32s_6k(
10438                    std::slice::from_raw_parts_mut(t_ptr, n_usize),
10439                    k,
10440                    buckets,
10441                    first_lms_suffix,
10442                )
10443            };
10444
10445            unsafe {
10446                let t = std::slice::from_raw_parts_mut(t_ptr, n_usize);
10447                let sa = std::slice::from_raw_parts_mut(sa_ptr, total_len);
10448                let buckets = std::slice::from_raw_parts_mut(buckets_ptr, need);
10449                let (_, induction_bucket) = buckets.split_at_mut(4 * k_usize);
10450                radix_sort_lms_suffixes_32s_6k_omp(
10451                    t,
10452                    sa,
10453                    n,
10454                    m,
10455                    induction_bucket,
10456                    threads,
10457                    thread_state,
10458                );
10459                if (n / 8192) < k {
10460                    radix_sort_set_markers_32s_6k_omp(sa, k, induction_bucket, threads);
10461                }
10462                if threads > 1 && n >= 65_536 {
10463                    sa[n_usize - m_usize..n_usize].fill(0);
10464                }
10465                initialize_buckets_for_partial_sorting_32s_6k(
10466                    t,
10467                    k,
10468                    buckets,
10469                    first_lms_suffix,
10470                    left_suffixes_count,
10471                );
10472                induce_partial_order_32s_6k_omp(
10473                    t,
10474                    sa,
10475                    n,
10476                    k,
10477                    buckets,
10478                    first_lms_suffix,
10479                    left_suffixes_count,
10480                    threads,
10481                    thread_state,
10482                );
10483            }
10484
10485            let names = unsafe {
10486                let sa = std::slice::from_raw_parts_mut(sa_ptr, total_len);
10487                if (n / 8192) < k {
10488                    renumber_and_mark_distinct_lms_suffixes_32s_4k_omp(
10489                        sa,
10490                        n,
10491                        m,
10492                        threads,
10493                        thread_state,
10494                    )
10495                } else {
10496                    renumber_and_gather_lms_suffixes_omp(sa, n, m, fs, threads, thread_state)
10497                }
10498            };
10499
10500            if names < m {
10501                let f = if (n / 8192) < k {
10502                    unsafe {
10503                        let t = std::slice::from_raw_parts_mut(t_ptr, n_usize);
10504                        let sa = std::slice::from_raw_parts_mut(sa_ptr, total_len);
10505                        compact_lms_suffixes_32s_omp(t, sa, n, m, fs, threads, thread_state)
10506                    }
10507                } else {
10508                    0
10509                };
10510
10511                let new_t_start =
10512                    total_len - usize::try_from(m - f).expect("m - f must be non-negative");
10513                let recursive_n = m - f;
10514                let recursive_fs = fs + n - 2 * m + f;
10515                if libsais_main_32s_recursion(
10516                    unsafe {
10517                        std::slice::from_raw_parts_mut(sa_ptr, total_len)[new_t_start..]
10518                            .as_mut_ptr()
10519                    },
10520                    sa_ptr,
10521                    sa_capacity,
10522                    recursive_n,
10523                    names - f,
10524                    recursive_fs,
10525                    threads,
10526                    thread_state,
10527                    _local_buffer,
10528                ) != 0
10529                {
10530                    return -2;
10531                }
10532
10533                unsafe {
10534                    let t = std::slice::from_raw_parts_mut(t_ptr, n_usize);
10535                    let sa = std::slice::from_raw_parts_mut(sa_ptr, total_len);
10536                    let buckets = std::slice::from_raw_parts_mut(buckets_ptr, need);
10537                    reconstruct_compacted_lms_suffixes_32s_2k_omp(
10538                        t,
10539                        sa,
10540                        n,
10541                        k,
10542                        m,
10543                        fs,
10544                        f,
10545                        buckets,
10546                        SaSint::from(use_local_buffer),
10547                        threads,
10548                        thread_state,
10549                    );
10550                }
10551            } else {
10552                unsafe {
10553                    let t = std::slice::from_raw_parts_mut(t_ptr, n_usize);
10554                    let buckets = std::slice::from_raw_parts_mut(buckets_ptr, need);
10555                    count_lms_suffixes_32s_2k(t, n, k, buckets);
10556                }
10557            }
10558
10559            unsafe {
10560                let buckets = std::slice::from_raw_parts_mut(buckets_ptr, need);
10561                initialize_buckets_start_and_end_32s_4k(k, buckets);
10562                let sa = std::slice::from_raw_parts_mut(sa_ptr, total_len);
10563                place_lms_suffixes_histogram_32s_4k(sa, n, k, m, buckets);
10564                let t = std::slice::from_raw_parts_mut(t_ptr, n_usize);
10565                induce_final_order_32s_4k(t, sa, n, k, buckets, threads, thread_state);
10566            }
10567        } else {
10568            unsafe {
10569                let sa = std::slice::from_raw_parts_mut(sa_ptr, total_len);
10570                sa[0] = sa[n_usize - 1];
10571            }
10572
10573            unsafe {
10574                let buckets = std::slice::from_raw_parts_mut(buckets_ptr, need);
10575                initialize_buckets_start_and_end_32s_6k(k, buckets);
10576                let sa = std::slice::from_raw_parts_mut(sa_ptr, total_len);
10577                place_lms_suffixes_histogram_32s_6k(sa, n, k, m, buckets);
10578                let t = std::slice::from_raw_parts_mut(t_ptr, n_usize);
10579                induce_final_order_32s_6k(t, sa, n, k, buckets, threads, thread_state);
10580            }
10581        }
10582
10583        return 0;
10584    } else if k > 0 && n <= SAINT_MAX / 2 && ((fs / k) >= 4 || (local_buffer_size / k) >= 4) {
10585        let k_usize = usize::try_from(k).expect("k must be non-negative");
10586        let alignment = if fs >= 1024 && ((fs - 1024) / k) >= 4 {
10587            1024usize
10588        } else {
10589            16usize
10590        };
10591        let need = 4 * k_usize;
10592        let use_local_buffer = local_buffer_size > fs;
10593        let mut bucket_free_space = SaSint::from(use_local_buffer);
10594        let buckets_ptr = if use_local_buffer {
10595            _local_buffer.as_mut_ptr()
10596        } else {
10597            unsafe {
10598                let sa = std::slice::from_raw_parts_mut(sa_ptr, total_len);
10599                let start =
10600                    if fs_usize >= need + alignment && ((fs_usize - alignment) / k_usize) >= 4 {
10601                        let byte_ptr = sa[total_len - need - alignment..].as_mut_ptr() as usize;
10602                        let aligned = align_up(byte_ptr, alignment * mem::size_of::<SaSint>());
10603                        (aligned - sa_ptr as usize) / mem::size_of::<SaSint>()
10604                    } else {
10605                        total_len - need
10606                    };
10607                bucket_free_space =
10608                    SaSint::try_from(start - n_usize).expect("bucket free space must fit SaSint");
10609                sa[start..].as_mut_ptr()
10610            }
10611        };
10612
10613        let m = unsafe {
10614            let t = std::slice::from_raw_parts_mut(t_ptr, n_usize);
10615            let sa = std::slice::from_raw_parts_mut(sa_ptr, total_len);
10616            let buckets = std::slice::from_raw_parts_mut(buckets_ptr, need);
10617            count_and_gather_lms_suffixes_32s_2k_omp(
10618                t,
10619                sa,
10620                n,
10621                k,
10622                buckets,
10623                bucket_free_space,
10624                threads,
10625                thread_state,
10626            )
10627        };
10628        if m > 1 {
10629            let m_usize = usize::try_from(m).expect("m must be non-negative");
10630            unsafe {
10631                let t = std::slice::from_raw_parts_mut(t_ptr, n_usize);
10632                let buckets = std::slice::from_raw_parts_mut(buckets_ptr, need);
10633                let sa = std::slice::from_raw_parts_mut(sa_ptr, total_len);
10634                initialize_buckets_for_radix_and_partial_sorting_32s_4k(
10635                    t,
10636                    k,
10637                    buckets,
10638                    sa[n_usize - m_usize],
10639                );
10640                let (_, induction_bucket) = buckets.split_at_mut(1);
10641                radix_sort_lms_suffixes_32s_2k_omp(
10642                    t,
10643                    sa,
10644                    n,
10645                    m,
10646                    induction_bucket,
10647                    threads,
10648                    thread_state,
10649                );
10650                radix_sort_set_markers_32s_4k_omp(sa, k, induction_bucket, threads);
10651                place_lms_suffixes_interval_32s_4k(sa, n, k, m - 1, buckets);
10652                induce_partial_order_32s_4k_omp(t, sa, n, k, buckets, threads, thread_state);
10653            }
10654
10655            let names = unsafe {
10656                let sa = std::slice::from_raw_parts_mut(sa_ptr, total_len);
10657                renumber_and_mark_distinct_lms_suffixes_32s_4k_omp(sa, n, m, threads, thread_state)
10658            };
10659            if names < m {
10660                let f = unsafe {
10661                    let t = std::slice::from_raw_parts_mut(t_ptr, n_usize);
10662                    let sa = std::slice::from_raw_parts_mut(sa_ptr, total_len);
10663                    compact_lms_suffixes_32s_omp(t, sa, n, m, fs, threads, thread_state)
10664                };
10665
10666                let new_t_start =
10667                    total_len - usize::try_from(m - f).expect("m - f must be non-negative");
10668                if libsais_main_32s_recursion(
10669                    unsafe {
10670                        std::slice::from_raw_parts_mut(sa_ptr, total_len)[new_t_start..]
10671                            .as_mut_ptr()
10672                    },
10673                    sa_ptr,
10674                    sa_capacity,
10675                    m - f,
10676                    names - f,
10677                    fs + n - 2 * m + f,
10678                    threads,
10679                    thread_state,
10680                    _local_buffer,
10681                ) != 0
10682                {
10683                    return -2;
10684                }
10685
10686                unsafe {
10687                    let t = std::slice::from_raw_parts_mut(t_ptr, n_usize);
10688                    let sa = std::slice::from_raw_parts_mut(sa_ptr, total_len);
10689                    let buckets = std::slice::from_raw_parts_mut(buckets_ptr, need);
10690                    reconstruct_compacted_lms_suffixes_32s_2k_omp(
10691                        t,
10692                        sa,
10693                        n,
10694                        k,
10695                        m,
10696                        fs,
10697                        f,
10698                        buckets,
10699                        SaSint::from(use_local_buffer),
10700                        threads,
10701                        thread_state,
10702                    );
10703                }
10704            } else {
10705                unsafe {
10706                    let t = std::slice::from_raw_parts_mut(t_ptr, n_usize);
10707                    let buckets = std::slice::from_raw_parts_mut(buckets_ptr, need);
10708                    count_lms_suffixes_32s_2k(t, n, k, buckets);
10709                }
10710            }
10711        } else {
10712            unsafe {
10713                let sa = std::slice::from_raw_parts_mut(sa_ptr, total_len);
10714                sa[0] = sa[n_usize - 1];
10715            }
10716        }
10717
10718        unsafe {
10719            let buckets = std::slice::from_raw_parts_mut(buckets_ptr, need);
10720            initialize_buckets_start_and_end_32s_4k(k, buckets);
10721            let sa = std::slice::from_raw_parts_mut(sa_ptr, total_len);
10722            place_lms_suffixes_histogram_32s_4k(sa, n, k, m, buckets);
10723            let t = std::slice::from_raw_parts_mut(t_ptr, n_usize);
10724            induce_final_order_32s_4k(t, sa, n, k, buckets, threads, thread_state);
10725        }
10726
10727        return 0;
10728    } else if k > 0 && ((fs / k) >= 2 || (local_buffer_size / k) >= 2) {
10729        let k_usize = usize::try_from(k).expect("k must be non-negative");
10730        let alignment = if fs >= 1024 && ((fs - 1024) / k) >= 2 {
10731            1024usize
10732        } else {
10733            16usize
10734        };
10735        let need = 2 * k_usize;
10736        let use_local_buffer = local_buffer_size > fs;
10737        let mut bucket_free_space = SaSint::from(use_local_buffer);
10738        let buckets_ptr = if use_local_buffer {
10739            _local_buffer.as_mut_ptr()
10740        } else {
10741            unsafe {
10742                let sa = std::slice::from_raw_parts_mut(sa_ptr, total_len);
10743                let start =
10744                    if fs_usize >= need + alignment && ((fs_usize - alignment) / k_usize) >= 2 {
10745                        let byte_ptr = sa[total_len - need - alignment..].as_mut_ptr() as usize;
10746                        let aligned = align_up(byte_ptr, alignment * mem::size_of::<SaSint>());
10747                        (aligned - sa_ptr as usize) / mem::size_of::<SaSint>()
10748                    } else {
10749                        total_len - need
10750                    };
10751                bucket_free_space =
10752                    SaSint::try_from(start - n_usize).expect("bucket free space must fit SaSint");
10753                sa[start..].as_mut_ptr()
10754            }
10755        };
10756
10757        let m = unsafe {
10758            let t = std::slice::from_raw_parts_mut(t_ptr, n_usize);
10759            let sa = std::slice::from_raw_parts_mut(sa_ptr, total_len);
10760            let buckets = std::slice::from_raw_parts_mut(buckets_ptr, need);
10761            count_and_gather_lms_suffixes_32s_2k_omp(
10762                t,
10763                sa,
10764                n,
10765                k,
10766                buckets,
10767                bucket_free_space,
10768                threads,
10769                thread_state,
10770            )
10771        };
10772        if m > 1 {
10773            let m_usize = usize::try_from(m).expect("m must be non-negative");
10774            unsafe {
10775                let t = std::slice::from_raw_parts_mut(t_ptr, n_usize);
10776                let buckets = std::slice::from_raw_parts_mut(buckets_ptr, need);
10777                let sa = std::slice::from_raw_parts_mut(sa_ptr, total_len);
10778                initialize_buckets_for_lms_suffixes_radix_sort_32s_2k(
10779                    t,
10780                    k,
10781                    buckets,
10782                    sa[n_usize - m_usize],
10783                );
10784                let (_, induction_bucket) = buckets.split_at_mut(1);
10785                radix_sort_lms_suffixes_32s_2k_omp(
10786                    t,
10787                    sa,
10788                    n,
10789                    m,
10790                    induction_bucket,
10791                    threads,
10792                    thread_state,
10793                );
10794                place_lms_suffixes_interval_32s_2k(sa, n, k, m - 1, buckets);
10795            }
10796
10797            unsafe {
10798                let buckets = std::slice::from_raw_parts_mut(buckets_ptr, need);
10799                initialize_buckets_start_and_end_32s_2k(k, buckets);
10800                let t = std::slice::from_raw_parts_mut(t_ptr, n_usize);
10801                let sa = std::slice::from_raw_parts_mut(sa_ptr, total_len);
10802                induce_partial_order_32s_2k_omp(t, sa, n, k, buckets, threads, thread_state);
10803            }
10804
10805            let names = unsafe {
10806                let t = std::slice::from_raw_parts_mut(t_ptr, n_usize);
10807                let sa = std::slice::from_raw_parts_mut(sa_ptr, total_len);
10808                renumber_and_mark_distinct_lms_suffixes_32s_1k_omp(t, sa, n, m, threads)
10809            };
10810            if names < m {
10811                let f = unsafe {
10812                    let t = std::slice::from_raw_parts_mut(t_ptr, n_usize);
10813                    let sa = std::slice::from_raw_parts_mut(sa_ptr, total_len);
10814                    compact_lms_suffixes_32s_omp(t, sa, n, m, fs, threads, thread_state)
10815                };
10816
10817                let new_t_start =
10818                    total_len - usize::try_from(m - f).expect("m - f must be non-negative");
10819                if libsais_main_32s_recursion(
10820                    unsafe {
10821                        std::slice::from_raw_parts_mut(sa_ptr, total_len)[new_t_start..]
10822                            .as_mut_ptr()
10823                    },
10824                    sa_ptr,
10825                    sa_capacity,
10826                    m - f,
10827                    names - f,
10828                    fs + n - 2 * m + f,
10829                    threads,
10830                    thread_state,
10831                    _local_buffer,
10832                ) != 0
10833                {
10834                    return -2;
10835                }
10836
10837                unsafe {
10838                    let t = std::slice::from_raw_parts_mut(t_ptr, n_usize);
10839                    let sa = std::slice::from_raw_parts_mut(sa_ptr, total_len);
10840                    let buckets = std::slice::from_raw_parts_mut(buckets_ptr, need);
10841                    reconstruct_compacted_lms_suffixes_32s_2k_omp(
10842                        t,
10843                        sa,
10844                        n,
10845                        k,
10846                        m,
10847                        fs,
10848                        f,
10849                        buckets,
10850                        SaSint::from(use_local_buffer),
10851                        threads,
10852                        thread_state,
10853                    );
10854                }
10855            } else {
10856                unsafe {
10857                    let t = std::slice::from_raw_parts_mut(t_ptr, n_usize);
10858                    let buckets = std::slice::from_raw_parts_mut(buckets_ptr, need);
10859                    count_lms_suffixes_32s_2k(t, n, k, buckets);
10860                }
10861            }
10862        } else {
10863            unsafe {
10864                let sa = std::slice::from_raw_parts_mut(sa_ptr, total_len);
10865                sa[0] = sa[n_usize - 1];
10866            }
10867        }
10868
10869        unsafe {
10870            let buckets = std::slice::from_raw_parts_mut(buckets_ptr, need);
10871            initialize_buckets_end_32s_2k(k, buckets);
10872            let sa = std::slice::from_raw_parts_mut(sa_ptr, total_len);
10873            place_lms_suffixes_histogram_32s_2k(sa, n, k, m, buckets);
10874        }
10875
10876        unsafe {
10877            let buckets = std::slice::from_raw_parts_mut(buckets_ptr, need);
10878            initialize_buckets_start_and_end_32s_2k(k, buckets);
10879            let t = std::slice::from_raw_parts_mut(t_ptr, n_usize);
10880            let sa = std::slice::from_raw_parts_mut(sa_ptr, total_len);
10881            induce_final_order_32s_2k(t, sa, n, k, buckets, threads, thread_state);
10882        }
10883
10884        return 0;
10885    } else {
10886        let k_usize = usize::try_from(k).expect("k must be non-negative");
10887        let mut heap_buckets = if fs < k { Some(vec![0; k_usize]) } else { None };
10888        let alignment = if fs >= 1024 && (fs - 1024) >= k {
10889            1024usize
10890        } else {
10891            16usize
10892        };
10893        let mut buckets_ptr = if let Some(ref mut heap) = heap_buckets {
10894            heap.as_mut_ptr()
10895        } else {
10896            unsafe {
10897                let sa = std::slice::from_raw_parts_mut(sa_ptr, total_len);
10898                let start = if fs_usize >= k_usize + alignment {
10899                    let byte_ptr = sa[total_len - k_usize - alignment..].as_mut_ptr() as usize;
10900                    let aligned = align_up(byte_ptr, alignment * mem::size_of::<SaSint>());
10901                    (aligned - sa_ptr as usize) / mem::size_of::<SaSint>()
10902                } else {
10903                    total_len - k_usize
10904                };
10905                sa[start..].as_mut_ptr()
10906            }
10907        };
10908
10909        if buckets_ptr.is_null() {
10910            return -2;
10911        }
10912
10913        unsafe {
10914            let sa = std::slice::from_raw_parts_mut(sa_ptr, total_len);
10915            sa[..n_usize].fill(0);
10916        }
10917
10918        unsafe {
10919            let t = std::slice::from_raw_parts_mut(t_ptr, n_usize);
10920            let buckets = std::slice::from_raw_parts_mut(buckets_ptr, k_usize);
10921            count_suffixes_32s(t, n, k, buckets);
10922        }
10923        unsafe {
10924            let buckets = std::slice::from_raw_parts_mut(buckets_ptr, k_usize);
10925            initialize_buckets_end_32s_1k(k, buckets);
10926        }
10927
10928        let m = unsafe {
10929            let t = std::slice::from_raw_parts_mut(t_ptr, n_usize);
10930            let sa = std::slice::from_raw_parts_mut(sa_ptr, total_len);
10931            let buckets = std::slice::from_raw_parts_mut(buckets_ptr, k_usize);
10932            radix_sort_lms_suffixes_32s_1k(t, sa, n, buckets)
10933        };
10934        if m > 1 {
10935            unsafe {
10936                let t = std::slice::from_raw_parts_mut(t_ptr, n_usize);
10937                let sa = std::slice::from_raw_parts_mut(sa_ptr, total_len);
10938                let buckets = std::slice::from_raw_parts_mut(buckets_ptr, k_usize);
10939                induce_partial_order_32s_1k_omp(t, sa, n, k, buckets, threads, thread_state);
10940            }
10941
10942            let names = unsafe {
10943                let t = std::slice::from_raw_parts_mut(t_ptr, n_usize);
10944                let sa = std::slice::from_raw_parts_mut(sa_ptr, total_len);
10945                renumber_and_mark_distinct_lms_suffixes_32s_1k_omp(t, sa, n, m, threads)
10946            };
10947            if names < m {
10948                if heap_buckets.is_some() {
10949                    let _ = heap_buckets.take();
10950                    buckets_ptr = std::ptr::null_mut();
10951                }
10952
10953                let f = unsafe {
10954                    let t = std::slice::from_raw_parts_mut(t_ptr, n_usize);
10955                    let sa = std::slice::from_raw_parts_mut(sa_ptr, total_len);
10956                    compact_lms_suffixes_32s_omp(t, sa, n, m, fs, threads, thread_state)
10957                };
10958
10959                let new_t_start =
10960                    total_len - usize::try_from(m - f).expect("m - f must be non-negative");
10961                if libsais_main_32s_recursion(
10962                    unsafe {
10963                        std::slice::from_raw_parts_mut(sa_ptr, total_len)[new_t_start..]
10964                            .as_mut_ptr()
10965                    },
10966                    sa_ptr,
10967                    sa_capacity,
10968                    m - f,
10969                    names - f,
10970                    fs + n - 2 * m + f,
10971                    threads,
10972                    thread_state,
10973                    _local_buffer,
10974                ) != 0
10975                {
10976                    return -2;
10977                }
10978
10979                unsafe {
10980                    let t = std::slice::from_raw_parts_mut(t_ptr, n_usize);
10981                    let sa = std::slice::from_raw_parts_mut(sa_ptr, total_len);
10982                    reconstruct_compacted_lms_suffixes_32s_1k_omp(
10983                        t,
10984                        sa,
10985                        n,
10986                        m,
10987                        fs,
10988                        f,
10989                        threads,
10990                        thread_state,
10991                    );
10992                }
10993
10994                if buckets_ptr.is_null() {
10995                    heap_buckets = Some(vec![0; k_usize]);
10996                    buckets_ptr = heap_buckets
10997                        .as_mut()
10998                        .expect("heap buckets must exist")
10999                        .as_mut_ptr();
11000                    if buckets_ptr.is_null() {
11001                        return -2;
11002                    }
11003                }
11004            }
11005
11006            unsafe {
11007                let t = std::slice::from_raw_parts_mut(t_ptr, n_usize);
11008                let buckets = std::slice::from_raw_parts_mut(buckets_ptr, k_usize);
11009                count_suffixes_32s(t, n, k, buckets);
11010            }
11011            unsafe {
11012                let buckets = std::slice::from_raw_parts_mut(buckets_ptr, k_usize);
11013                initialize_buckets_end_32s_1k(k, buckets);
11014            }
11015            unsafe {
11016                let t = std::slice::from_raw_parts_mut(t_ptr, n_usize);
11017                let sa = std::slice::from_raw_parts_mut(sa_ptr, total_len);
11018                let buckets = std::slice::from_raw_parts_mut(buckets_ptr, k_usize);
11019                place_lms_suffixes_interval_32s_1k(t, sa, k, m, buckets);
11020            }
11021        }
11022
11023        unsafe {
11024            let t = std::slice::from_raw_parts_mut(t_ptr, n_usize);
11025            let sa = std::slice::from_raw_parts_mut(sa_ptr, total_len);
11026            let buckets = std::slice::from_raw_parts_mut(buckets_ptr, k_usize);
11027            induce_final_order_32s_1k(t, sa, n, k, buckets, threads, thread_state);
11028        }
11029
11030        0
11031    }
11032}
11033
11034fn libsais_main_32s_entry(
11035    t: &mut [SaSint],
11036    sa: &mut [SaSint],
11037    n: SaSint,
11038    k: SaSint,
11039    fs: SaSint,
11040    threads: SaSint,
11041    thread_state: &mut [ThreadState],
11042) -> SaSint {
11043    let mut local_buffer = [0; 2 * LIBSAIS_LOCAL_BUFFER_SIZE];
11044    libsais_main_32s_recursion(
11045        t.as_mut_ptr(),
11046        sa.as_mut_ptr(),
11047        sa.len(),
11048        n,
11049        k,
11050        fs,
11051        threads,
11052        thread_state,
11053        &mut local_buffer[LIBSAIS_LOCAL_BUFFER_SIZE..],
11054    )
11055}
11056
11057fn libsais_main_8u(
11058    t: &[u8],
11059    sa: &mut [SaSint],
11060    buckets: &mut [SaSint],
11061    flags: SaSint,
11062    r: SaSint,
11063    i: Option<&mut [SaSint]>,
11064    fs: SaSint,
11065    freq: Option<&mut [SaSint]>,
11066    threads: SaSint,
11067    thread_state: &mut [ThreadState],
11068) -> SaSint {
11069    let n = SaSint::try_from(t.len()).expect("input length must fit SaSint");
11070    let n_usize = usize::try_from(n).expect("n must be non-negative");
11071    let fs = fs.min(SAINT_MAX - n);
11072
11073    let m = count_and_gather_lms_suffixes_8u_omp(t, sa, n, buckets, threads, thread_state);
11074    let k = initialize_buckets_start_and_end_8u(buckets, freq);
11075
11076    if (flags & LIBSAIS_FLAGS_GSA) != 0 && (buckets[0] != 0 || buckets[2] != 0 || buckets[3] != 1) {
11077        return -1;
11078    }
11079
11080    if m > 0 {
11081        let m_usize = usize::try_from(m).expect("m must be non-negative");
11082        let first_lms_suffix = sa[n_usize - m_usize];
11083        let left_suffixes_count =
11084            initialize_buckets_for_lms_suffixes_radix_sort_8u(t, buckets, first_lms_suffix);
11085
11086        if threads > 1 && n >= 65_536 {
11087            sa[..n_usize - m_usize].fill(0);
11088        }
11089        radix_sort_lms_suffixes_8u_omp(t, sa, n, m, flags, buckets, threads, thread_state);
11090        if threads > 1 && n >= 65_536 {
11091            sa[n_usize - m_usize..n_usize].fill(0);
11092        }
11093
11094        initialize_buckets_for_partial_sorting_8u(
11095            t,
11096            buckets,
11097            first_lms_suffix,
11098            left_suffixes_count,
11099        );
11100        induce_partial_order_8u_omp(
11101            t,
11102            sa,
11103            n,
11104            k,
11105            flags,
11106            buckets,
11107            first_lms_suffix,
11108            left_suffixes_count,
11109            threads,
11110            thread_state,
11111        );
11112
11113        let names = renumber_and_gather_lms_suffixes_omp(sa, n, m, fs, threads, thread_state);
11114        if names < m {
11115            let recursive_text_start =
11116                n_usize + usize::try_from(fs).expect("fs must be non-negative") - m_usize;
11117            let recursive_fs = fs + n - 2 * m;
11118
11119            let index = libsais_main_32s_entry(
11120                unsafe {
11121                    std::slice::from_raw_parts_mut(sa[recursive_text_start..].as_mut_ptr(), m_usize)
11122                },
11123                sa,
11124                m,
11125                names,
11126                recursive_fs,
11127                threads,
11128                thread_state,
11129            );
11130
11131            if index != 0 {
11132                return -2;
11133            }
11134
11135            gather_lms_suffixes_8u_omp(t, sa, n, threads, thread_state);
11136            reconstruct_lms_suffixes_omp(sa, n, m, threads);
11137        }
11138
11139        place_lms_suffixes_interval_8u(sa, n, m, flags, buckets);
11140    } else {
11141        sa[..n_usize].fill(0);
11142    }
11143
11144    induce_final_order_8u_omp(t, sa, n, k, flags, r, i, buckets, threads, thread_state)
11145}
11146
11147fn libsais_main(
11148    t: &[u8],
11149    sa: &mut [SaSint],
11150    flags: SaSint,
11151    r: SaSint,
11152    i: Option<&mut [SaSint]>,
11153    fs: SaSint,
11154    freq: Option<&mut [SaSint]>,
11155    threads: SaSint,
11156) -> SaSint {
11157    let threads = normalize_omp_threads(threads);
11158    if threads > 1 {
11159        let mut thread_state = match alloc_thread_state(threads) {
11160            Some(thread_state) => thread_state,
11161            None => return -2,
11162        };
11163        let mut buckets = vec![0; 8 * ALPHABET_SIZE];
11164
11165        libsais_main_8u(
11166            t,
11167            sa,
11168            &mut buckets,
11169            flags,
11170            r,
11171            i,
11172            fs,
11173            freq,
11174            threads,
11175            &mut thread_state,
11176        )
11177    } else {
11178        let mut thread_state = [];
11179        let mut buckets = [0; 8 * ALPHABET_SIZE];
11180
11181        libsais_main_8u(
11182            t,
11183            sa,
11184            &mut buckets,
11185            flags,
11186            r,
11187            i,
11188            fs,
11189            freq,
11190            threads,
11191            &mut thread_state,
11192        )
11193    }
11194}
11195
11196fn libsais_main_int(
11197    t: &mut [SaSint],
11198    sa: &mut [SaSint],
11199    k: SaSint,
11200    fs: SaSint,
11201    threads: SaSint,
11202) -> SaSint {
11203    let threads = normalize_omp_threads(threads);
11204    let mut thread_state = if threads > 1 {
11205        match alloc_thread_state(threads) {
11206            Some(thread_state) => thread_state,
11207            None => return -2,
11208        }
11209    } else {
11210        Vec::new()
11211    };
11212
11213    libsais_main_32s_entry(
11214        t,
11215        sa,
11216        SaSint::try_from(t.len()).expect("input length must fit SaSint"),
11217        k,
11218        fs,
11219        threads,
11220        &mut thread_state,
11221    )
11222}
11223
/// Unprefixed alias for `libsais_main_32s_recursion`.
///
/// Forwards every argument unchanged, in the same order, and returns the
/// callee's status code.
// NOTE(review): `dead_code` is presumably allowed because this alias is not
// called in every build configuration — confirm before removing it.
#[allow(dead_code)]
fn main_32s_recursion(
    t_ptr: *mut SaSint,
    sa_ptr: *mut SaSint,
    sa_capacity: usize,
    n: SaSint,
    k: SaSint,
    fs: SaSint,
    threads: SaSint,
    thread_state: &mut [ThreadState],
    local_buffer: &mut [SaSint],
) -> SaSint {
    libsais_main_32s_recursion(
        t_ptr,
        sa_ptr,
        sa_capacity,
        n,
        k,
        fs,
        threads,
        thread_state,
        local_buffer,
    )
}
11248
/// Unprefixed alias for `libsais_main_32s_entry`.
///
/// Forwards every argument unchanged and returns the callee's status code.
// NOTE(review): `dead_code` is presumably allowed because this alias is not
// called in every build configuration — confirm before removing it.
#[allow(dead_code)]
fn main_32s_entry(
    t: &mut [SaSint],
    sa: &mut [SaSint],
    n: SaSint,
    k: SaSint,
    fs: SaSint,
    threads: SaSint,
    thread_state: &mut [ThreadState],
) -> SaSint {
    libsais_main_32s_entry(t, sa, n, k, fs, threads, thread_state)
}
11261
/// Unprefixed alias for `libsais_main_8u`.
///
/// Forwards every argument unchanged and returns the callee's status code.
// NOTE(review): `dead_code` is presumably allowed because this alias is not
// called in every build configuration — confirm before removing it.
#[allow(dead_code)]
fn main_8u(
    t: &[u8],
    sa: &mut [SaSint],
    buckets: &mut [SaSint],
    flags: SaSint,
    r: SaSint,
    i: Option<&mut [SaSint]>,
    fs: SaSint,
    freq: Option<&mut [SaSint]>,
    threads: SaSint,
    thread_state: &mut [ThreadState],
) -> SaSint {
    libsais_main_8u(t, sa, buckets, flags, r, i, fs, freq, threads, thread_state)
}
11277
/// Unprefixed alias for `libsais_main_int`.
///
/// Forwards every argument unchanged and returns the callee's status code.
// NOTE(review): `dead_code` is presumably allowed because this alias is not
// called in every build configuration — confirm before removing it.
#[allow(dead_code)]
fn main_int(t: &mut [SaSint], sa: &mut [SaSint], k: SaSint, fs: SaSint, threads: SaSint) -> SaSint {
    libsais_main_int(t, sa, k, fs, threads)
}
11282
11283fn libsais_main_ctx(
11284    ctx: &mut Context,
11285    t: &[u8],
11286    sa: &mut [SaSint],
11287    flags: SaSint,
11288    r: SaSint,
11289    i: Option<&mut [SaSint]>,
11290    fs: SaSint,
11291    freq: Option<&mut [SaSint]>,
11292) -> SaSint {
11293    if ctx.threads <= 0 || ctx.buckets.len() != 8 * ALPHABET_SIZE {
11294        return -2;
11295    }
11296
11297    let mut empty_thread_state = [];
11298    let thread_state = if ctx.threads > 1 {
11299        match ctx.thread_state.as_deref_mut() {
11300            Some(thread_state) if thread_state.len() >= ctx.threads as usize => thread_state,
11301            None => return -2,
11302            Some(_) => return -2,
11303        }
11304    } else {
11305        &mut empty_thread_state
11306    };
11307
11308    libsais_main_8u(
11309        t,
11310        sa,
11311        &mut ctx.buckets,
11312        flags,
11313        r,
11314        i,
11315        fs,
11316        freq,
11317        ctx.threads as SaSint,
11318        thread_state,
11319    )
11320}
11321
// Foreign declarations used by the upstream-C wrappers; compiled only with the
// `upstream-c` feature.
// NOTE(review): the symbols are presumably provided by the bundled C build —
// confirm against the build script.
#[cfg(feature = "upstream-c")]
unsafe extern "C" {
    /// C entry point called by `libsais_upstream_c`.
    ///
    /// Receives the text pointer, the suffix-array pointer, the text length `n`,
    /// the free space `fs`, and a frequency-table pointer; the Rust wrapper
    /// passes a null `freq` when no table was requested.
    fn probe_public_libsais_freq(
        t: *const u8,
        sa: *mut SaSint,
        n: SaSint,
        fs: SaSint,
        freq: *mut SaSint,
    ) -> SaSint;

    /// Same parameters as `probe_public_libsais_freq` plus a `threads` count.
    // NOTE(review): presumably the multi-threaded (`libsais_omp`) counterpart —
    // confirm against the C shim.
    fn probe_public_libsais_omp_freq(
        t: *const u8,
        sa: *mut SaSint,
        n: SaSint,
        fs: SaSint,
        freq: *mut SaSint,
        threads: SaSint,
    ) -> SaSint;
}
11341
/// Calls the bundled upstream C `libsais` implementation.
///
/// Only compiled with the `upstream-c` feature. The arguments are validated on the Rust side
/// and then handed to the C library, which does all of the work; the Rust counterpart is
/// [`libsais`].
///
/// - `t` (`[0..n-1]`): the input string.
/// - `sa` (`[0..n-1+fs]`): the output array of suffixes.
/// - `fs`: extra space available at the end of `sa`.
/// - `freq` (`[0..255]`): optional output symbol frequency table.
///
/// Returns `-1` when `fs` is negative, `t` is longer than `SaSint::MAX`, `sa` is shorter than
/// `t.len() + fs`, or `freq` has fewer than `ALPHABET_SIZE` entries. Otherwise returns the
/// status reported by the C routine.
#[cfg(feature = "upstream-c")]
pub fn libsais_upstream_c(
    t: &[u8],
    sa: &mut [SaSint],
    fs: SaSint,
    freq: Option<&mut [SaSint]>,
) -> SaSint {
    // A negative `fs` fails this conversion, which covers the `fs < 0` rejection.
    let Ok(extra) = usize::try_from(fs) else {
        return -1;
    };
    let input_len = t.len();
    if input_len > SaSint::MAX as usize || sa.len() < input_len.saturating_add(extra) {
        return -1;
    }
    if matches!(freq.as_deref(), Some(table) if table.len() < ALPHABET_SIZE) {
        return -1;
    }

    let freq_ptr = match freq {
        Some(table) => table.as_mut_ptr(),
        None => std::ptr::null_mut(),
    };
    // SAFETY: every pointer comes from a live slice whose length was checked above, and
    // `freq_ptr` is null when no table was supplied. This assumes the C routine touches at most
    // `n` input bytes, `n + fs` output slots and `ALPHABET_SIZE` frequency slots.
    unsafe {
        probe_public_libsais_freq(
            t.as_ptr(),
            sa.as_mut_ptr(),
            input_len as SaSint,
            fs,
            freq_ptr,
        )
    }
}
11377
/// Calls the bundled upstream C `libsais_omp` implementation.
///
/// Only compiled with the `upstream-c` feature. The arguments are validated on the Rust side
/// and then handed to the C library, which does all of the work; the Rust counterpart is
/// [`libsais_omp`].
///
/// - `t` (`[0..n-1]`): the input string.
/// - `sa` (`[0..n-1+fs]`): the output array of suffixes.
/// - `fs`: extra space available at the end of `sa`.
/// - `freq` (`[0..255]`): optional output symbol frequency table.
/// - `threads`: number of worker threads; `0` is forwarded to the C side as `1`.
///
/// Returns `-1` when `threads` or `fs` is negative, `t` is longer than `SaSint::MAX`, `sa` is
/// shorter than `t.len() + fs`, or `freq` has fewer than `ALPHABET_SIZE` entries. Otherwise
/// returns the status reported by the C routine.
#[cfg(feature = "upstream-c")]
pub fn libsais_upstream_c_omp(
    t: &[u8],
    sa: &mut [SaSint],
    fs: SaSint,
    freq: Option<&mut [SaSint]>,
    threads: SaSint,
) -> SaSint {
    // A negative `fs` fails this conversion, which covers the `fs < 0` rejection.
    let Ok(extra) = usize::try_from(fs) else {
        return -1;
    };
    let input_len = t.len();
    let arguments_ok = threads >= 0
        && input_len <= SaSint::MAX as usize
        && sa.len() >= input_len.saturating_add(extra);
    if !arguments_ok {
        return -1;
    }
    if matches!(freq.as_deref(), Some(table) if table.len() < ALPHABET_SIZE) {
        return -1;
    }

    let freq_ptr = match freq {
        Some(table) => table.as_mut_ptr(),
        None => std::ptr::null_mut(),
    };
    // SAFETY: every pointer comes from a live slice whose length was checked above, and
    // `freq_ptr` is null when no table was supplied. This assumes the C routine touches at most
    // `n` input bytes, `n + fs` output slots and `ALPHABET_SIZE` frequency slots.
    unsafe {
        probe_public_libsais_omp_freq(
            t.as_ptr(),
            sa.as_mut_ptr(),
            input_len as SaSint,
            fs,
            freq_ptr,
            threads.max(1),
        )
    }
}
11420
11421/// Constructs the suffix array of a given string.
11422///
11423/// - `t` (`[0..n-1]`): the input string.
11424/// - `sa` (`[0..n-1+fs]`): the output array of suffixes.
11425/// - `fs`: extra space available at the end of `sa` (0 should be enough for most cases).
11426/// - `freq` (`[0..255]`): optional output symbol frequency table.
11427///
11428/// Returns 0 on success, -1 or -2 on error.
11429pub fn libsais(t: &[u8], sa: &mut [SaSint], fs: SaSint, freq: Option<&mut [SaSint]>) -> SaSint {
11430    if fs < 0
11431        || sa.len()
11432            < t.len()
11433                .saturating_add(usize::try_from(fs).unwrap_or(usize::MAX))
11434    {
11435        return -1;
11436    }
11437    if let Some(freq) = freq.as_ref() {
11438        if freq.len() < ALPHABET_SIZE {
11439            return -1;
11440        }
11441    }
11442
11443    let n = t.len();
11444    if n <= 1 {
11445        if let Some(freq) = freq {
11446            freq[..ALPHABET_SIZE].fill(0);
11447            if n == 1 {
11448                freq[t[0] as usize] += 1;
11449            }
11450        }
11451        if n == 1 {
11452            sa[0] = 0;
11453        }
11454        return 0;
11455    }
11456
11457    libsais_main(t, sa, LIBSAIS_FLAGS_NONE, 0, None, fs, freq, 1)
11458}
11459
11460/// Constructs the generalized suffix array (GSA) of a given string set.
11461///
11462/// - `t` (`[0..n-1]`): the input string set using 0 as separators (`t[n-1]` must be 0).
11463/// - `sa` (`[0..n-1+fs]`): the output array of suffixes.
11464/// - `fs`: extra space available at the end of `sa` (0 should be enough for most cases).
11465/// - `freq` (`[0..255]`): optional output symbol frequency table.
11466///
11467/// Returns 0 on success, -1 or -2 on error.
11468pub fn libsais_gsa(t: &[u8], sa: &mut [SaSint], fs: SaSint, freq: Option<&mut [SaSint]>) -> SaSint {
11469    if fs < 0
11470        || sa.len()
11471            < t.len()
11472                .saturating_add(usize::try_from(fs).unwrap_or(usize::MAX))
11473    {
11474        return -1;
11475    }
11476    if let Some(freq) = freq.as_ref() {
11477        if freq.len() < ALPHABET_SIZE {
11478            return -1;
11479        }
11480    }
11481
11482    let n = t.len();
11483    if n > 0 && t[n - 1] != 0 {
11484        return -1;
11485    }
11486
11487    if n <= 1 {
11488        if let Some(freq) = freq {
11489            freq[..ALPHABET_SIZE].fill(0);
11490            if n == 1 {
11491                freq[t[0] as usize] += 1;
11492            }
11493        }
11494        if n == 1 {
11495            sa[0] = 0;
11496        }
11497        return 0;
11498    }
11499
11500    libsais_main(t, sa, LIBSAIS_FLAGS_GSA, 0, None, fs, freq, 1)
11501}
11502
11503/// Constructs the suffix array of a given integer array.
11504///
11505/// During construction the input array is modified, but restored at the end if no error occurred.
11506///
11507/// - `t` (`[0..n-1]`): the input integer array.
11508/// - `sa` (`[0..n-1+fs]`): the output array of suffixes.
11509/// - `k`: the alphabet size of the input integer array.
11510/// - `fs`: extra space available at the end of `sa` (can be 0, but 4k or better 6k is recommended for optimal performance).
11511///
11512/// Returns 0 on success, -1 or -2 on error.
11513pub fn libsais_int(t: &mut [SaSint], sa: &mut [SaSint], k: SaSint, fs: SaSint) -> SaSint {
11514    if fs < 0
11515        || sa.len()
11516            < t.len()
11517                .saturating_add(usize::try_from(fs).unwrap_or(usize::MAX))
11518    {
11519        return -1;
11520    }
11521
11522    if t.len() <= 1 {
11523        if t.len() == 1 {
11524            sa[0] = 0;
11525        }
11526        return 0;
11527    }
11528
11529    libsais_main_int(t, sa, k, fs, 1)
11530}
11531
11532/// Constructs the suffix array of a given string using a libsais context.
11533///
11534/// - `ctx`: the libsais context.
11535/// - `t` (`[0..n-1]`): the input string.
11536/// - `sa` (`[0..n-1+fs]`): the output array of suffixes.
11537/// - `fs`: extra space available at the end of `sa` (0 should be enough for most cases).
11538/// - `freq` (`[0..255]`): optional output symbol frequency table.
11539///
11540/// Returns 0 on success, -1 or -2 on error.
11541pub fn libsais_ctx(
11542    ctx: &mut Context,
11543    t: &[u8],
11544    sa: &mut [SaSint],
11545    fs: SaSint,
11546    freq: Option<&mut [SaSint]>,
11547) -> SaSint {
11548    if fs < 0
11549        || sa.len()
11550            < t.len()
11551                .saturating_add(usize::try_from(fs).unwrap_or(usize::MAX))
11552    {
11553        return -1;
11554    }
11555    if let Some(freq) = freq.as_ref() {
11556        if freq.len() < ALPHABET_SIZE {
11557            return -1;
11558        }
11559    }
11560
11561    let n = t.len();
11562    if n <= 1 {
11563        if let Some(freq) = freq {
11564            freq[..ALPHABET_SIZE].fill(0);
11565            if n == 1 {
11566                freq[t[0] as usize] += 1;
11567            }
11568        }
11569        if n == 1 {
11570            sa[0] = 0;
11571        }
11572        return 0;
11573    }
11574
11575    libsais_main_ctx(ctx, t, sa, LIBSAIS_FLAGS_NONE, 0, None, fs, freq)
11576}
11577
11578/// Constructs the generalized suffix array (GSA) of a given string set using a libsais context.
11579///
11580/// - `ctx`: the libsais context.
11581/// - `t` (`[0..n-1]`): the input string set using 0 as separators (`t[n-1]` must be 0).
11582/// - `sa` (`[0..n-1+fs]`): the output array of suffixes.
11583/// - `fs`: extra space available at the end of `sa` (0 should be enough for most cases).
11584/// - `freq` (`[0..255]`): optional output symbol frequency table.
11585///
11586/// Returns 0 on success, -1 or -2 on error.
11587pub fn libsais_gsa_ctx(
11588    ctx: &mut Context,
11589    t: &[u8],
11590    sa: &mut [SaSint],
11591    fs: SaSint,
11592    freq: Option<&mut [SaSint]>,
11593) -> SaSint {
11594    if fs < 0
11595        || sa.len()
11596            < t.len()
11597                .saturating_add(usize::try_from(fs).unwrap_or(usize::MAX))
11598    {
11599        return -1;
11600    }
11601    if let Some(freq) = freq.as_ref() {
11602        if freq.len() < ALPHABET_SIZE {
11603            return -1;
11604        }
11605    }
11606
11607    let n = t.len();
11608    if n > 0 && t[n - 1] != 0 {
11609        return -1;
11610    }
11611
11612    if n <= 1 {
11613        if let Some(freq) = freq {
11614            freq[..ALPHABET_SIZE].fill(0);
11615            if n == 1 {
11616                freq[t[0] as usize] += 1;
11617            }
11618        }
11619        if n == 1 {
11620            sa[0] = 0;
11621        }
11622        return 0;
11623    }
11624
11625    libsais_main_ctx(ctx, t, sa, LIBSAIS_FLAGS_GSA, 0, None, fs, freq)
11626}
11627
11628/// Constructs the Burrows-Wheeler transformed string (BWT) of a given string.
11629///
11630/// - `t` (`[0..n-1]`): the input string.
11631/// - `u` (`[0..n-1]`): the output string (can alias `t`).
11632/// - `a` (`[0..n-1+fs]`): the temporary array.
11633/// - `fs`: extra space available at the end of `a` (0 should be enough for most cases).
11634/// - `freq` (`[0..255]`): optional output symbol frequency table.
11635///
11636/// Returns the primary index on success, -1 or -2 on error.
11637pub fn libsais_bwt(
11638    t: &[u8],
11639    u: &mut [u8],
11640    a: &mut [SaSint],
11641    fs: SaSint,
11642    freq: Option<&mut [SaSint]>,
11643) -> SaSint {
11644    if fs < 0
11645        || u.len() < t.len()
11646        || a.len()
11647            < t.len()
11648                .saturating_add(usize::try_from(fs).unwrap_or(usize::MAX))
11649    {
11650        return -1;
11651    }
11652    if let Some(freq) = freq.as_ref() {
11653        if freq.len() < ALPHABET_SIZE {
11654            return -1;
11655        }
11656    }
11657
11658    let n = t.len();
11659    if n <= 1 {
11660        if let Some(freq) = freq {
11661            freq[..ALPHABET_SIZE].fill(0);
11662            if n == 1 {
11663                u[0] = t[0];
11664                freq[t[0] as usize] += 1;
11665            }
11666        } else if n == 1 {
11667            u[0] = t[0];
11668        }
11669        return n as SaSint;
11670    }
11671
11672    let mut index = libsais_main(t, a, LIBSAIS_FLAGS_BWT, 0, None, fs, freq, 1);
11673    if index >= 0 {
11674        index += 1;
11675        let split = usize::try_from(index).expect("index must be non-negative");
11676        u[0] = t[n - 1];
11677        bwt_copy_8u_omp(&mut u[1..split], &a[..split - 1], index - 1, 1);
11678        bwt_copy_8u_omp(
11679            &mut u[split..n],
11680            &a[split..n],
11681            SaSint::try_from(n - split).expect("fits"),
11682            1,
11683        );
11684    }
11685    index
11686}
11687
11688/// Constructs the Burrows-Wheeler transformed string (BWT) of a given string with auxiliary indexes.
11689///
11690/// - `t` (`[0..n-1]`): the input string.
11691/// - `u` (`[0..n-1]`): the output string (can alias `t`).
11692/// - `a` (`[0..n-1+fs]`): the temporary array.
11693/// - `fs`: extra space available at the end of `a` (0 should be enough for most cases).
11694/// - `freq` (`[0..255]`): optional output symbol frequency table.
11695/// - `r`: sampling rate for the auxiliary indexes (must be a power of two).
11696/// - `i` (`[0..(n-1)/r]`): output auxiliary indexes.
11697///
11698/// Returns 0 on success, -1 or -2 on error.
11699pub fn libsais_bwt_aux(
11700    t: &[u8],
11701    u: &mut [u8],
11702    a: &mut [SaSint],
11703    fs: SaSint,
11704    freq: Option<&mut [SaSint]>,
11705    r: SaSint,
11706    i: &mut [SaSint],
11707) -> SaSint {
11708    let n = t.len();
11709    if fs < 0
11710        || r < 2
11711        || (r & (r - 1)) != 0
11712        || u.len() < n
11713        || a.len() < n.saturating_add(usize::try_from(fs).unwrap_or(usize::MAX))
11714        || freq.as_ref().is_some_and(|freq| freq.len() < ALPHABET_SIZE)
11715    {
11716        return -1;
11717    }
11718    let sample_count = if n == 0 {
11719        1
11720    } else {
11721        usize::try_from((SaSint::try_from(n).expect("input length must fit SaSint") - 1) / r)
11722            .expect("sample count must be non-negative")
11723            + 1
11724    };
11725    if i.len() < sample_count {
11726        return -1;
11727    }
11728
11729    if n <= 1 {
11730        if let Some(freq) = freq {
11731            freq[..ALPHABET_SIZE].fill(0);
11732            if n == 1 {
11733                u[0] = t[0];
11734                freq[t[0] as usize] += 1;
11735            }
11736        } else if n == 1 {
11737            u[0] = t[0];
11738        }
11739        i[0] = n as SaSint;
11740        return 0;
11741    }
11742
11743    let index = libsais_main(t, a, LIBSAIS_FLAGS_BWT, r, Some(i), fs, freq, 1);
11744    if index == 0 {
11745        let split = usize::try_from(i[0]).expect("primary index must be non-negative");
11746        u[0] = t[n - 1];
11747        bwt_copy_8u_omp(&mut u[1..split], &a[..split - 1], i[0] - 1, 1);
11748        bwt_copy_8u_omp(
11749            &mut u[split..n],
11750            &a[split..n],
11751            SaSint::try_from(n - split).expect("fits"),
11752            1,
11753        );
11754    }
11755    index
11756}
11757
11758/// Constructs the Burrows-Wheeler transformed string (BWT) of a given string using a libsais context.
11759///
11760/// - `ctx`: the libsais context.
11761/// - `t` (`[0..n-1]`): the input string.
11762/// - `u` (`[0..n-1]`): the output string (can alias `t`).
11763/// - `a` (`[0..n-1+fs]`): the temporary array.
11764/// - `fs`: extra space available at the end of `a` (0 should be enough for most cases).
11765/// - `freq` (`[0..255]`): optional output symbol frequency table.
11766///
11767/// Returns the primary index on success, -1 or -2 on error.
11768pub fn libsais_bwt_ctx(
11769    ctx: &mut Context,
11770    t: &[u8],
11771    u: &mut [u8],
11772    a: &mut [SaSint],
11773    fs: SaSint,
11774    freq: Option<&mut [SaSint]>,
11775) -> SaSint {
11776    if fs < 0
11777        || u.len() < t.len()
11778        || a.len()
11779            < t.len()
11780                .saturating_add(usize::try_from(fs).unwrap_or(usize::MAX))
11781    {
11782        return -1;
11783    }
11784    if let Some(freq) = freq.as_ref() {
11785        if freq.len() < ALPHABET_SIZE {
11786            return -1;
11787        }
11788    }
11789
11790    let n = t.len();
11791    if n <= 1 {
11792        if let Some(freq) = freq {
11793            freq[..ALPHABET_SIZE].fill(0);
11794            if n == 1 {
11795                u[0] = t[0];
11796                freq[t[0] as usize] += 1;
11797            }
11798        } else if n == 1 {
11799            u[0] = t[0];
11800        }
11801        return n as SaSint;
11802    }
11803
11804    let mut index = libsais_main_ctx(ctx, t, a, LIBSAIS_FLAGS_BWT, 0, None, fs, freq);
11805    if index >= 0 {
11806        index += 1;
11807        let split = usize::try_from(index).expect("index must be non-negative");
11808        u[0] = t[n - 1];
11809        bwt_copy_8u_omp(
11810            &mut u[1..split],
11811            &a[..split - 1],
11812            index - 1,
11813            ctx.threads as SaSint,
11814        );
11815        bwt_copy_8u_omp(
11816            &mut u[split..n],
11817            &a[split..n],
11818            SaSint::try_from(n - split).expect("fits"),
11819            ctx.threads as SaSint,
11820        );
11821    }
11822    index
11823}
11824
11825/// Constructs the BWT of a given string with auxiliary indexes using a libsais context.
11826///
11827/// - `ctx`: the libsais context.
11828/// - `t` (`[0..n-1]`): the input string.
11829/// - `u` (`[0..n-1]`): the output string (can alias `t`).
11830/// - `a` (`[0..n-1+fs]`): the temporary array.
11831/// - `fs`: extra space available at the end of `a` (0 should be enough for most cases).
11832/// - `freq` (`[0..255]`): optional output symbol frequency table.
11833/// - `r`: sampling rate for the auxiliary indexes (must be a power of two).
11834/// - `i` (`[0..(n-1)/r]`): output auxiliary indexes.
11835///
11836/// Returns 0 on success, -1 or -2 on error.
11837pub fn libsais_bwt_aux_ctx(
11838    ctx: &mut Context,
11839    t: &[u8],
11840    u: &mut [u8],
11841    a: &mut [SaSint],
11842    fs: SaSint,
11843    freq: Option<&mut [SaSint]>,
11844    r: SaSint,
11845    i: &mut [SaSint],
11846) -> SaSint {
11847    let n = t.len();
11848    if fs < 0
11849        || r < 2
11850        || (r & (r - 1)) != 0
11851        || u.len() < n
11852        || a.len() < n.saturating_add(usize::try_from(fs).unwrap_or(usize::MAX))
11853    {
11854        return -1;
11855    }
11856    if let Some(freq) = freq.as_ref() {
11857        if freq.len() < ALPHABET_SIZE {
11858            return -1;
11859        }
11860    }
11861    let sample_count = if n == 0 {
11862        1
11863    } else {
11864        usize::try_from((SaSint::try_from(n).expect("input length must fit SaSint") - 1) / r)
11865            .expect("sample count must be non-negative")
11866            + 1
11867    };
11868    if i.len() < sample_count {
11869        return -1;
11870    }
11871
11872    if n <= 1 {
11873        if let Some(freq) = freq {
11874            freq[..ALPHABET_SIZE].fill(0);
11875            if n == 1 {
11876                u[0] = t[0];
11877                freq[t[0] as usize] += 1;
11878            }
11879        } else if n == 1 {
11880            u[0] = t[0];
11881        }
11882        i[0] = n as SaSint;
11883        return 0;
11884    }
11885
11886    let index = libsais_main_ctx(ctx, t, a, LIBSAIS_FLAGS_BWT, r, Some(i), fs, freq);
11887    if index == 0 {
11888        let split = usize::try_from(i[0]).expect("primary index must be non-negative");
11889        u[0] = t[n - 1];
11890        bwt_copy_8u_omp(
11891            &mut u[1..split],
11892            &a[..split - 1],
11893            i[0] - 1,
11894            ctx.threads as SaSint,
11895        );
11896        bwt_copy_8u_omp(
11897            &mut u[split..n],
11898            &a[split..n],
11899            SaSint::try_from(n - split).expect("fits"),
11900            ctx.threads as SaSint,
11901        );
11902    }
11903    index
11904}
11905
11906/// Creates the libsais context for parallel operations using OpenMP-style threading.
11907///
11908/// In multi-threaded environments, use one context per thread for parallel executions.
11909///
11910/// - `threads`: number of worker threads (can be 0 for the implementation default).
11911///
11912/// Returns the context, or `None` on allocation failure.
11913pub fn create_ctx_omp(threads: SaSint) -> Option<Context> {
11914    if threads < 0 {
11915        return None;
11916    }
11917
11918    create_ctx_main(normalize_omp_threads(threads))
11919}
11920
11921/// Constructs the suffix array of a given string in parallel using OpenMP-style threading.
11922///
11923/// - `t` (`[0..n-1]`): the input string.
11924/// - `sa` (`[0..n-1+fs]`): the output array of suffixes.
11925/// - `fs`: extra space available at the end of `sa` (0 should be enough for most cases).
11926/// - `freq` (`[0..255]`): optional output symbol frequency table.
11927/// - `threads`: number of worker threads (can be 0 for the implementation default).
11928///
11929/// Returns 0 on success, -1 or -2 on error.
11930pub fn libsais_omp(
11931    t: &[u8],
11932    sa: &mut [SaSint],
11933    fs: SaSint,
11934    freq: Option<&mut [SaSint]>,
11935    threads: SaSint,
11936) -> SaSint {
11937    if threads < 0 {
11938        return -1;
11939    }
11940    if let Some(freq) = freq.as_ref() {
11941        if freq.len() < ALPHABET_SIZE {
11942            return -1;
11943        }
11944    }
11945    let n = t.len();
11946    if n <= 1 {
11947        if let Some(freq) = freq {
11948            freq[..ALPHABET_SIZE].fill(0);
11949            if n == 1 {
11950                sa[0] = 0;
11951                freq[t[0] as usize] += 1;
11952            }
11953        } else if n == 1 {
11954            sa[0] = 0;
11955        }
11956        return 0;
11957    }
11958
11959    libsais_main(
11960        t,
11961        sa,
11962        LIBSAIS_FLAGS_NONE,
11963        0,
11964        None,
11965        fs,
11966        freq,
11967        normalize_omp_threads(threads),
11968    )
11969}
11970
11971/// Constructs the generalized suffix array (GSA) of a given string set in parallel using OpenMP-style threading.
11972///
11973/// - `t` (`[0..n-1]`): the input string set using 0 as separators (`t[n-1]` must be 0).
11974/// - `sa` (`[0..n-1+fs]`): the output array of suffixes.
11975/// - `fs`: extra space available at the end of `sa` (0 should be enough for most cases).
11976/// - `freq` (`[0..255]`): optional output symbol frequency table.
11977/// - `threads`: number of worker threads (can be 0 for the implementation default).
11978///
11979/// Returns 0 on success, -1 or -2 on error.
11980pub fn libsais_gsa_omp(
11981    t: &[u8],
11982    sa: &mut [SaSint],
11983    fs: SaSint,
11984    freq: Option<&mut [SaSint]>,
11985    threads: SaSint,
11986) -> SaSint {
11987    if threads < 0 || t.last().copied().unwrap_or(0) != 0 {
11988        return -1;
11989    }
11990    if let Some(freq) = freq.as_ref() {
11991        if freq.len() < ALPHABET_SIZE {
11992            return -1;
11993        }
11994    }
11995    let n = t.len();
11996    if n <= 1 {
11997        if let Some(freq) = freq {
11998            freq[..ALPHABET_SIZE].fill(0);
11999            if n == 1 {
12000                sa[0] = 0;
12001                freq[t[0] as usize] += 1;
12002            }
12003        } else if n == 1 {
12004            sa[0] = 0;
12005        }
12006        return 0;
12007    }
12008
12009    libsais_main(
12010        t,
12011        sa,
12012        LIBSAIS_FLAGS_GSA,
12013        0,
12014        None,
12015        fs,
12016        freq,
12017        normalize_omp_threads(threads),
12018    )
12019}
12020
12021/// Constructs the suffix array of a given integer array in parallel using OpenMP-style threading.
12022///
12023/// During construction the input array is modified, but restored at the end if no error occurred.
12024///
12025/// - `t` (`[0..n-1]`): the input integer array.
12026/// - `sa` (`[0..n-1+fs]`): the output array of suffixes.
12027/// - `k`: the alphabet size of the input integer array.
12028/// - `fs`: extra space available at the end of `sa` (can be 0, but 4k or better 6k is recommended for optimal performance).
12029/// - `threads`: number of worker threads (can be 0 for the implementation default).
12030///
12031/// Returns 0 on success, -1 or -2 on error.
12032pub fn libsais_int_omp(
12033    t: &mut [SaSint],
12034    sa: &mut [SaSint],
12035    k: SaSint,
12036    fs: SaSint,
12037    threads: SaSint,
12038) -> SaSint {
12039    if threads < 0 {
12040        return -1;
12041    }
12042    if t.len() <= 1 {
12043        if t.len() == 1 {
12044            sa[0] = 0;
12045        }
12046        return 0;
12047    }
12048
12049    libsais_main_int(t, sa, k, fs, normalize_omp_threads(threads))
12050}
12051
12052/// Constructs the Burrows-Wheeler transformed string (BWT) of a given string in parallel using OpenMP-style threading.
12053///
12054/// - `t` (`[0..n-1]`): the input string.
12055/// - `u` (`[0..n-1]`): the output string (can alias `t`).
12056/// - `a` (`[0..n-1+fs]`): the temporary array.
12057/// - `fs`: extra space available at the end of `a` (0 should be enough for most cases).
12058/// - `freq` (`[0..255]`): optional output symbol frequency table.
12059/// - `threads`: number of worker threads (can be 0 for the implementation default).
12060///
12061/// Returns the primary index on success, -1 or -2 on error.
12062pub fn libsais_bwt_omp(
12063    t: &[u8],
12064    u: &mut [u8],
12065    a: &mut [SaSint],
12066    fs: SaSint,
12067    freq: Option<&mut [SaSint]>,
12068    threads: SaSint,
12069) -> SaSint {
12070    let n = t.len();
12071    if threads < 0
12072        || fs < 0
12073        || u.len() < n
12074        || a.len() < n.saturating_add(usize::try_from(fs).unwrap_or(usize::MAX))
12075        || freq.as_ref().is_some_and(|freq| freq.len() < ALPHABET_SIZE)
12076    {
12077        return -1;
12078    }
12079
12080    if n <= 1 {
12081        if let Some(freq) = freq {
12082            freq[..ALPHABET_SIZE].fill(0);
12083            if n == 1 {
12084                u[0] = t[0];
12085                freq[t[0] as usize] += 1;
12086            }
12087        } else if n == 1 {
12088            u[0] = t[0];
12089        }
12090        return n as SaSint;
12091    }
12092
12093    let threads = if threads > 0 { threads } else { 1 };
12094    let mut index = libsais_main(t, a, LIBSAIS_FLAGS_BWT, 0, None, fs, freq, threads);
12095    if index >= 0 {
12096        index += 1;
12097        let index_usize = usize::try_from(index).expect("index must be non-negative");
12098        u[0] = t[n - 1];
12099        bwt_copy_8u_omp(
12100            &mut u[1..index_usize],
12101            &a[..index_usize - 1],
12102            index - 1,
12103            threads,
12104        );
12105        bwt_copy_8u_omp(
12106            &mut u[index_usize..n],
12107            &a[index_usize..n],
12108            SaSint::try_from(n - index_usize).expect("fits"),
12109            threads,
12110        );
12111    }
12112    index
12113}
12114
12115/// Constructs the BWT of a given string with auxiliary indexes in parallel using OpenMP-style threading.
12116///
12117/// - `t` (`[0..n-1]`): the input string.
12118/// - `u` (`[0..n-1]`): the output string (can alias `t`).
12119/// - `a` (`[0..n-1+fs]`): the temporary array.
12120/// - `fs`: extra space available at the end of `a` (0 should be enough for most cases).
12121/// - `freq` (`[0..255]`): optional output symbol frequency table.
12122/// - `r`: sampling rate for the auxiliary indexes (must be a power of two).
12123/// - `i` (`[0..(n-1)/r]`): output auxiliary indexes.
12124/// - `threads`: number of worker threads (can be 0 for the implementation default).
12125///
12126/// Returns 0 on success, -1 or -2 on error.
12127pub fn libsais_bwt_aux_omp(
12128    t: &[u8],
12129    u: &mut [u8],
12130    a: &mut [SaSint],
12131    fs: SaSint,
12132    freq: Option<&mut [SaSint]>,
12133    r: SaSint,
12134    i: &mut [SaSint],
12135    threads: SaSint,
12136) -> SaSint {
12137    let n = t.len();
12138    if threads < 0
12139        || fs < 0
12140        || r < 2
12141        || (r & (r - 1)) != 0
12142        || u.len() < n
12143        || a.len() < n.saturating_add(usize::try_from(fs).unwrap_or(usize::MAX))
12144    {
12145        return -1;
12146    }
12147    if let Some(freq) = freq.as_ref() {
12148        if freq.len() < ALPHABET_SIZE {
12149            return -1;
12150        }
12151    }
12152    let sample_count = if n == 0 {
12153        1
12154    } else {
12155        usize::try_from((SaSint::try_from(n).expect("input length must fit SaSint") - 1) / r)
12156            .expect("sample count must be non-negative")
12157            + 1
12158    };
12159    if i.len() < sample_count {
12160        return -1;
12161    }
12162    if n <= 1 {
12163        if let Some(freq) = freq {
12164            freq[..ALPHABET_SIZE].fill(0);
12165            if n == 1 {
12166                u[0] = t[0];
12167                freq[t[0] as usize] += 1;
12168            }
12169        } else if n == 1 {
12170            u[0] = t[0];
12171        }
12172        i[0] = n as SaSint;
12173        return 0;
12174    }
12175
12176    let threads = normalize_omp_threads(threads);
12177    let index = libsais_main(t, a, LIBSAIS_FLAGS_BWT, r, Some(i), fs, freq, threads);
12178    if index == 0 {
12179        let split = usize::try_from(i[0]).expect("primary index must be non-negative");
12180        u[0] = t[n - 1];
12181        bwt_copy_8u_omp(&mut u[1..split], &a[..split - 1], i[0] - 1, threads);
12182        bwt_copy_8u_omp(
12183            &mut u[split..n],
12184            &a[split..n],
12185            SaSint::try_from(n - split).expect("fits"),
12186            threads,
12187        );
12188    }
12189    index
12190}
12191
12192/// Internal helper: compute phi.
12193#[doc(hidden)]
12194pub fn compute_phi(
12195    sa: &[SaSint],
12196    plcp: &mut [SaSint],
12197    n: SaSint,
12198    omp_block_start: FastSint,
12199    omp_block_size: FastSint,
12200) {
12201    let start = usize::try_from(omp_block_start).expect("omp_block_start must be non-negative");
12202    let size = usize::try_from(omp_block_size).expect("omp_block_size must be non-negative");
12203    let end = start + size;
12204    let n_usize = usize::try_from(n).expect("n must be non-negative");
12205    let mut i = start;
12206    let mut k = if omp_block_start > 0 {
12207        sa[start - 1]
12208    } else {
12209        n
12210    };
12211
12212    let fast_end = omp_block_start + omp_block_size - 64 - 3;
12213    while (i as FastSint) < fast_end {
12214        plcp[usize::try_from(sa[i]).expect("suffix index must be non-negative")] = k;
12215        k = sa[i];
12216        plcp[usize::try_from(sa[i + 1]).expect("suffix index must be non-negative")] = k;
12217        k = sa[i + 1];
12218        plcp[usize::try_from(sa[i + 2]).expect("suffix index must be non-negative")] = k;
12219        k = sa[i + 2];
12220        plcp[usize::try_from(sa[i + 3]).expect("suffix index must be non-negative")] = k;
12221        k = sa[i + 3];
12222        i += 4;
12223    }
12224
12225    while i < end.min(n_usize) {
12226        plcp[usize::try_from(sa[i]).expect("suffix index must be non-negative")] = k;
12227        k = sa[i];
12228        i += 1;
12229    }
12230}
12231
12232/// Internal helper: compute phi (OpenMP variant).
12233#[doc(hidden)]
12234pub fn compute_phi_omp(sa: &[SaSint], plcp: &mut [SaSint], n: SaSint, threads: SaSint) {
12235    if threads == 1 || n < 65_536 {
12236        compute_phi(sa, plcp, n, 0, n as FastSint);
12237        return;
12238    }
12239
12240    let threads_usize = usize::try_from(threads).expect("threads must be non-negative");
12241    let block_stride = ((n as FastSint) / (threads as FastSint)) & !15;
12242    let plcp_addr = plcp.as_mut_ptr() as usize;
12243    let n_usize = usize::try_from(n).expect("n must be non-negative");
12244
12245    run_rayon_with_threads(threads_usize, || {
12246        (0..threads_usize).into_par_iter().for_each(|thread| {
12247            let block_start = thread as FastSint * block_stride;
12248            let block_size = if thread + 1 < threads_usize {
12249                block_stride
12250            } else {
12251                n as FastSint - block_start
12252            };
12253            let start = usize::try_from(block_start).expect("omp_block_start must be non-negative");
12254            let size = usize::try_from(block_size).expect("omp_block_size must be non-negative");
12255            let end = start + size;
12256            let mut i = start;
12257            let mut k = if block_start > 0 { sa[start - 1] } else { n };
12258            let plcp_ptr = plcp_addr as *mut SaSint;
12259
12260            let fast_end = block_start + block_size - 64 - 3;
12261            while (i as FastSint) < fast_end {
12262                unsafe {
12263                    // SA is a suffix-array permutation, so each thread writes a disjoint PLCP slot.
12264                    *plcp_ptr
12265                        .add(usize::try_from(sa[i]).expect("suffix index must be non-negative")) =
12266                        k;
12267                    k = sa[i];
12268                    *plcp_ptr.add(
12269                        usize::try_from(sa[i + 1]).expect("suffix index must be non-negative"),
12270                    ) = k;
12271                    k = sa[i + 1];
12272                    *plcp_ptr.add(
12273                        usize::try_from(sa[i + 2]).expect("suffix index must be non-negative"),
12274                    ) = k;
12275                    k = sa[i + 2];
12276                    *plcp_ptr.add(
12277                        usize::try_from(sa[i + 3]).expect("suffix index must be non-negative"),
12278                    ) = k;
12279                    k = sa[i + 3];
12280                }
12281                i += 4;
12282            }
12283
12284            while i < end.min(n_usize) {
12285                unsafe {
12286                    // SA is a suffix-array permutation, so each thread writes a disjoint PLCP slot.
12287                    *plcp_ptr
12288                        .add(usize::try_from(sa[i]).expect("suffix index must be non-negative")) =
12289                        k;
12290                }
12291                k = sa[i];
12292                i += 1;
12293            }
12294        });
12295    });
12296}
12297
12298/// Internal helper: compute plcp.
12299#[doc(hidden)]
12300pub fn compute_plcp(
12301    t: &[u8],
12302    plcp: &mut [SaSint],
12303    n: FastSint,
12304    omp_block_start: FastSint,
12305    omp_block_size: FastSint,
12306) {
12307    let start = usize::try_from(omp_block_start).expect("omp_block_start must be non-negative");
12308    let size = usize::try_from(omp_block_size).expect("omp_block_size must be non-negative");
12309    let end = start + size;
12310    let n_usize = usize::try_from(n).expect("n must be non-negative");
12311    let mut l = 0usize;
12312
12313    for i in start..end.min(n_usize) {
12314        let k = usize::try_from(plcp[i]).expect("phi entry must be non-negative");
12315        let m = n_usize - i.max(k);
12316        while l < m && t[i + l] == t[k + l] {
12317            l += 1;
12318        }
12319        plcp[i] = SaSint::try_from(l).expect("LCP length must fit SaSint");
12320        l = l.saturating_sub(1);
12321    }
12322}
12323
12324/// Internal helper: compute plcp (OpenMP variant).
12325#[doc(hidden)]
12326pub fn compute_plcp_omp(t: &[u8], plcp: &mut [SaSint], n: SaSint, threads: SaSint) {
12327    if threads == 1 || n < 65_536 {
12328        compute_plcp(t, plcp, n as FastSint, 0, n as FastSint);
12329        return;
12330    }
12331
12332    let n_usize = usize::try_from(n).expect("n must be non-negative");
12333    let threads_usize = usize::try_from(threads).expect("threads must be non-negative");
12334    let chunk_size = ((n_usize / threads_usize) & !15usize).max(16);
12335    run_rayon_with_threads(threads_usize, || {
12336        plcp[..n_usize]
12337            .par_chunks_mut(chunk_size)
12338            .enumerate()
12339            .for_each(|(chunk_index, chunk)| {
12340                let start = chunk_index * chunk_size;
12341                let mut l = 0usize;
12342                for (offset, value) in chunk.iter_mut().enumerate() {
12343                    let i = start + offset;
12344                    let k = usize::try_from(*value).expect("phi entry must be non-negative");
12345                    let m = n_usize - i.max(k);
12346                    while l < m && t[i + l] == t[k + l] {
12347                        l += 1;
12348                    }
12349                    *value = SaSint::try_from(l).expect("LCP length must fit SaSint");
12350                    l = l.saturating_sub(1);
12351                }
12352            });
12353    });
12354}
12355
12356/// Internal helper: compute plcp gsa.
12357#[doc(hidden)]
12358pub fn compute_plcp_gsa(
12359    t: &[u8],
12360    plcp: &mut [SaSint],
12361    omp_block_start: FastSint,
12362    omp_block_size: FastSint,
12363) {
12364    let start = usize::try_from(omp_block_start).expect("omp_block_start must be non-negative");
12365    let size = usize::try_from(omp_block_size).expect("omp_block_size must be non-negative");
12366    let end = start + size;
12367    let mut l = 0usize;
12368
12369    for i in start..end.min(t.len()) {
12370        let k = usize::try_from(plcp[i]).expect("phi entry must be non-negative");
12371        while t[i + l] > 0 && t[i + l] == t[k + l] {
12372            l += 1;
12373        }
12374        plcp[i] = SaSint::try_from(l).expect("LCP length must fit SaSint");
12375        l = l.saturating_sub(1);
12376    }
12377}
12378
12379/// Internal helper: compute plcp gsa (OpenMP variant).
12380#[doc(hidden)]
12381pub fn compute_plcp_gsa_omp(t: &[u8], plcp: &mut [SaSint], n: SaSint, threads: SaSint) {
12382    if threads == 1 || n < 65_536 {
12383        compute_plcp_gsa(t, plcp, 0, n as FastSint);
12384        return;
12385    }
12386
12387    let n_usize = usize::try_from(n).expect("n must be non-negative");
12388    let threads_usize = usize::try_from(threads).expect("threads must be non-negative");
12389    let chunk_size = ((n_usize / threads_usize) & !15usize).max(16);
12390    run_rayon_with_threads(threads_usize, || {
12391        plcp[..n_usize]
12392            .par_chunks_mut(chunk_size)
12393            .enumerate()
12394            .for_each(|(chunk_index, chunk)| {
12395                let start = chunk_index * chunk_size;
12396                let mut l = 0usize;
12397                for (offset, value) in chunk.iter_mut().enumerate() {
12398                    let i = start + offset;
12399                    let k = usize::try_from(*value).expect("phi entry must be non-negative");
12400                    while t[i + l] > 0 && t[i + l] == t[k + l] {
12401                        l += 1;
12402                    }
12403                    *value = SaSint::try_from(l).expect("LCP length must fit SaSint");
12404                    l = l.saturating_sub(1);
12405                }
12406            });
12407    });
12408}
12409
12410/// Internal helper: compute plcp int.
12411#[doc(hidden)]
12412pub fn compute_plcp_int(
12413    t: &[SaSint],
12414    plcp: &mut [SaSint],
12415    n: FastSint,
12416    omp_block_start: FastSint,
12417    omp_block_size: FastSint,
12418) {
12419    let start = usize::try_from(omp_block_start).expect("omp_block_start must be non-negative");
12420    let size = usize::try_from(omp_block_size).expect("omp_block_size must be non-negative");
12421    let end = start + size;
12422    let n_usize = usize::try_from(n).expect("n must be non-negative");
12423    let mut l = 0usize;
12424
12425    for i in start..end.min(n_usize) {
12426        let k = usize::try_from(plcp[i]).expect("phi entry must be non-negative");
12427        let m = n_usize - i.max(k);
12428        while l < m && t[i + l] == t[k + l] {
12429            l += 1;
12430        }
12431        plcp[i] = SaSint::try_from(l).expect("LCP length must fit SaSint");
12432        l = l.saturating_sub(1);
12433    }
12434}
12435
12436/// Internal helper: compute plcp int (OpenMP variant).
12437#[doc(hidden)]
12438pub fn compute_plcp_int_omp(t: &[SaSint], plcp: &mut [SaSint], n: SaSint, threads: SaSint) {
12439    if threads == 1 || n < 65_536 {
12440        compute_plcp_int(t, plcp, n as FastSint, 0, n as FastSint);
12441        return;
12442    }
12443
12444    let n_usize = usize::try_from(n).expect("n must be non-negative");
12445    let threads_usize = usize::try_from(threads).expect("threads must be non-negative");
12446    let chunk_size = ((n_usize / threads_usize) & !15usize).max(16);
12447    run_rayon_with_threads(threads_usize, || {
12448        plcp[..n_usize]
12449            .par_chunks_mut(chunk_size)
12450            .enumerate()
12451            .for_each(|(chunk_index, chunk)| {
12452                let start = chunk_index * chunk_size;
12453                let mut l = 0usize;
12454                for (offset, value) in chunk.iter_mut().enumerate() {
12455                    let i = start + offset;
12456                    let k = usize::try_from(*value).expect("phi entry must be non-negative");
12457                    let m = n_usize - i.max(k);
12458                    while l < m && t[i + l] == t[k + l] {
12459                        l += 1;
12460                    }
12461                    *value = SaSint::try_from(l).expect("LCP length must fit SaSint");
12462                    l = l.saturating_sub(1);
12463                }
12464            });
12465    });
12466}
12467
12468/// Internal helper: compute lcp.
12469#[doc(hidden)]
12470pub fn compute_lcp(
12471    plcp: &[SaSint],
12472    sa: &[SaSint],
12473    lcp: &mut [SaSint],
12474    omp_block_start: FastSint,
12475    omp_block_size: FastSint,
12476) {
12477    let start = usize::try_from(omp_block_start).expect("omp_block_start must be non-negative");
12478    let size = usize::try_from(omp_block_size).expect("omp_block_size must be non-negative");
12479    let end = start + size;
12480
12481    for i in start..end.min(sa.len()) {
12482        lcp[i] = plcp[usize::try_from(sa[i]).expect("suffix index must be non-negative")];
12483    }
12484}
12485
12486/// Internal helper: compute lcp (OpenMP variant).
12487#[doc(hidden)]
12488pub fn compute_lcp_omp(
12489    plcp: &[SaSint],
12490    sa: &[SaSint],
12491    lcp: &mut [SaSint],
12492    n: SaSint,
12493    threads: SaSint,
12494) {
12495    if threads == 1 || n < 65_536 {
12496        compute_lcp(plcp, sa, lcp, 0, n as FastSint);
12497        return;
12498    }
12499
12500    let n_usize = usize::try_from(n).expect("n must be non-negative");
12501    assert!(plcp.len() >= n_usize);
12502    assert!(sa.len() >= n_usize);
12503    assert!(lcp.len() >= n_usize);
12504    let threads_usize = usize::try_from(threads).expect("threads must be non-negative");
12505    let chunk_size = ((n_usize / threads_usize) & !15usize).max(16);
12506    let plcp_ptr = plcp.as_ptr() as usize;
12507    let sa_ptr = sa.as_ptr() as usize;
12508    run_rayon_with_threads(threads_usize, || {
12509        lcp[..n_usize]
12510            .par_chunks_mut(chunk_size)
12511            .enumerate()
12512            .for_each(|(chunk_index, chunk)| {
12513                let start = chunk_index * chunk_size;
12514                let dst_ptr = chunk.as_mut_ptr();
12515                let sa_ptr = sa_ptr as *const SaSint;
12516                let plcp_ptr = plcp_ptr as *const SaSint;
12517                for offset in 0..chunk.len() {
12518                    let i = start + offset;
12519                    let suffix = unsafe { *sa_ptr.add(i) };
12520                    let suffix =
12521                        usize::try_from(suffix).expect("suffix index must be non-negative");
12522                    assert!(suffix < plcp.len());
12523                    unsafe {
12524                        *dst_ptr.add(offset) = *plcp_ptr.add(suffix);
12525                    }
12526                }
12527            });
12528    });
12529}
12530
12531/// Constructs the permuted longest common prefix array (PLCP) of a given string and suffix array.
12532///
12533/// - `t` (`[0..n-1]`): the input string.
12534/// - `sa` (`[0..n-1]`): the input suffix array.
12535/// - `plcp` (`[0..n-1]`): the output permuted longest common prefix array.
12536///
12537/// Returns 0 on success, -1 on error.
12538pub fn libsais_plcp(t: &[u8], sa: &[SaSint], plcp: &mut [SaSint]) -> SaSint {
12539    if sa.len() != t.len() || plcp.len() != t.len() {
12540        return -1;
12541    }
12542    if t.len() <= 1 {
12543        if t.len() == 1 {
12544            plcp[0] = 0;
12545        }
12546        return 0;
12547    }
12548
12549    let n = SaSint::try_from(t.len()).expect("input length must fit SaSint");
12550    compute_phi_omp(sa, plcp, n, 1);
12551    compute_plcp_omp(t, plcp, n, 1);
12552    0
12553}
12554
12555/// Constructs the PLCP of a given string set and generalized suffix array (GSA).
12556///
12557/// - `t` (`[0..n-1]`): the input string set using 0 as separators (`t[n-1]` must be 0).
12558/// - `sa` (`[0..n-1]`): the input generalized suffix array.
12559/// - `plcp` (`[0..n-1]`): the output permuted longest common prefix array.
12560///
12561/// Returns 0 on success, -1 on error.
12562pub fn libsais_plcp_gsa(t: &[u8], sa: &[SaSint], plcp: &mut [SaSint]) -> SaSint {
12563    if t.last().copied().unwrap_or(0) != 0 {
12564        return -1;
12565    }
12566    if sa.len() != t.len() || plcp.len() != t.len() {
12567        return -1;
12568    }
12569    if t.len() <= 1 {
12570        if t.len() == 1 {
12571            plcp[0] = 0;
12572        }
12573        return 0;
12574    }
12575
12576    let n = SaSint::try_from(t.len()).expect("input length must fit SaSint");
12577    compute_phi_omp(sa, plcp, n, 1);
12578    compute_plcp_gsa_omp(t, plcp, n, 1);
12579    0
12580}
12581
12582/// Constructs the PLCP of a given integer array and suffix array.
12583///
12584/// - `t` (`[0..n-1]`): the input integer array.
12585/// - `sa` (`[0..n-1]`): the input suffix array.
12586/// - `plcp` (`[0..n-1]`): the output permuted longest common prefix array.
12587///
12588/// Returns 0 on success, -1 on error.
12589pub fn libsais_plcp_int(t: &[SaSint], sa: &[SaSint], plcp: &mut [SaSint]) -> SaSint {
12590    if sa.len() != t.len() || plcp.len() != t.len() {
12591        return -1;
12592    }
12593    if t.len() <= 1 {
12594        if t.len() == 1 {
12595            plcp[0] = 0;
12596        }
12597        return 0;
12598    }
12599
12600    let n = SaSint::try_from(t.len()).expect("input length must fit SaSint");
12601    compute_phi_omp(sa, plcp, n, 1);
12602    compute_plcp_int_omp(t, plcp, n, 1);
12603    0
12604}
12605
12606/// Constructs the longest common prefix array (LCP) from a PLCP and suffix array.
12607///
12608/// - `plcp` (`[0..n-1]`): the input permuted longest common prefix array.
12609/// - `sa` (`[0..n-1]`): the input suffix array or generalized suffix array (GSA).
12610/// - `lcp` (`[0..n-1]`): the output longest common prefix array (can alias `sa`).
12611///
12612/// Returns 0 on success, -1 on error.
12613pub fn libsais_lcp(plcp: &[SaSint], sa: &[SaSint], lcp: &mut [SaSint]) -> SaSint {
12614    if plcp.len() != sa.len() || lcp.len() != sa.len() {
12615        return -1;
12616    }
12617    if sa.len() <= 1 {
12618        if sa.len() == 1 {
12619            lcp[0] = plcp[usize::try_from(sa[0]).expect("suffix index must be non-negative")];
12620        }
12621        return 0;
12622    }
12623
12624    compute_lcp_omp(
12625        plcp,
12626        sa,
12627        lcp,
12628        SaSint::try_from(sa.len()).expect("suffix array length must fit SaSint"),
12629        1,
12630    );
12631    0
12632}
12633
12634/// Constructs the PLCP of a given string and suffix array in parallel using OpenMP-style threading.
12635///
12636/// - `t` (`[0..n-1]`): the input string.
12637/// - `sa` (`[0..n-1]`): the input suffix array.
12638/// - `plcp` (`[0..n-1]`): the output permuted longest common prefix array.
12639/// - `threads`: number of worker threads (can be 0 for the implementation default).
12640///
12641/// Returns 0 on success, -1 on error.
12642pub fn libsais_plcp_omp(t: &[u8], sa: &[SaSint], plcp: &mut [SaSint], threads: SaSint) -> SaSint {
12643    if threads < 0 {
12644        return -1;
12645    }
12646    if sa.len() != t.len() || plcp.len() != t.len() {
12647        return -1;
12648    }
12649    if t.len() <= 1 {
12650        if t.len() == 1 {
12651            plcp[0] = 0;
12652        }
12653        return 0;
12654    }
12655
12656    let n = SaSint::try_from(t.len()).expect("input length must fit SaSint");
12657    let threads = normalize_omp_threads(threads);
12658    compute_phi_omp(sa, plcp, n, threads);
12659    compute_plcp_omp(t, plcp, n, threads);
12660    0
12661}
12662
12663/// Constructs the PLCP of a given string set and GSA in parallel using OpenMP-style threading.
12664///
12665/// - `t` (`[0..n-1]`): the input string set using 0 as separators (`t[n-1]` must be 0).
12666/// - `sa` (`[0..n-1]`): the input generalized suffix array.
12667/// - `plcp` (`[0..n-1]`): the output permuted longest common prefix array.
12668/// - `threads`: number of worker threads (can be 0 for the implementation default).
12669///
12670/// Returns 0 on success, -1 on error.
12671pub fn libsais_plcp_gsa_omp(
12672    t: &[u8],
12673    sa: &[SaSint],
12674    plcp: &mut [SaSint],
12675    threads: SaSint,
12676) -> SaSint {
12677    if threads < 0 || t.last().copied().unwrap_or(0) != 0 {
12678        return -1;
12679    }
12680    if sa.len() != t.len() || plcp.len() != t.len() {
12681        return -1;
12682    }
12683    if t.len() <= 1 {
12684        if t.len() == 1 {
12685            plcp[0] = 0;
12686        }
12687        return 0;
12688    }
12689
12690    let n = SaSint::try_from(t.len()).expect("input length must fit SaSint");
12691    let threads = normalize_omp_threads(threads);
12692    compute_phi_omp(sa, plcp, n, threads);
12693    compute_plcp_gsa_omp(t, plcp, n, threads);
12694    0
12695}
12696
12697/// Constructs the PLCP of a given integer array and suffix array in parallel using OpenMP-style threading.
12698///
12699/// - `t` (`[0..n-1]`): the input integer array.
12700/// - `sa` (`[0..n-1]`): the input suffix array.
12701/// - `plcp` (`[0..n-1]`): the output permuted longest common prefix array.
12702/// - `threads`: number of worker threads (can be 0 for the implementation default).
12703///
12704/// Returns 0 on success, -1 on error.
12705pub fn libsais_plcp_int_omp(
12706    t: &[SaSint],
12707    sa: &[SaSint],
12708    plcp: &mut [SaSint],
12709    threads: SaSint,
12710) -> SaSint {
12711    if threads < 0 {
12712        return -1;
12713    }
12714    if sa.len() != t.len() || plcp.len() != t.len() {
12715        return -1;
12716    }
12717    if t.len() <= 1 {
12718        if t.len() == 1 {
12719            plcp[0] = 0;
12720        }
12721        return 0;
12722    }
12723
12724    let n = SaSint::try_from(t.len()).expect("input length must fit SaSint");
12725    let threads = normalize_omp_threads(threads);
12726    compute_phi_omp(sa, plcp, n, threads);
12727    compute_plcp_int_omp(t, plcp, n, threads);
12728    0
12729}
12730
12731/// Constructs the LCP from a PLCP and suffix array in parallel using OpenMP-style threading.
12732///
12733/// - `plcp` (`[0..n-1]`): the input permuted longest common prefix array.
12734/// - `sa` (`[0..n-1]`): the input suffix array or generalized suffix array (GSA).
12735/// - `lcp` (`[0..n-1]`): the output longest common prefix array (can alias `sa`).
12736/// - `threads`: number of worker threads (can be 0 for the implementation default).
12737///
12738/// Returns 0 on success, -1 on error.
12739pub fn libsais_lcp_omp(
12740    plcp: &[SaSint],
12741    sa: &[SaSint],
12742    lcp: &mut [SaSint],
12743    threads: SaSint,
12744) -> SaSint {
12745    if threads < 0 {
12746        return -1;
12747    }
12748    if plcp.len() != sa.len() || lcp.len() != sa.len() {
12749        return -1;
12750    }
12751    if sa.len() <= 1 {
12752        if sa.len() == 1 {
12753            lcp[0] = plcp[usize::try_from(sa[0]).expect("suffix index must be non-negative")];
12754        }
12755        return 0;
12756    }
12757
12758    compute_lcp_omp(
12759        plcp,
12760        sa,
12761        lcp,
12762        SaSint::try_from(sa.len()).expect("suffix array length must fit SaSint"),
12763        normalize_omp_threads(threads),
12764    );
12765    0
12766}
12767
12768/// Internal helper: unbwt compute histogram.
12769#[doc(hidden)]
12770pub fn unbwt_compute_histogram(t: &[u8], n: FastSint, count: &mut [SaUint]) {
12771    let n = usize::try_from(n).expect("n must be non-negative");
12772    assert!(count.len() >= ALPHABET_SIZE);
12773    for &byte in &t[..n] {
12774        count[byte as usize] += 1;
12775    }
12776}
12777
12778/// Internal helper: unbwt transpose bucket2.
12779#[doc(hidden)]
12780pub fn unbwt_transpose_bucket2(bucket2: &mut [SaUint]) {
12781    assert!(bucket2.len() >= ALPHABET_SIZE * ALPHABET_SIZE);
12782    for x in 0..ALPHABET_SIZE {
12783        for y in x + 1..ALPHABET_SIZE {
12784            bucket2.swap((y << 8) + x, (x << 8) + y);
12785        }
12786    }
12787}
12788
12789/// Internal helper: unbwt compute bigram histogram single.
12790#[doc(hidden)]
12791pub fn unbwt_compute_bigram_histogram_single(
12792    t: &[u8],
12793    bucket1: &mut [SaUint],
12794    bucket2: &mut [SaUint],
12795    index: FastUint,
12796) {
12797    let mut sum = 1usize;
12798    for c in 0..ALPHABET_SIZE {
12799        let prev = sum;
12800        sum += bucket1[c] as usize;
12801        bucket1[c] = prev as SaUint;
12802        if prev != sum {
12803            let bucket2_p = &mut bucket2[c << 8..(c + 1) << 8];
12804
12805            let hi = sum.min(index);
12806            if hi > prev {
12807                unbwt_compute_histogram(&t[prev..], (hi - prev) as FastSint, bucket2_p);
12808            }
12809
12810            let lo = prev.max(index + 1);
12811            if sum > lo {
12812                unbwt_compute_histogram(&t[lo - 1..], (sum - lo) as FastSint, bucket2_p);
12813            }
12814        }
12815    }
12816
12817    unbwt_transpose_bucket2(bucket2);
12818}
12819
12820/// Internal helper: unbwt calculate fastbits.
12821#[doc(hidden)]
12822pub fn unbwt_calculate_fastbits(
12823    bucket2: &mut [SaUint],
12824    fastbits: &mut [u16],
12825    lastc: FastUint,
12826    shift: FastUint,
12827) {
12828    let mut v = 0usize;
12829    let mut w = 0usize;
12830    let mut sum = 1usize;
12831
12832    for c in 0..ALPHABET_SIZE {
12833        if c == lastc {
12834            sum += 1;
12835        }
12836
12837        for _d in 0..ALPHABET_SIZE {
12838            let prev = sum;
12839            sum += bucket2[w] as usize;
12840            bucket2[w] = prev as SaUint;
12841            if prev != sum {
12842                while v <= ((sum - 1) >> shift) {
12843                    fastbits[v] = w as u16;
12844                    v += 1;
12845                }
12846            }
12847            w += 1;
12848        }
12849    }
12850}
12851
12852/// Internal helper: unbwt calculate bi psi.
12853#[doc(hidden)]
12854pub fn unbwt_calculate_bi_psi(
12855    t: &[u8],
12856    p: &mut [SaUint],
12857    bucket1: &mut [SaUint],
12858    bucket2: &mut [SaUint],
12859    index: FastUint,
12860    omp_block_start: FastSint,
12861    omp_block_end: FastSint,
12862) {
12863    let mut i = usize::try_from(omp_block_start).expect("omp_block_start must be non-negative");
12864    let mut j = index;
12865    let block_end = usize::try_from(omp_block_end).expect("omp_block_end must be non-negative");
12866    if block_end < j {
12867        j = block_end;
12868    }
12869    while i < j {
12870        let c = t[i] as usize;
12871        let pidx = bucket1[c] as usize;
12872        bucket1[c] += 1;
12873        let tidx = index as isize - pidx as isize;
12874        if tidx != 0 {
12875            let src =
12876                pidx.wrapping_add((tidx >> ((std::mem::size_of::<FastSint>() * 8) - 1)) as usize);
12877            let w = ((t[src] as usize) << 8) + c;
12878            let dst = bucket2[w] as usize;
12879            p[dst] = i as SaUint;
12880            bucket2[w] += 1;
12881        }
12882        i += 1;
12883    }
12884
12885    let mut i = index;
12886    if usize::try_from(omp_block_start).expect("omp_block_start must be non-negative") > i {
12887        i = usize::try_from(omp_block_start).expect("omp_block_start must be non-negative");
12888    }
12889    i += 1;
12890    while i <= block_end {
12891        let c = t[i - 1] as usize;
12892        let pidx = bucket1[c] as usize;
12893        bucket1[c] += 1;
12894        let tidx = index as isize - pidx as isize;
12895        if tidx != 0 {
12896            let src =
12897                pidx.wrapping_add((tidx >> ((std::mem::size_of::<FastSint>() * 8) - 1)) as usize);
12898            let w = ((t[src] as usize) << 8) + c;
12899            let dst = bucket2[w] as usize;
12900            p[dst] = i as SaUint;
12901            bucket2[w] += 1;
12902        }
12903        i += 1;
12904    }
12905}
12906
/// Internal helper: unbwt calculate biPSI.
///
/// Mixed-case alias of [`unbwt_calculate_bi_psi`]: every argument is passed
/// through unchanged and nothing else is done here.
#[doc(hidden)]
// `non_snake_case` silences the lint for the mixed-case name; `dead_code` for
// the case where nothing in the crate calls this alias.
// NOTE(review): presumably kept for name parity with upstream — confirm.
#[allow(dead_code, non_snake_case)]
pub fn unbwt_calculate_biPSI(
    t: &[u8],
    p: &mut [SaUint],
    bucket1: &mut [SaUint],
    bucket2: &mut [SaUint],
    index: FastUint,
    omp_block_start: FastSint,
    omp_block_end: FastSint,
) {
    unbwt_calculate_bi_psi(
        t,
        p,
        bucket1,
        bucket2,
        index,
        omp_block_start,
        omp_block_end,
    );
}
12929
12930/// Internal helper: unbwt init single.
12931#[doc(hidden)]
12932pub fn unbwt_init_single(
12933    t: &[u8],
12934    p: &mut [SaUint],
12935    n: SaSint,
12936    freq: Option<&[SaSint]>,
12937    i: &[SaUint],
12938    bucket2: &mut [SaUint],
12939    fastbits: &mut [u16],
12940) {
12941    let mut bucket1 = vec![0u32; ALPHABET_SIZE];
12942    let index = i[0] as usize;
12943    let lastc = t[0] as usize;
12944    let mut shift = 0usize;
12945    while (usize::try_from(n).expect("n must be non-negative") >> shift)
12946        > (1usize << UNBWT_FASTBITS)
12947    {
12948        shift += 1;
12949    }
12950
12951    if let Some(freq) = freq {
12952        for c in 0..ALPHABET_SIZE {
12953            bucket1[c] = freq[c] as SaUint;
12954        }
12955    } else {
12956        unbwt_compute_histogram(t, n as FastSint, &mut bucket1);
12957    }
12958
12959    bucket2.fill(0);
12960    unbwt_compute_bigram_histogram_single(t, &mut bucket1, bucket2, index);
12961    unbwt_calculate_fastbits(bucket2, fastbits, lastc, shift);
12962    unbwt_calculate_bi_psi(t, p, &mut bucket1, bucket2, index, 0, n as FastSint);
12963}
12964
12965/// Internal helper: unbwt compute bigram histogram parallel.
12966#[doc(hidden)]
12967pub fn unbwt_compute_bigram_histogram_parallel(
12968    t: &[u8],
12969    index: FastUint,
12970    bucket1: &mut [SaUint],
12971    bucket2: &mut [SaUint],
12972    omp_block_start: FastSint,
12973    omp_block_size: FastSint,
12974) {
12975    let start = usize::try_from(omp_block_start).expect("omp_block_start must be non-negative");
12976    let end = start + usize::try_from(omp_block_size).expect("omp_block_size must be non-negative");
12977    for &c_u8 in &t[start..end] {
12978        let c = c_u8 as usize;
12979        let p = bucket1[c] as usize;
12980        bucket1[c] += 1;
12981        let tidx = index as isize - p as isize;
12982        if tidx != 0 {
12983            let src =
12984                p.wrapping_add((tidx >> ((std::mem::size_of::<FastSint>() * 8) - 1)) as usize);
12985            let w = ((t[src] as usize) << 8) + c;
12986            bucket2[w] += 1;
12987        }
12988    }
12989}
12990
/// Internal helper: unbwt init parallel.
///
/// Block-wise variant of [`unbwt_init_single`]. The input is split into
/// `threads` blocks and `buckets` provides one scratch segment per block, each
/// holding `ALPHABET_SIZE` symbol counters followed by
/// `ALPHABET_SIZE * ALPHABET_SIZE` bigram counters.
///
/// Falls back to [`unbwt_init_single`] when `threads <= 1`, when `n < 65_536`,
/// or when `buckets` is `None`. `freq` is only forwarded on those fallback
/// paths; the block-wise path always recomputes the histogram from `t`.
///
/// The per-block phases in this function are ordinary sequential `for` loops.
///
/// # Panics
///
/// On the block-wise path this panics if `buckets` is smaller than
/// `threads * (ALPHABET_SIZE + ALPHABET_SIZE * ALPHABET_SIZE)` entries, and the
/// final `copy_from_slice` requires `bucket2` to hold exactly
/// `ALPHABET_SIZE * ALPHABET_SIZE` entries.
#[doc(hidden)]
pub fn unbwt_init_parallel(
    t: &[u8],
    p: &mut [SaUint],
    n: SaSint,
    freq: Option<&[SaSint]>,
    i: &[SaUint],
    bucket2: &mut [SaUint],
    fastbits: &mut [u16],
    buckets: Option<&mut [SaUint]>,
    threads: SaSint,
) {
    // `threads.max(1)` maps zero and negative requests to a single block.
    let num_threads = usize::try_from(threads.max(1)).expect("threads must be non-negative");
    if num_threads <= 1 || usize::try_from(n).expect("n must be non-negative") < 65_536 {
        unbwt_init_single(t, p, n, freq, i, bucket2, fastbits);
        return;
    }

    // Without scratch space there is nowhere to keep per-block counters.
    let buckets = match buckets {
        Some(buckets) => buckets,
        None => {
            unbwt_init_single(t, p, n, freq, i, bucket2, fastbits);
            return;
        }
    };

    // One segment per block: symbol counters first, bigram counters after.
    let segment_len = ALPHABET_SIZE + ALPHABET_SIZE * ALPHABET_SIZE;
    assert!(buckets.len() >= num_threads * segment_len);

    let index = i[0] as usize;
    let lastc = t[0] as usize;
    // Smallest shift for which `n >> shift` does not exceed 2^UNBWT_FASTBITS.
    let mut shift = 0usize;
    while (usize::try_from(n).expect("n must be non-negative") >> shift)
        > (1usize << UNBWT_FASTBITS)
    {
        shift += 1;
    }

    let mut bucket1 = vec![0u32; ALPHABET_SIZE];
    bucket2.fill(0);

    let n_fast = n as FastSint;
    // `& (-16)` rounds the stride down to a multiple of 16.
    let block_stride = (n_fast / num_threads as FastSint) & (-16);
    let mut block_starts = vec![0usize; num_threads];
    let mut block_sizes = vec![0usize; num_threads];

    // Phase 1: record each block's extent and histogram its bytes into the
    // block's own symbol counters. The last block takes the remainder.
    for thread in 0..num_threads {
        let start = usize::try_from(thread as FastSint * block_stride)
            .expect("block start must be non-negative");
        let size = if thread + 1 < num_threads {
            usize::try_from(block_stride).expect("block stride must be non-negative")
        } else {
            usize::try_from(n_fast - thread as FastSint * block_stride)
                .expect("block size must be non-negative")
        };
        block_starts[thread] = start;
        block_sizes[thread] = size;

        let segment = &mut buckets[thread * segment_len..(thread + 1) * segment_len];
        let (bucket1_local, _) = segment.split_at_mut(ALPHABET_SIZE);
        bucket1_local.fill(0);
        unbwt_compute_histogram(&t[start..], size as FastSint, bucket1_local);
    }

    // Phase 2: exclusive prefix sum across blocks. Afterwards `bucket1[c]` is
    // the total count of `c` and each block's counter holds the count of `c`
    // in all earlier blocks.
    for thread in 0..num_threads {
        let segment = &mut buckets[thread * segment_len..(thread + 1) * segment_len];
        let (bucket1_temp, _) = segment.split_at_mut(ALPHABET_SIZE);
        for c in 0..ALPHABET_SIZE {
            let a = bucket1[c];
            let b = bucket1_temp[c];
            bucket1[c] = a + b;
            bucket1_temp[c] = a;
        }
    }

    // Phase 3: turn the totals into running start offsets, beginning at 1.
    let mut sum = 1usize;
    for c in 0..ALPHABET_SIZE {
        let prev = sum;
        sum += bucket1[c] as usize;
        bucket1[c] = prev as SaUint;
    }

    // Phase 4: add the global start offsets to each block's counters and
    // histogram the block's bigrams into its own bigram counters. This pass
    // advances the block's symbol counters by one per processed byte.
    for thread in 0..num_threads {
        let start = block_starts[thread];
        let size = block_sizes[thread];
        let segment = &mut buckets[thread * segment_len..(thread + 1) * segment_len];
        let (bucket1_local, bucket2_local) = segment.split_at_mut(ALPHABET_SIZE);
        for c in 0..ALPHABET_SIZE {
            bucket1_local[c] += bucket1[c];
        }
        bucket2_local.fill(0);
        unbwt_compute_bigram_histogram_parallel(
            t,
            index,
            bucket1_local,
            bucket2_local,
            start as FastSint,
            size as FastSint,
        );
    }

    // Phase 5: exclusive prefix sum of the bigram counters across blocks;
    // `bucket2` ends up with the totals.
    for thread in 0..num_threads {
        let segment = &mut buckets[thread * segment_len..(thread + 1) * segment_len];
        let (_, bucket2_temp) = segment.split_at_mut(ALPHABET_SIZE);
        for c in 0..ALPHABET_SIZE * ALPHABET_SIZE {
            let a = bucket2[c];
            let b = bucket2_temp[c];
            bucket2[c] = a + b;
            bucket2_temp[c] = a;
        }
    }

    // Phase 6: convert the bigram totals into running offsets and fill `fastbits`.
    unbwt_calculate_fastbits(bucket2, fastbits, lastc, shift);

    // Phase 7: phase 4 left every block's symbol counters advanced past its
    // own bytes. Copy each block's counters into the following segment
    // (highest first so nothing is overwritten early) and reset block 0 to
    // the global start offsets.
    for thread in (1..num_threads).rev() {
        let src_start = (thread - 1) * segment_len;
        let dst_start = thread * segment_len;
        let (head, tail) = buckets.split_at_mut(dst_start);
        let src = &head[src_start..src_start + ALPHABET_SIZE];
        let dst = &mut tail[..ALPHABET_SIZE];
        dst.copy_from_slice(src);
    }
    buckets[..ALPHABET_SIZE].copy_from_slice(&bucket1);

    // Phase 8: add the global bigram offsets to each block's bigram counters
    // and scatter the block's positions into `p`.
    for thread in 0..num_threads {
        let start = block_starts[thread];
        let size = block_sizes[thread];
        let segment = &mut buckets[thread * segment_len..(thread + 1) * segment_len];
        let (bucket1_local, bucket2_local) = segment.split_at_mut(ALPHABET_SIZE);
        for c in 0..ALPHABET_SIZE * ALPHABET_SIZE {
            bucket2_local[c] += bucket2[c];
        }
        unbwt_calculate_bi_psi(
            t,
            p,
            bucket1_local,
            bucket2_local,
            index,
            start as FastSint,
            (start + size) as FastSint,
        );
    }

    // Phase 9: publish the last block's bigram counters, as left by its
    // scatter pass, through `bucket2`.
    let last_segment = &buckets[(num_threads - 1) * segment_len..num_threads * segment_len];
    let (_, last_bucket2) = last_segment.split_at(ALPHABET_SIZE);
    bucket2.copy_from_slice(last_bucket2);
}
13139
/// Converts `value` from native byte order to big-endian byte order.
///
/// `unbwt_decode_1` stores the result with `to_ne_bytes()`, so the two bytes
/// must come out high byte first on every target. That requires a swap on
/// little-endian hosts and no change on big-endian hosts, which is what
/// upstream's `libsais_bswap16` macro does. An unconditional `swap_bytes()`
/// would reverse every decoded byte pair on big-endian targets.
fn bswap16(value: u16) -> u16 {
    value.to_be()
}
13143
13144fn unbwt_resolve_symbol(bucket2: &[SaUint], fastbits: &[u16], shift: FastUint, p: SaUint) -> u16 {
13145    let mut c = fastbits[(p as usize) >> shift];
13146    while bucket2[c as usize] <= p {
13147        c += 1;
13148    }
13149    c
13150}
13151
13152/// Internal helper: unbwt decode 1.
13153#[doc(hidden)]
13154pub fn unbwt_decode_1(
13155    u: &mut [u8],
13156    p: &[SaUint],
13157    bucket2: &[SaUint],
13158    fastbits: &[u16],
13159    shift: FastUint,
13160    i0: &mut FastUint,
13161    k: FastUint,
13162) {
13163    let words = &mut u[..2 * k];
13164    let mut p0 = *i0 as SaUint;
13165
13166    for i in 0..k {
13167        let c0 = unbwt_resolve_symbol(bucket2, fastbits, shift, p0);
13168        p0 = p[p0 as usize];
13169        let bytes = bswap16(c0).to_ne_bytes();
13170        words[2 * i] = bytes[0];
13171        words[2 * i + 1] = bytes[1];
13172    }
13173
13174    *i0 = p0 as FastUint;
13175}
13176
13177/// Internal helper: unbwt decode 2.
13178#[doc(hidden)]
13179pub fn unbwt_decode_2(
13180    u: &mut [u8],
13181    p: &[SaUint],
13182    bucket2: &[SaUint],
13183    fastbits: &[u16],
13184    shift: FastUint,
13185    r: FastUint,
13186    i0: &mut FastUint,
13187    i1: &mut FastUint,
13188    k: FastUint,
13189) {
13190    let width = 2 * k;
13191    unbwt_decode_1(&mut u[0..width], p, bucket2, fastbits, shift, i0, k);
13192    unbwt_decode_1(&mut u[r..r + width], p, bucket2, fastbits, shift, i1, k);
13193}
13194
13195/// Internal helper: unbwt decode 3.
13196#[doc(hidden)]
13197pub fn unbwt_decode_3(
13198    u: &mut [u8],
13199    p: &[SaUint],
13200    bucket2: &[SaUint],
13201    fastbits: &[u16],
13202    shift: FastUint,
13203    r: FastUint,
13204    i0: &mut FastUint,
13205    i1: &mut FastUint,
13206    i2: &mut FastUint,
13207    k: FastUint,
13208) {
13209    let width = 2 * k;
13210    unbwt_decode_1(&mut u[0..width], p, bucket2, fastbits, shift, i0, k);
13211    unbwt_decode_1(&mut u[r..r + width], p, bucket2, fastbits, shift, i1, k);
13212    unbwt_decode_1(
13213        &mut u[2 * r..2 * r + width],
13214        p,
13215        bucket2,
13216        fastbits,
13217        shift,
13218        i2,
13219        k,
13220    );
13221}
13222
13223/// Internal helper: unbwt decode 4.
13224#[doc(hidden)]
13225pub fn unbwt_decode_4(
13226    u: &mut [u8],
13227    p: &[SaUint],
13228    bucket2: &[SaUint],
13229    fastbits: &[u16],
13230    shift: FastUint,
13231    r: FastUint,
13232    i0: &mut FastUint,
13233    i1: &mut FastUint,
13234    i2: &mut FastUint,
13235    i3: &mut FastUint,
13236    k: FastUint,
13237) {
13238    let width = 2 * k;
13239    unbwt_decode_1(&mut u[0..width], p, bucket2, fastbits, shift, i0, k);
13240    unbwt_decode_1(&mut u[r..r + width], p, bucket2, fastbits, shift, i1, k);
13241    unbwt_decode_1(
13242        &mut u[2 * r..2 * r + width],
13243        p,
13244        bucket2,
13245        fastbits,
13246        shift,
13247        i2,
13248        k,
13249    );
13250    unbwt_decode_1(
13251        &mut u[3 * r..3 * r + width],
13252        p,
13253        bucket2,
13254        fastbits,
13255        shift,
13256        i3,
13257        k,
13258    );
13259}
13260
13261/// Internal helper: unbwt decode 5.
13262#[doc(hidden)]
13263pub fn unbwt_decode_5(
13264    u: &mut [u8],
13265    p: &[SaUint],
13266    bucket2: &[SaUint],
13267    fastbits: &[u16],
13268    shift: FastUint,
13269    r: FastUint,
13270    i0: &mut FastUint,
13271    i1: &mut FastUint,
13272    i2: &mut FastUint,
13273    i3: &mut FastUint,
13274    i4: &mut FastUint,
13275    k: FastUint,
13276) {
13277    let width = 2 * k;
13278    unbwt_decode_1(&mut u[0..width], p, bucket2, fastbits, shift, i0, k);
13279    unbwt_decode_1(&mut u[r..r + width], p, bucket2, fastbits, shift, i1, k);
13280    unbwt_decode_1(
13281        &mut u[2 * r..2 * r + width],
13282        p,
13283        bucket2,
13284        fastbits,
13285        shift,
13286        i2,
13287        k,
13288    );
13289    unbwt_decode_1(
13290        &mut u[3 * r..3 * r + width],
13291        p,
13292        bucket2,
13293        fastbits,
13294        shift,
13295        i3,
13296        k,
13297    );
13298    unbwt_decode_1(
13299        &mut u[4 * r..4 * r + width],
13300        p,
13301        bucket2,
13302        fastbits,
13303        shift,
13304        i4,
13305        k,
13306    );
13307}
13308
13309/// Internal helper: unbwt decode 6.
13310#[doc(hidden)]
13311pub fn unbwt_decode_6(
13312    u: &mut [u8],
13313    p: &[SaUint],
13314    bucket2: &[SaUint],
13315    fastbits: &[u16],
13316    shift: FastUint,
13317    r: FastUint,
13318    i0: &mut FastUint,
13319    i1: &mut FastUint,
13320    i2: &mut FastUint,
13321    i3: &mut FastUint,
13322    i4: &mut FastUint,
13323    i5: &mut FastUint,
13324    k: FastUint,
13325) {
13326    let width = 2 * k;
13327    unbwt_decode_1(&mut u[0..width], p, bucket2, fastbits, shift, i0, k);
13328    unbwt_decode_1(&mut u[r..r + width], p, bucket2, fastbits, shift, i1, k);
13329    unbwt_decode_1(
13330        &mut u[2 * r..2 * r + width],
13331        p,
13332        bucket2,
13333        fastbits,
13334        shift,
13335        i2,
13336        k,
13337    );
13338    unbwt_decode_1(
13339        &mut u[3 * r..3 * r + width],
13340        p,
13341        bucket2,
13342        fastbits,
13343        shift,
13344        i3,
13345        k,
13346    );
13347    unbwt_decode_1(
13348        &mut u[4 * r..4 * r + width],
13349        p,
13350        bucket2,
13351        fastbits,
13352        shift,
13353        i4,
13354        k,
13355    );
13356    unbwt_decode_1(
13357        &mut u[5 * r..5 * r + width],
13358        p,
13359        bucket2,
13360        fastbits,
13361        shift,
13362        i5,
13363        k,
13364    );
13365}
13366
13367/// Internal helper: unbwt decode 7.
13368#[doc(hidden)]
13369pub fn unbwt_decode_7(
13370    u: &mut [u8],
13371    p: &[SaUint],
13372    bucket2: &[SaUint],
13373    fastbits: &[u16],
13374    shift: FastUint,
13375    r: FastUint,
13376    i0: &mut FastUint,
13377    i1: &mut FastUint,
13378    i2: &mut FastUint,
13379    i3: &mut FastUint,
13380    i4: &mut FastUint,
13381    i5: &mut FastUint,
13382    i6: &mut FastUint,
13383    k: FastUint,
13384) {
13385    let width = 2 * k;
13386    unbwt_decode_1(&mut u[0..width], p, bucket2, fastbits, shift, i0, k);
13387    unbwt_decode_1(&mut u[r..r + width], p, bucket2, fastbits, shift, i1, k);
13388    unbwt_decode_1(
13389        &mut u[2 * r..2 * r + width],
13390        p,
13391        bucket2,
13392        fastbits,
13393        shift,
13394        i2,
13395        k,
13396    );
13397    unbwt_decode_1(
13398        &mut u[3 * r..3 * r + width],
13399        p,
13400        bucket2,
13401        fastbits,
13402        shift,
13403        i3,
13404        k,
13405    );
13406    unbwt_decode_1(
13407        &mut u[4 * r..4 * r + width],
13408        p,
13409        bucket2,
13410        fastbits,
13411        shift,
13412        i4,
13413        k,
13414    );
13415    unbwt_decode_1(
13416        &mut u[5 * r..5 * r + width],
13417        p,
13418        bucket2,
13419        fastbits,
13420        shift,
13421        i5,
13422        k,
13423    );
13424    unbwt_decode_1(
13425        &mut u[6 * r..6 * r + width],
13426        p,
13427        bucket2,
13428        fastbits,
13429        shift,
13430        i6,
13431        k,
13432    );
13433}
13434
13435/// Internal helper: unbwt decode 8.
13436#[doc(hidden)]
13437pub fn unbwt_decode_8(
13438    u: &mut [u8],
13439    p: &[SaUint],
13440    bucket2: &[SaUint],
13441    fastbits: &[u16],
13442    shift: FastUint,
13443    r: FastUint,
13444    i0: &mut FastUint,
13445    i1: &mut FastUint,
13446    i2: &mut FastUint,
13447    i3: &mut FastUint,
13448    i4: &mut FastUint,
13449    i5: &mut FastUint,
13450    i6: &mut FastUint,
13451    i7: &mut FastUint,
13452    k: FastUint,
13453) {
13454    let width = 2 * k;
13455    unbwt_decode_1(&mut u[0..width], p, bucket2, fastbits, shift, i0, k);
13456    unbwt_decode_1(&mut u[r..r + width], p, bucket2, fastbits, shift, i1, k);
13457    unbwt_decode_1(
13458        &mut u[2 * r..2 * r + width],
13459        p,
13460        bucket2,
13461        fastbits,
13462        shift,
13463        i2,
13464        k,
13465    );
13466    unbwt_decode_1(
13467        &mut u[3 * r..3 * r + width],
13468        p,
13469        bucket2,
13470        fastbits,
13471        shift,
13472        i3,
13473        k,
13474    );
13475    unbwt_decode_1(
13476        &mut u[4 * r..4 * r + width],
13477        p,
13478        bucket2,
13479        fastbits,
13480        shift,
13481        i4,
13482        k,
13483    );
13484    unbwt_decode_1(
13485        &mut u[5 * r..5 * r + width],
13486        p,
13487        bucket2,
13488        fastbits,
13489        shift,
13490        i5,
13491        k,
13492    );
13493    unbwt_decode_1(
13494        &mut u[6 * r..6 * r + width],
13495        p,
13496        bucket2,
13497        fastbits,
13498        shift,
13499        i6,
13500        k,
13501    );
13502    unbwt_decode_1(
13503        &mut u[7 * r..7 * r + width],
13504        p,
13505        bucket2,
13506        fastbits,
13507        shift,
13508        i7,
13509        k,
13510    );
13511}
13512
13513/// Internal helper: unbwt decode.
13514#[doc(hidden)]
13515pub fn unbwt_decode(
13516    u: &mut [u8],
13517    p: &[SaUint],
13518    n: SaSint,
13519    r: SaSint,
13520    i: &[SaUint],
13521    bucket2: &[SaUint],
13522    fastbits: &[u16],
13523    mut blocks: FastSint,
13524    remainder: FastUint,
13525) {
13526    let mut shift = 0usize;
13527    while (usize::try_from(n).expect("n must be non-negative") >> shift)
13528        > (1usize << UNBWT_FASTBITS)
13529    {
13530        shift += 1;
13531    }
13532    let mut offset = 0usize;
13533    let mut i_index = 0usize;
13534    let r_usize = usize::try_from(r).expect("r must be non-negative");
13535
13536    while blocks > 8 {
13537        let mut i0 = i[i_index] as FastUint;
13538        let mut i1 = i[i_index + 1] as FastUint;
13539        let mut i2 = i[i_index + 2] as FastUint;
13540        let mut i3 = i[i_index + 3] as FastUint;
13541        let mut i4 = i[i_index + 4] as FastUint;
13542        let mut i5 = i[i_index + 5] as FastUint;
13543        let mut i6 = i[i_index + 6] as FastUint;
13544        let mut i7 = i[i_index + 7] as FastUint;
13545        unbwt_decode_8(
13546            &mut u[offset..],
13547            p,
13548            bucket2,
13549            fastbits,
13550            shift,
13551            r_usize,
13552            &mut i0,
13553            &mut i1,
13554            &mut i2,
13555            &mut i3,
13556            &mut i4,
13557            &mut i5,
13558            &mut i6,
13559            &mut i7,
13560            r_usize >> 1,
13561        );
13562        i_index += 8;
13563        blocks -= 8;
13564        offset += 8 * r_usize;
13565    }
13566
13567    match blocks {
13568        1 => {
13569            let mut i0 = i[i_index] as FastUint;
13570            unbwt_decode_1(
13571                &mut u[offset..],
13572                p,
13573                bucket2,
13574                fastbits,
13575                shift,
13576                &mut i0,
13577                remainder >> 1,
13578            );
13579        }
13580        2 => {
13581            let mut i0 = i[i_index] as FastUint;
13582            let mut i1 = i[i_index + 1] as FastUint;
13583            unbwt_decode_2(
13584                &mut u[offset..],
13585                p,
13586                bucket2,
13587                fastbits,
13588                shift,
13589                r_usize,
13590                &mut i0,
13591                &mut i1,
13592                remainder >> 1,
13593            );
13594            unbwt_decode_1(
13595                &mut u[offset + 2 * (remainder >> 1)..],
13596                p,
13597                bucket2,
13598                fastbits,
13599                shift,
13600                &mut i0,
13601                (r_usize >> 1) - (remainder >> 1),
13602            );
13603        }
13604        3 => {
13605            let mut i0 = i[i_index] as FastUint;
13606            let mut i1 = i[i_index + 1] as FastUint;
13607            let mut i2 = i[i_index + 2] as FastUint;
13608            unbwt_decode_3(
13609                &mut u[offset..],
13610                p,
13611                bucket2,
13612                fastbits,
13613                shift,
13614                r_usize,
13615                &mut i0,
13616                &mut i1,
13617                &mut i2,
13618                remainder >> 1,
13619            );
13620            unbwt_decode_2(
13621                &mut u[offset + 2 * (remainder >> 1)..],
13622                p,
13623                bucket2,
13624                fastbits,
13625                shift,
13626                r_usize,
13627                &mut i0,
13628                &mut i1,
13629                (r_usize >> 1) - (remainder >> 1),
13630            );
13631        }
13632        4 => {
13633            let mut i0 = i[i_index] as FastUint;
13634            let mut i1 = i[i_index + 1] as FastUint;
13635            let mut i2 = i[i_index + 2] as FastUint;
13636            let mut i3 = i[i_index + 3] as FastUint;
13637            unbwt_decode_4(
13638                &mut u[offset..],
13639                p,
13640                bucket2,
13641                fastbits,
13642                shift,
13643                r_usize,
13644                &mut i0,
13645                &mut i1,
13646                &mut i2,
13647                &mut i3,
13648                remainder >> 1,
13649            );
13650            unbwt_decode_3(
13651                &mut u[offset + 2 * (remainder >> 1)..],
13652                p,
13653                bucket2,
13654                fastbits,
13655                shift,
13656                r_usize,
13657                &mut i0,
13658                &mut i1,
13659                &mut i2,
13660                (r_usize >> 1) - (remainder >> 1),
13661            );
13662        }
13663        5 => {
13664            let mut i0 = i[i_index] as FastUint;
13665            let mut i1 = i[i_index + 1] as FastUint;
13666            let mut i2 = i[i_index + 2] as FastUint;
13667            let mut i3 = i[i_index + 3] as FastUint;
13668            let mut i4 = i[i_index + 4] as FastUint;
13669            unbwt_decode_5(
13670                &mut u[offset..],
13671                p,
13672                bucket2,
13673                fastbits,
13674                shift,
13675                r_usize,
13676                &mut i0,
13677                &mut i1,
13678                &mut i2,
13679                &mut i3,
13680                &mut i4,
13681                remainder >> 1,
13682            );
13683            unbwt_decode_4(
13684                &mut u[offset + 2 * (remainder >> 1)..],
13685                p,
13686                bucket2,
13687                fastbits,
13688                shift,
13689                r_usize,
13690                &mut i0,
13691                &mut i1,
13692                &mut i2,
13693                &mut i3,
13694                (r_usize >> 1) - (remainder >> 1),
13695            );
13696        }
13697        6 => {
13698            let mut i0 = i[i_index] as FastUint;
13699            let mut i1 = i[i_index + 1] as FastUint;
13700            let mut i2 = i[i_index + 2] as FastUint;
13701            let mut i3 = i[i_index + 3] as FastUint;
13702            let mut i4 = i[i_index + 4] as FastUint;
13703            let mut i5 = i[i_index + 5] as FastUint;
13704            unbwt_decode_6(
13705                &mut u[offset..],
13706                p,
13707                bucket2,
13708                fastbits,
13709                shift,
13710                r_usize,
13711                &mut i0,
13712                &mut i1,
13713                &mut i2,
13714                &mut i3,
13715                &mut i4,
13716                &mut i5,
13717                remainder >> 1,
13718            );
13719            unbwt_decode_5(
13720                &mut u[offset + 2 * (remainder >> 1)..],
13721                p,
13722                bucket2,
13723                fastbits,
13724                shift,
13725                r_usize,
13726                &mut i0,
13727                &mut i1,
13728                &mut i2,
13729                &mut i3,
13730                &mut i4,
13731                (r_usize >> 1) - (remainder >> 1),
13732            );
13733        }
13734        7 => {
13735            let mut i0 = i[i_index] as FastUint;
13736            let mut i1 = i[i_index + 1] as FastUint;
13737            let mut i2 = i[i_index + 2] as FastUint;
13738            let mut i3 = i[i_index + 3] as FastUint;
13739            let mut i4 = i[i_index + 4] as FastUint;
13740            let mut i5 = i[i_index + 5] as FastUint;
13741            let mut i6 = i[i_index + 6] as FastUint;
13742            unbwt_decode_7(
13743                &mut u[offset..],
13744                p,
13745                bucket2,
13746                fastbits,
13747                shift,
13748                r_usize,
13749                &mut i0,
13750                &mut i1,
13751                &mut i2,
13752                &mut i3,
13753                &mut i4,
13754                &mut i5,
13755                &mut i6,
13756                remainder >> 1,
13757            );
13758            unbwt_decode_6(
13759                &mut u[offset + 2 * (remainder >> 1)..],
13760                p,
13761                bucket2,
13762                fastbits,
13763                shift,
13764                r_usize,
13765                &mut i0,
13766                &mut i1,
13767                &mut i2,
13768                &mut i3,
13769                &mut i4,
13770                &mut i5,
13771                (r_usize >> 1) - (remainder >> 1),
13772            );
13773        }
13774        8 => {
13775            let mut i0 = i[i_index] as FastUint;
13776            let mut i1 = i[i_index + 1] as FastUint;
13777            let mut i2 = i[i_index + 2] as FastUint;
13778            let mut i3 = i[i_index + 3] as FastUint;
13779            let mut i4 = i[i_index + 4] as FastUint;
13780            let mut i5 = i[i_index + 5] as FastUint;
13781            let mut i6 = i[i_index + 6] as FastUint;
13782            let mut i7 = i[i_index + 7] as FastUint;
13783            unbwt_decode_8(
13784                &mut u[offset..],
13785                p,
13786                bucket2,
13787                fastbits,
13788                shift,
13789                r_usize,
13790                &mut i0,
13791                &mut i1,
13792                &mut i2,
13793                &mut i3,
13794                &mut i4,
13795                &mut i5,
13796                &mut i6,
13797                &mut i7,
13798                remainder >> 1,
13799            );
13800            unbwt_decode_7(
13801                &mut u[offset + 2 * (remainder >> 1)..],
13802                p,
13803                bucket2,
13804                fastbits,
13805                shift,
13806                r_usize,
13807                &mut i0,
13808                &mut i1,
13809                &mut i2,
13810                &mut i3,
13811                &mut i4,
13812                &mut i5,
13813                &mut i6,
13814                (r_usize >> 1) - (remainder >> 1),
13815            );
13816        }
13817        _ => {}
13818    }
13819}
13820
13821/// Internal helper: unbwt decode (OpenMP variant).
13822#[doc(hidden)]
13823pub fn unbwt_decode_omp(
13824    t: &[u8],
13825    u: &mut [u8],
13826    p: &[SaUint],
13827    n: SaSint,
13828    r: SaSint,
13829    i: &[SaUint],
13830    bucket2: &[SaUint],
13831    fastbits: &[u16],
13832    threads: SaSint,
13833) {
13834    let lastc = t[0];
13835    let blocks = 1 + ((n as FastSint - 1) / r as FastSint);
13836    let remainder = usize::try_from(n).expect("n must be non-negative")
13837        - usize::try_from(r).expect("r must be non-negative")
13838            * (usize::try_from(blocks).expect("blocks") - 1);
13839    let max_threads = usize::try_from(blocks.min(threads.max(1) as FastSint))
13840        .expect("thread count must fit usize");
13841    let block_stride = usize::try_from(blocks).expect("blocks must be non-negative") / max_threads;
13842    let block_remainder =
13843        usize::try_from(blocks).expect("blocks must be non-negative") % max_threads;
13844    let r_usize = usize::try_from(r).expect("r must be non-negative");
13845
13846    for thread in 0..max_threads {
13847        let block_size = block_stride + usize::from(thread < block_remainder);
13848        let block_start = block_stride * thread + thread.min(block_remainder);
13849        unbwt_decode(
13850            &mut u[r_usize * block_start..],
13851            p,
13852            n,
13853            r,
13854            &i[block_start..],
13855            bucket2,
13856            fastbits,
13857            block_size as FastSint,
13858            if thread + 1 < max_threads {
13859                r_usize
13860            } else {
13861                remainder
13862            },
13863        );
13864    }
13865    u[usize::try_from(n).expect("n must be non-negative") - 1] = lastc;
13866}
13867
13868/// Internal helper: unbwt core.
13869#[doc(hidden)]
13870pub fn unbwt_core(
13871    t: &[u8],
13872    u: &mut [u8],
13873    p: &mut [SaUint],
13874    n: SaSint,
13875    freq: Option<&[SaSint]>,
13876    r: SaSint,
13877    i: &[SaUint],
13878    bucket2: &mut [SaUint],
13879    fastbits: &mut [u16],
13880    buckets: Option<&mut [SaUint]>,
13881    threads: SaSint,
13882) -> SaSint {
13883    if threads > 1 && n >= 262_144 {
13884        unbwt_init_parallel(t, p, n, freq, i, bucket2, fastbits, buckets, threads);
13885    } else {
13886        unbwt_init_single(t, p, n, freq, i, bucket2, fastbits);
13887    }
13888
13889    unbwt_decode_omp(t, u, p, n, r, i, bucket2, fastbits, threads);
13890    0
13891}
13892
13893/// Internal helper: unbwt main.
13894#[doc(hidden)]
13895pub fn unbwt_main(
13896    t: &[u8],
13897    u: &mut [u8],
13898    p: &mut [SaUint],
13899    n: SaSint,
13900    freq: Option<&[SaSint]>,
13901    r: SaSint,
13902    i: &[SaUint],
13903    threads: SaSint,
13904) -> SaSint {
13905    let mut shift = 0usize;
13906    while (usize::try_from(n).expect("n must be non-negative") >> shift)
13907        > (1usize << UNBWT_FASTBITS)
13908    {
13909        shift += 1;
13910    }
13911
13912    let mut bucket2 = vec![0u32; ALPHABET_SIZE * ALPHABET_SIZE];
13913    let mut fastbits =
13914        vec![0u16; 1 + (usize::try_from(n).expect("n must be non-negative") >> shift)];
13915    let mut buckets = if threads > 1 && n >= 262_144 {
13916        Some(vec![
13917            0u32;
13918            usize::try_from(threads)
13919                .expect("threads must be non-negative")
13920                * (ALPHABET_SIZE + ALPHABET_SIZE * ALPHABET_SIZE)
13921        ])
13922    } else {
13923        None
13924    };
13925
13926    unbwt_core(
13927        t,
13928        u,
13929        p,
13930        n,
13931        freq,
13932        r,
13933        i,
13934        &mut bucket2,
13935        &mut fastbits,
13936        buckets.as_deref_mut(),
13937        threads,
13938    )
13939}
13940
13941/// Internal helper: unbwt main ctx.
13942#[doc(hidden)]
13943pub fn unbwt_main_ctx(
13944    ctx: &mut UnbwtContext,
13945    t: &[u8],
13946    u: &mut [u8],
13947    p: &mut [SaUint],
13948    n: SaSint,
13949    freq: Option<&[SaSint]>,
13950    r: SaSint,
13951    i: &[SaUint],
13952) -> SaSint {
13953    if ctx.threads <= 0 {
13954        return -2;
13955    }
13956    let mut shift = 0usize;
13957    while (usize::try_from(n).expect("n must be non-negative") >> shift)
13958        > (1usize << UNBWT_FASTBITS)
13959    {
13960        shift += 1;
13961    }
13962    let required_fastbits = 1 + (usize::try_from(n).expect("n must be non-negative") >> shift);
13963    if ctx.bucket2.len() < ALPHABET_SIZE * ALPHABET_SIZE
13964        || ctx.fastbits.len() < required_fastbits
13965        || (ctx.threads > 1 && ctx.buckets.is_none())
13966    {
13967        return -2;
13968    }
13969
13970    unbwt_core(
13971        t,
13972        u,
13973        p,
13974        n,
13975        freq,
13976        r,
13977        i,
13978        &mut ctx.bucket2,
13979        &mut ctx.fastbits,
13980        ctx.buckets.as_deref_mut(),
13981        ctx.threads as SaSint,
13982    )
13983}
13984
13985/// Reconstructs the original string from a given BWT and primary index.
13986///
13987/// - `t` (`[0..n-1]`): the input string.
13988/// - `u` (`[0..n-1]`): the output string (can alias `t`).
13989/// - `a` (`[0..n]`): the temporary array (must have length `n + 1`).
13990/// - `freq` (`[0..255]`): optional input symbol frequency table.
13991/// - `i`: the primary index.
13992///
13993/// Returns 0 on success, -1 or -2 on error.
13994pub fn libsais_unbwt(
13995    t: &[u8],
13996    u: &mut [u8],
13997    a: &mut [SaSint],
13998    freq: Option<&[SaSint]>,
13999    i: SaSint,
14000) -> SaSint {
14001    libsais_unbwt_aux(
14002        t,
14003        u,
14004        a,
14005        freq,
14006        SaSint::try_from(t.len()).expect("input length must fit SaSint"),
14007        &[i],
14008    )
14009}
14010
14011/// Reconstructs the original string from a given BWT and primary index using a libsais reverse-BWT context.
14012///
14013/// - `ctx`: the libsais reverse-BWT context.
14014/// - `t` (`[0..n-1]`): the input string.
14015/// - `u` (`[0..n-1]`): the output string (can alias `t`).
14016/// - `a` (`[0..n]`): the temporary array (must have length `n + 1`).
14017/// - `freq` (`[0..255]`): optional input symbol frequency table.
14018/// - `i`: the primary index.
14019///
14020/// Returns 0 on success, -1 or -2 on error.
14021pub fn libsais_unbwt_ctx(
14022    ctx: &mut UnbwtContext,
14023    t: &[u8],
14024    u: &mut [u8],
14025    a: &mut [SaSint],
14026    freq: Option<&[SaSint]>,
14027    i: SaSint,
14028) -> SaSint {
14029    libsais_unbwt_aux_ctx(
14030        ctx,
14031        t,
14032        u,
14033        a,
14034        freq,
14035        SaSint::try_from(t.len()).expect("input length must fit SaSint"),
14036        &[i],
14037    )
14038}
14039
14040/// Reconstructs the original string from a given BWT with auxiliary indexes.
14041///
14042/// - `t` (`[0..n-1]`): the input string.
14043/// - `u` (`[0..n-1]`): the output string (can alias `t`).
14044/// - `a` (`[0..n]`): the temporary array (must have length `n + 1`).
14045/// - `freq` (`[0..255]`): optional input symbol frequency table.
14046/// - `r`: sampling rate for the auxiliary indexes (must be a power of two).
14047/// - `i` (`[0..(n-1)/r]`): input auxiliary indexes.
14048///
14049/// Returns 0 on success, -1 or -2 on error.
14050pub fn libsais_unbwt_aux(
14051    t: &[u8],
14052    u: &mut [u8],
14053    a: &mut [SaSint],
14054    freq: Option<&[SaSint]>,
14055    r: SaSint,
14056    i: &[SaSint],
14057) -> SaSint {
14058    let t_len = t.len();
14059    let n = SaSint::try_from(t_len).expect("input length must fit SaSint");
14060    if u.len() < t_len
14061        || a.len() < t_len
14062        || freq.is_some_and(|freq| freq.len() < ALPHABET_SIZE)
14063        || (r != n && (r < 2 || (r & (r - 1)) != 0))
14064    {
14065        return -1;
14066    }
14067    let sample_count = if n == 0 {
14068        1
14069    } else {
14070        ((n - 1) / r + 1) as usize
14071    };
14072    if i.len() < sample_count {
14073        return -1;
14074    }
14075
14076    if n <= 1 {
14077        if i[0] != n {
14078            return -1;
14079        }
14080        if n == 1 {
14081            u[0] = t[0];
14082        }
14083        return 0;
14084    }
14085
14086    for t in 0..sample_count {
14087        let sample = i[t];
14088        if sample <= 0 || sample > n {
14089            return -1;
14090        }
14091    }
14092
14093    let i_u32: Vec<SaUint> = i
14094        .iter()
14095        .take(sample_count)
14096        .map(|&sample| SaUint::try_from(sample).expect("sample was validated positive"))
14097        .collect();
14098    let mut p = vec![0u32; t_len + 1];
14099    let result = unbwt_main(t, u, &mut p, n, freq, r, &i_u32, 1);
14100    for t in 0..t_len {
14101        a[t] = p[t] as SaSint;
14102    }
14103    result
14104}
14105
14106/// Reconstructs the original string from a given BWT with auxiliary indexes using a libsais reverse-BWT context.
14107///
14108/// - `ctx`: the libsais reverse-BWT context.
14109/// - `t` (`[0..n-1]`): the input string.
14110/// - `u` (`[0..n-1]`): the output string (can alias `t`).
14111/// - `a` (`[0..n]`): the temporary array (must have length `n + 1`).
14112/// - `freq` (`[0..255]`): optional input symbol frequency table.
14113/// - `r`: sampling rate for the auxiliary indexes (must be a power of two).
14114/// - `i` (`[0..(n-1)/r]`): input auxiliary indexes.
14115///
14116/// Returns 0 on success, -1 or -2 on error.
14117pub fn libsais_unbwt_aux_ctx(
14118    ctx: &mut UnbwtContext,
14119    t: &[u8],
14120    u: &mut [u8],
14121    a: &mut [SaSint],
14122    freq: Option<&[SaSint]>,
14123    r: SaSint,
14124    i: &[SaSint],
14125) -> SaSint {
14126    let t_len = t.len();
14127    let n = SaSint::try_from(t_len).expect("input length must fit SaSint");
14128    if u.len() < t_len
14129        || a.len() < t_len
14130        || freq.is_some_and(|freq| freq.len() < ALPHABET_SIZE)
14131        || (r != n && (r < 2 || (r & (r - 1)) != 0))
14132    {
14133        return -1;
14134    }
14135    let sample_count = if n == 0 {
14136        1
14137    } else {
14138        ((n - 1) / r + 1) as usize
14139    };
14140    if i.len() < sample_count {
14141        return -1;
14142    }
14143
14144    if n <= 1 {
14145        if i[0] != n {
14146            return -1;
14147        }
14148        if n == 1 {
14149            u[0] = t[0];
14150        }
14151        return 0;
14152    }
14153
14154    for t in 0..sample_count {
14155        let sample = i[t];
14156        if sample <= 0 || sample > n {
14157            return -1;
14158        }
14159    }
14160
14161    let i_u32: Vec<SaUint> = i
14162        .iter()
14163        .take(sample_count)
14164        .map(|&sample| SaUint::try_from(sample).expect("sample was validated positive"))
14165        .collect();
14166    let mut p = vec![0u32; t_len + 1];
14167    let result = unbwt_main_ctx(ctx, t, u, &mut p, n, freq, r, &i_u32);
14168    for t in 0..t_len {
14169        a[t] = p[t] as SaSint;
14170    }
14171    result
14172}
14173
14174/// Creates the libsais reverse-BWT context for parallel `libsais_unbwt_*` operations using OpenMP-style threading.
14175///
14176/// In multi-threaded environments, use one context per thread for parallel executions.
14177///
14178/// - `threads`: number of worker threads (can be 0 for the implementation default).
14179///
14180/// Returns the context, or `None` on allocation failure.
14181pub fn unbwt_create_ctx_omp(threads: SaSint) -> Option<UnbwtContext> {
14182    if threads < 0 {
14183        return None;
14184    }
14185    unbwt_create_ctx_main(normalize_omp_threads(threads))
14186}
14187
14188/// Reconstructs the original string from a given BWT and primary index in parallel using OpenMP-style threading.
14189///
14190/// - `t` (`[0..n-1]`): the input string.
14191/// - `u` (`[0..n-1]`): the output string (can alias `t`).
14192/// - `a` (`[0..n]`): the temporary array (must have length `n + 1`).
14193/// - `freq` (`[0..255]`): optional input symbol frequency table.
14194/// - `i`: the primary index.
14195/// - `threads`: number of worker threads (can be 0 for the implementation default).
14196///
14197/// Returns 0 on success, -1 or -2 on error.
14198pub fn libsais_unbwt_omp(
14199    t: &[u8],
14200    u: &mut [u8],
14201    a: &mut [SaSint],
14202    freq: Option<&[SaSint]>,
14203    i: SaSint,
14204    threads: SaSint,
14205) -> SaSint {
14206    libsais_unbwt_aux_omp(
14207        t,
14208        u,
14209        a,
14210        freq,
14211        SaSint::try_from(t.len()).expect("input length must fit SaSint"),
14212        &[i],
14213        threads,
14214    )
14215}
14216
14217/// Reconstructs the original string from a given BWT with auxiliary indexes in parallel using OpenMP-style threading.
14218///
14219/// - `t` (`[0..n-1]`): the input string.
14220/// - `u` (`[0..n-1]`): the output string (can alias `t`).
14221/// - `a` (`[0..n]`): the temporary array (must have length `n + 1`).
14222/// - `freq` (`[0..255]`): optional input symbol frequency table.
14223/// - `r`: sampling rate for the auxiliary indexes (must be a power of two).
14224/// - `i` (`[0..(n-1)/r]`): input auxiliary indexes.
14225/// - `threads`: number of worker threads (can be 0 for the implementation default).
14226///
14227/// Returns 0 on success, -1 or -2 on error.
14228pub fn libsais_unbwt_aux_omp(
14229    t: &[u8],
14230    u: &mut [u8],
14231    a: &mut [SaSint],
14232    freq: Option<&[SaSint]>,
14233    r: SaSint,
14234    i: &[SaSint],
14235    threads: SaSint,
14236) -> SaSint {
14237    let t_len = t.len();
14238    let n = SaSint::try_from(t_len).expect("input length must fit SaSint");
14239    if threads < 0
14240        || u.len() < t_len
14241        || a.len() < t_len
14242        || freq.is_some_and(|freq| freq.len() < ALPHABET_SIZE)
14243        || (r != n && (r < 2 || (r & (r - 1)) != 0))
14244    {
14245        return -1;
14246    }
14247    let sample_count = if n == 0 {
14248        1
14249    } else {
14250        ((n - 1) / r + 1) as usize
14251    };
14252    if i.len() < sample_count {
14253        return -1;
14254    }
14255
14256    if n <= 1 {
14257        if i[0] != n {
14258            return -1;
14259        }
14260        if n == 1 {
14261            u[0] = t[0];
14262        }
14263        return 0;
14264    }
14265
14266    for sample in i.iter().take(sample_count) {
14267        let sample = *sample;
14268        if sample <= 0 || sample > n {
14269            return -1;
14270        }
14271    }
14272
14273    let threads = if threads > 0 { threads } else { 1 };
14274    let i_u32: Vec<SaUint> = i
14275        .iter()
14276        .take(sample_count)
14277        .map(|&sample| SaUint::try_from(sample).expect("sample was validated positive"))
14278        .collect();
14279    let mut p = vec![0u32; t_len + 1];
14280    let result = unbwt_main(t, u, &mut p, n, freq, r, &i_u32, threads);
14281    for idx in 0..t_len {
14282        a[idx] = p[idx] as SaSint;
14283    }
14284    result
14285}
14286
14287/// Internal helper: bwt copy 8u.
14288#[doc(hidden)]
14289pub fn bwt_copy_8u(u: &mut [u8], a: &[SaSint], n: SaSint) {
14290    if n <= 0 {
14291        return;
14292    }
14293
14294    let n_usize = usize::try_from(n).expect("n must be non-negative");
14295    for i in 0..n_usize {
14296        u[i] = a[i] as u8;
14297    }
14298}
14299
14300/// Internal helper: bwt copy 8u (OpenMP variant).
14301#[doc(hidden)]
14302pub fn bwt_copy_8u_omp(u: &mut [u8], a: &[SaSint], n: SaSint, threads: SaSint) {
14303    if threads == 1 || n < 65_536 {
14304        bwt_copy_8u(u, a, n);
14305        return;
14306    }
14307
14308    let n_usize = usize::try_from(n).expect("n must be non-negative");
14309    assert!(u.len() >= n_usize);
14310    assert!(a.len() >= n_usize);
14311    let threads_usize = usize::try_from(threads).expect("threads must be non-negative");
14312    let chunk_size = ((n_usize / threads_usize) & !15usize).max(16);
14313    let a_ptr = a.as_ptr() as usize;
14314    run_rayon_with_threads(threads_usize, || {
14315        u[..n_usize]
14316            .par_chunks_mut(chunk_size)
14317            .enumerate()
14318            .for_each(|(chunk_index, chunk)| {
14319                let start = chunk_index * chunk_size;
14320                let dst_ptr = chunk.as_mut_ptr();
14321                let src_ptr = unsafe { (a_ptr as *const SaSint).add(start) };
14322                for offset in 0..chunk.len() {
14323                    unsafe {
14324                        *dst_ptr.add(offset) = *src_ptr.add(offset) as u8;
14325                    }
14326                }
14327            });
14328    });
14329}
14330
14331/// Internal helper: accumulate counts s32 2.
14332#[doc(hidden)]
14333pub fn accumulate_counts_s32_2(bucket00: &mut [SaSint], bucket01: &[SaSint]) {
14334    assert_eq!(bucket00.len(), bucket01.len());
14335    for (dst, src) in bucket00.iter_mut().zip(bucket01.iter()) {
14336        *dst += *src;
14337    }
14338}
14339
14340/// Internal helper: accumulate counts s32 3.
14341#[doc(hidden)]
14342pub fn accumulate_counts_s32_3(bucket00: &mut [SaSint], bucket01: &[SaSint], bucket02: &[SaSint]) {
14343    assert_eq!(bucket00.len(), bucket01.len());
14344    assert_eq!(bucket00.len(), bucket02.len());
14345    for ((dst, src1), src2) in bucket00
14346        .iter_mut()
14347        .zip(bucket01.iter())
14348        .zip(bucket02.iter())
14349    {
14350        *dst += *src1 + *src2;
14351    }
14352}
14353
14354/// Internal helper: accumulate counts s32 4.
14355#[doc(hidden)]
14356pub fn accumulate_counts_s32_4(
14357    bucket00: &mut [SaSint],
14358    bucket01: &[SaSint],
14359    bucket02: &[SaSint],
14360    bucket03: &[SaSint],
14361) {
14362    assert_eq!(bucket00.len(), bucket01.len());
14363    assert_eq!(bucket00.len(), bucket02.len());
14364    assert_eq!(bucket00.len(), bucket03.len());
14365    for (((dst, src1), src2), src3) in bucket00
14366        .iter_mut()
14367        .zip(bucket01.iter())
14368        .zip(bucket02.iter())
14369        .zip(bucket03.iter())
14370    {
14371        *dst += *src1 + *src2 + *src3;
14372    }
14373}
14374
14375/// Internal helper: accumulate counts s32 5.
14376#[doc(hidden)]
14377pub fn accumulate_counts_s32_5(
14378    bucket00: &mut [SaSint],
14379    bucket01: &[SaSint],
14380    bucket02: &[SaSint],
14381    bucket03: &[SaSint],
14382    bucket04: &[SaSint],
14383) {
14384    assert_eq!(bucket00.len(), bucket01.len());
14385    assert_eq!(bucket00.len(), bucket02.len());
14386    assert_eq!(bucket00.len(), bucket03.len());
14387    assert_eq!(bucket00.len(), bucket04.len());
14388    for ((((dst, src1), src2), src3), src4) in bucket00
14389        .iter_mut()
14390        .zip(bucket01.iter())
14391        .zip(bucket02.iter())
14392        .zip(bucket03.iter())
14393        .zip(bucket04.iter())
14394    {
14395        *dst += *src1 + *src2 + *src3 + *src4;
14396    }
14397}
14398
14399/// Internal helper: accumulate counts s32 6.
14400#[doc(hidden)]
14401pub fn accumulate_counts_s32_6(
14402    bucket00: &mut [SaSint],
14403    bucket01: &[SaSint],
14404    bucket02: &[SaSint],
14405    bucket03: &[SaSint],
14406    bucket04: &[SaSint],
14407    bucket05: &[SaSint],
14408) {
14409    assert_eq!(bucket00.len(), bucket01.len());
14410    assert_eq!(bucket00.len(), bucket02.len());
14411    assert_eq!(bucket00.len(), bucket03.len());
14412    assert_eq!(bucket00.len(), bucket04.len());
14413    assert_eq!(bucket00.len(), bucket05.len());
14414    for (((((dst, src1), src2), src3), src4), src5) in bucket00
14415        .iter_mut()
14416        .zip(bucket01.iter())
14417        .zip(bucket02.iter())
14418        .zip(bucket03.iter())
14419        .zip(bucket04.iter())
14420        .zip(bucket05.iter())
14421    {
14422        *dst += *src1 + *src2 + *src3 + *src4 + *src5;
14423    }
14424}
14425
14426/// Internal helper: accumulate counts s32 7.
14427#[doc(hidden)]
14428pub fn accumulate_counts_s32_7(
14429    bucket00: &mut [SaSint],
14430    bucket01: &[SaSint],
14431    bucket02: &[SaSint],
14432    bucket03: &[SaSint],
14433    bucket04: &[SaSint],
14434    bucket05: &[SaSint],
14435    bucket06: &[SaSint],
14436) {
14437    assert_eq!(bucket00.len(), bucket01.len());
14438    assert_eq!(bucket00.len(), bucket02.len());
14439    assert_eq!(bucket00.len(), bucket03.len());
14440    assert_eq!(bucket00.len(), bucket04.len());
14441    assert_eq!(bucket00.len(), bucket05.len());
14442    assert_eq!(bucket00.len(), bucket06.len());
14443    for ((((((dst, src1), src2), src3), src4), src5), src6) in bucket00
14444        .iter_mut()
14445        .zip(bucket01.iter())
14446        .zip(bucket02.iter())
14447        .zip(bucket03.iter())
14448        .zip(bucket04.iter())
14449        .zip(bucket05.iter())
14450        .zip(bucket06.iter())
14451    {
14452        *dst += *src1 + *src2 + *src3 + *src4 + *src5 + *src6;
14453    }
14454}
14455
14456/// Internal helper: accumulate counts s32 8.
14457#[doc(hidden)]
14458pub fn accumulate_counts_s32_8(
14459    bucket00: &mut [SaSint],
14460    bucket01: &[SaSint],
14461    bucket02: &[SaSint],
14462    bucket03: &[SaSint],
14463    bucket04: &[SaSint],
14464    bucket05: &[SaSint],
14465    bucket06: &[SaSint],
14466    bucket07: &[SaSint],
14467) {
14468    assert_eq!(bucket00.len(), bucket01.len());
14469    assert_eq!(bucket00.len(), bucket02.len());
14470    assert_eq!(bucket00.len(), bucket03.len());
14471    assert_eq!(bucket00.len(), bucket04.len());
14472    assert_eq!(bucket00.len(), bucket05.len());
14473    assert_eq!(bucket00.len(), bucket06.len());
14474    assert_eq!(bucket00.len(), bucket07.len());
14475    for (((((((dst, src1), src2), src3), src4), src5), src6), src7) in bucket00
14476        .iter_mut()
14477        .zip(bucket01.iter())
14478        .zip(bucket02.iter())
14479        .zip(bucket03.iter())
14480        .zip(bucket04.iter())
14481        .zip(bucket05.iter())
14482        .zip(bucket06.iter())
14483        .zip(bucket07.iter())
14484    {
14485        *dst += *src1 + *src2 + *src3 + *src4 + *src5 + *src6 + *src7;
14486    }
14487}
14488
14489/// Internal helper: accumulate counts s32 9.
14490#[doc(hidden)]
14491pub fn accumulate_counts_s32_9(
14492    bucket00: &mut [SaSint],
14493    bucket01: &[SaSint],
14494    bucket02: &[SaSint],
14495    bucket03: &[SaSint],
14496    bucket04: &[SaSint],
14497    bucket05: &[SaSint],
14498    bucket06: &[SaSint],
14499    bucket07: &[SaSint],
14500    bucket08: &[SaSint],
14501) {
14502    assert_eq!(bucket00.len(), bucket01.len());
14503    assert_eq!(bucket00.len(), bucket02.len());
14504    assert_eq!(bucket00.len(), bucket03.len());
14505    assert_eq!(bucket00.len(), bucket04.len());
14506    assert_eq!(bucket00.len(), bucket05.len());
14507    assert_eq!(bucket00.len(), bucket06.len());
14508    assert_eq!(bucket00.len(), bucket07.len());
14509    assert_eq!(bucket00.len(), bucket08.len());
14510    for ((((((((dst, src1), src2), src3), src4), src5), src6), src7), src8) in bucket00
14511        .iter_mut()
14512        .zip(bucket01.iter())
14513        .zip(bucket02.iter())
14514        .zip(bucket03.iter())
14515        .zip(bucket04.iter())
14516        .zip(bucket05.iter())
14517        .zip(bucket06.iter())
14518        .zip(bucket07.iter())
14519        .zip(bucket08.iter())
14520    {
14521        *dst += *src1 + *src2 + *src3 + *src4 + *src5 + *src6 + *src7 + *src8;
14522    }
14523}
14524
14525/// Internal helper: accumulate counts s32.
14526#[doc(hidden)]
14527pub fn accumulate_counts_s32(
14528    buckets: &mut [SaSint],
14529    bucket_size: FastSint,
14530    bucket_stride: FastSint,
14531    mut num_buckets: FastSint,
14532) {
14533    if num_buckets <= 1 {
14534        return;
14535    }
14536
14537    let bucket_size = usize::try_from(bucket_size).expect("bucket_size must be non-negative");
14538    let bucket_stride = usize::try_from(bucket_stride).expect("bucket_stride must be non-negative");
14539    let num_buckets_usize = usize::try_from(num_buckets).expect("num_buckets must be non-negative");
14540    assert!(buckets.len() >= bucket_size + (num_buckets_usize - 1) * bucket_stride);
14541    let bucket00_start = (num_buckets_usize - 1) * bucket_stride;
14542
14543    while num_buckets >= 9 {
14544        let start = bucket00_start
14545            - usize::try_from(num_buckets - 9).expect("non-negative") * bucket_stride;
14546        accumulate_counts_at(buckets, start, bucket_size, bucket_stride, 9);
14547        num_buckets -= 8;
14548    }
14549
14550    match num_buckets {
14551        1 => {}
14552        2..=8 => accumulate_counts_at(
14553            buckets,
14554            bucket00_start,
14555            bucket_size,
14556            bucket_stride,
14557            usize::try_from(num_buckets).expect("non-negative"),
14558        ),
14559        _ => {}
14560    }
14561}
14562
14563fn block_slice<T>(slice: &[T], block_start: FastSint, block_size: FastSint) -> &[T] {
14564    let start = usize::try_from(block_start).expect("block_start must be non-negative");
14565    let len = usize::try_from(block_size).expect("block_size must be non-negative");
14566    &slice[start..start + len]
14567}
14568
14569#[allow(dead_code)]
14570struct SharedMutArray<'a> {
14571    ptr: *mut SaSint,
14572    len: usize,
14573    _marker: PhantomData<&'a mut [SaSint]>,
14574}
14575
14576#[allow(dead_code)]
14577impl<'a> SharedMutArray<'a> {
14578    fn new(slice: &'a mut [SaSint]) -> Self {
14579        Self {
14580            ptr: slice.as_mut_ptr(),
14581            len: slice.len(),
14582            _marker: PhantomData,
14583        }
14584    }
14585
14586    fn len(&self) -> usize {
14587        self.len
14588    }
14589
14590    fn slice_mut(&mut self, start: usize, len: usize) -> &mut [SaSint] {
14591        assert!(start <= self.len);
14592        assert!(len <= self.len - start);
14593        unsafe {
14594            // The recursive driver aliases multiple logical views into one SA backing store.
14595            // This helper centralizes that checked projection so the driver can be translated
14596            // without pretending those regions are independent Rust slices.
14597            std::slice::from_raw_parts_mut(self.ptr.add(start), len)
14598        }
14599    }
14600}
14601
14602fn accumulate_counts_at(
14603    buckets: &mut [SaSint],
14604    bucket00_start: usize,
14605    bucket_size: usize,
14606    bucket_stride: usize,
14607    count: usize,
14608) {
14609    assert!((2..=9).contains(&count));
14610    assert!(bucket00_start >= (count - 1) * bucket_stride);
14611
14612    let dst_end = bucket00_start + bucket_size;
14613    let mut sums = vec![0; bucket_size];
14614
14615    for i in 0..count {
14616        let start = bucket00_start - i * bucket_stride;
14617        let end = start + bucket_size;
14618        for (sum, value) in sums.iter_mut().zip(buckets[start..end].iter()) {
14619            *sum += *value;
14620        }
14621    }
14622
14623    buckets[bucket00_start..dst_end].copy_from_slice(&sums);
14624}
14625
14626/// Internal helper: thread state size.
14627#[doc(hidden)]
14628pub fn thread_state_size() -> usize {
14629    mem::size_of::<ThreadState>()
14630}
14631
14632#[cfg(all(test, feature = "upstream-c"))]
14633mod tests {
14634    use super::*;
14635
14636    unsafe extern "C" {
14637        fn probe_renumber_lms_suffixes_8u(
14638            sa: *mut SaSint,
14639            m: SaSint,
14640            name: SaSint,
14641            omp_block_start: FastSint,
14642            omp_block_size: FastSint,
14643        ) -> SaSint;
14644
14645        fn probe_gather_marked_lms_suffixes(
14646            sa: *mut SaSint,
14647            m: SaSint,
14648            l: FastSint,
14649            omp_block_start: FastSint,
14650            omp_block_size: FastSint,
14651        ) -> FastSint;
14652
14653        fn probe_renumber_distinct_lms_suffixes_32s_4k(
14654            sa: *mut SaSint,
14655            m: SaSint,
14656            name: SaSint,
14657            omp_block_start: FastSint,
14658            omp_block_size: FastSint,
14659        ) -> SaSint;
14660
14661        fn probe_renumber_unique_and_nonunique_lms_suffixes_32s(
14662            t: *mut SaSint,
14663            sa: *mut SaSint,
14664            m: SaSint,
14665            f: SaSint,
14666            omp_block_start: FastSint,
14667            omp_block_size: FastSint,
14668        ) -> SaSint;
14669
14670        fn probe_renumber_unique_and_nonunique_lms_suffixes_32s_omp(
14671            t: *mut SaSint,
14672            sa: *mut SaSint,
14673            m: SaSint,
14674            threads: SaSint,
14675        ) -> SaSint;
14676
14677        fn probe_renumber_and_gather_lms_suffixes_omp(
14678            sa: *mut SaSint,
14679            n: SaSint,
14680            m: SaSint,
14681            fs: SaSint,
14682            threads: SaSint,
14683        ) -> SaSint;
14684
14685        fn probe_renumber_and_mark_distinct_lms_suffixes_32s_4k_omp(
14686            sa: *mut SaSint,
14687            n: SaSint,
14688            m: SaSint,
14689            threads: SaSint,
14690        ) -> SaSint;
14691
14692        fn probe_main_32s_entry(
14693            t: *mut SaSint,
14694            sa: *mut SaSint,
14695            n: SaSint,
14696            k: SaSint,
14697            fs: SaSint,
14698            threads: SaSint,
14699        ) -> SaSint;
14700
14701        fn probe_public_libsais_freq(
14702            t: *const u8,
14703            sa: *mut SaSint,
14704            n: SaSint,
14705            fs: SaSint,
14706            freq: *mut SaSint,
14707        ) -> SaSint;
14708
14709        fn probe_public_libsais_gsa_freq(
14710            t: *const u8,
14711            sa: *mut SaSint,
14712            n: SaSint,
14713            fs: SaSint,
14714            freq: *mut SaSint,
14715        ) -> SaSint;
14716
14717        fn probe_public_libsais_bwt_freq(
14718            t: *const u8,
14719            u: *mut u8,
14720            a: *mut SaSint,
14721            n: SaSint,
14722            fs: SaSint,
14723            freq: *mut SaSint,
14724        ) -> SaSint;
14725
14726        fn probe_public_libsais_bwt_aux_freq(
14727            t: *const u8,
14728            u: *mut u8,
14729            a: *mut SaSint,
14730            n: SaSint,
14731            fs: SaSint,
14732            freq: *mut SaSint,
14733            r: SaSint,
14734            i: *mut SaSint,
14735        ) -> SaSint;
14736
14737        fn probe_public_libsais_unbwt_freq(
14738            t: *const u8,
14739            u: *mut u8,
14740            a: *mut SaSint,
14741            n: SaSint,
14742            freq: *const SaSint,
14743            i: SaSint,
14744        ) -> SaSint;
14745
14746        fn probe_public_libsais_unbwt_aux_freq(
14747            t: *const u8,
14748            u: *mut u8,
14749            a: *mut SaSint,
14750            n: SaSint,
14751            freq: *const SaSint,
14752            r: SaSint,
14753            i: *const SaSint,
14754        ) -> SaSint;
14755    }
14756
14757    fn make_recursive_main_32s_text(repeats: usize) -> Vec<SaSint> {
14758        let motif = [9, 4, 9, 2, 9, 4, 9, 1];
14759        let mut t = Vec::with_capacity(repeats * motif.len() + 1);
14760        for _ in 0..repeats {
14761            t.extend_from_slice(&motif);
14762        }
14763        t.push(0);
14764        t
14765    }
14766
14767    fn make_large_main_32s_stress_text(len: usize, alphabet: SaSint) -> Vec<SaSint> {
14768        let mut state: u32 = 0x1357_9bdf;
14769        let mut t = Vec::with_capacity(len + 1);
14770
14771        for i in 0..len {
14772            state = state.wrapping_mul(1_664_525).wrapping_add(1_013_904_223);
14773            let mut value = ((state >> 16) % (alphabet as u32 - 1)) as SaSint + 1;
14774
14775            if i % 17 < 8 {
14776                value = ((i / 17) as SaSint % 11) + 1;
14777            }
14778            if i % 29 < 10 {
14779                value = (((i / 29) as SaSint * 3) % 19) + 1;
14780            }
14781            if i % 64 >= 48 {
14782                value = t[i - 48];
14783            }
14784
14785            t.push(value);
14786        }
14787
14788        t.push(0);
14789        t
14790    }
14791
14792    fn assert_main_32s_entry_matches_upstream_c(
14793        t: Vec<SaSint>,
14794        k: SaSint,
14795        fs: SaSint,
14796        compare_full_sa: bool,
14797    ) {
14798        let mut t = t;
14799        let n = t.len() as SaSint;
14800        let n_usize = t.len();
14801        let threads = 1;
14802        let extra = usize::try_from(fs).expect("fs must be non-negative");
14803        let mut sa = vec![0; t.len() + extra];
14804
14805        let initial_t = t.clone();
14806        let initial_sa = sa.clone();
14807
14808        let c_result =
14809            unsafe { probe_main_32s_entry(t.as_mut_ptr(), sa.as_mut_ptr(), n, k, fs, threads) };
14810        let c_t = t.clone();
14811        let c_sa = sa.clone();
14812
14813        t.copy_from_slice(&initial_t);
14814        sa.copy_from_slice(&initial_sa);
14815
14816        let mut thread_state = alloc_thread_state(threads).expect("thread state");
14817        let rust_result =
14818            libsais_main_32s_entry(&mut t, &mut sa, n, k, fs, threads, &mut thread_state);
14819
14820        assert_eq!(rust_result, c_result);
14821        assert_slice_eq_with_first_diff("T", &t, &c_t);
14822        if compare_full_sa {
14823            assert_slice_eq_with_first_diff("SA", &sa, &c_sa);
14824        } else {
14825            assert_slice_eq_with_first_diff("SA", &sa[..n_usize], &c_sa[..n_usize]);
14826        }
14827    }
14828
14829    fn assert_main_32s_entry_matches_upstream_c_for_branch(k: SaSint) {
14830        assert_main_32s_entry_matches_upstream_c(
14831            vec![17, 3, 17, 9, 5, 9, 2, 11, 2, 7, 1, 7, 0],
14832            k,
14833            0,
14834            true,
14835        );
14836    }
14837
14838    fn assert_slice_eq_with_first_diff(label: &str, left: &[SaSint], right: &[SaSint]) {
14839        assert_eq!(left.len(), right.len(), "{label} length mismatch");
14840        if let Some((idx, (l, r))) = left
14841            .iter()
14842            .zip(right.iter())
14843            .enumerate()
14844            .find(|(_, (l, r))| l != r)
14845        {
14846            panic!("{label} first diff at index {idx}: rust={l}, c={r}");
14847        }
14848    }
14849
14850    #[test]
14851    fn align_up_matches_power_of_two_alignment() {
14852        assert_eq!(align_up(0, 4096), 0);
14853        assert_eq!(align_up(1, 4096), 4096);
14854        assert_eq!(align_up(4095, 4096), 4096);
14855        assert_eq!(align_up(4096, 4096), 4096);
14856        assert_eq!(align_up(4097, 4096), 8192);
14857        assert_eq!(align_up(65, 64), 128);
14858    }
14859
14860    #[test]
14861    fn shared_mut_array_projects_mutable_spans_from_one_backing_buffer() {
14862        let mut backing = vec![1, 2, 3, 4, 5, 6];
14863        let len;
14864        {
14865            let mut shared = SharedMutArray::new(&mut backing);
14866            shared.slice_mut(1, 3).copy_from_slice(&[20, 30, 40]);
14867            shared.slice_mut(4, 2).copy_from_slice(&[50, 60]);
14868            len = shared.len();
14869        }
14870        assert_eq!(backing, vec![1, 20, 30, 40, 50, 60]);
14871        assert_eq!(len, 6);
14872    }
14873
14874    #[test]
14875    fn create_ctx_main_matches_single_thread_layout() {
14876        let ctx = create_ctx_main(1).expect("context");
14877        assert_eq!(ctx.buckets.len(), 8 * ALPHABET_SIZE);
14878        assert_eq!(ctx.threads, 1);
14879        assert!(ctx.thread_state.is_none());
14880    }
14881
14882    #[test]
14883    fn create_ctx_main_allocates_thread_state_for_multi_threaded_mode() {
14884        let ctx = create_ctx_main(3).expect("context");
14885        let states = ctx.thread_state.expect("thread state");
14886        assert_eq!(states.len(), 3);
14887        assert!(states
14888            .iter()
14889            .all(|state| state.buckets.len() == 4 * ALPHABET_SIZE));
14890        assert!(states
14891            .iter()
14892            .all(|state| state.cache.len() == LIBSAIS_PER_THREAD_CACHE_SIZE));
14893    }
14894
14895    #[test]
14896    fn create_ctx_wraps_single_thread_main_context() {
14897        let ctx = create_ctx().expect("context");
14898        assert_eq!(ctx.threads, 1);
14899        assert_eq!(ctx.buckets.len(), 8 * ALPHABET_SIZE);
14900        assert!(ctx.thread_state.is_none());
14901    }
14902
14903    #[test]
14904    fn free_ctx_accepts_context_value() {
14905        let ctx = create_ctx().expect("context");
14906        free_ctx(ctx);
14907    }
14908
14909    fn brute_force_suffix_array_u8(t: &[u8]) -> Vec<SaSint> {
14910        let mut sa: Vec<SaSint> = (0..t.len())
14911            .map(|index| SaSint::try_from(index).expect("index must fit SaSint"))
14912            .collect();
14913        sa.sort_by(|&lhs, &rhs| {
14914            t[usize::try_from(lhs).expect("non-negative")..]
14915                .cmp(&t[usize::try_from(rhs).expect("non-negative")..])
14916        });
14917        sa
14918    }
14919
14920    fn brute_force_plcp_u8(t: &[u8], sa: &[SaSint]) -> Vec<SaSint> {
14921        let mut rank = vec![0usize; t.len()];
14922        for (i, &suffix) in sa.iter().enumerate() {
14923            rank[usize::try_from(suffix).expect("suffix index must be non-negative")] = i;
14924        }
14925
14926        let mut plcp = vec![0; t.len()];
14927        for i in 0..t.len() {
14928            let r = rank[i];
14929            let prev = if r == 0 {
14930                t.len()
14931            } else {
14932                usize::try_from(sa[r - 1]).expect("suffix index must be non-negative")
14933            };
14934            if prev == t.len() {
14935                plcp[i] = 0;
14936                continue;
14937            }
14938
14939            let mut l = 0usize;
14940            while i + l < t.len() && prev + l < t.len() && t[i + l] == t[prev + l] {
14941                l += 1;
14942            }
14943            plcp[i] = l as SaSint;
14944        }
14945        plcp
14946    }
14947
14948    fn brute_force_lcp_from_sa_u8(t: &[u8], sa: &[SaSint]) -> Vec<SaSint> {
14949        let mut lcp = vec![0; sa.len()];
14950        for i in 0..sa.len() {
14951            let lhs = usize::try_from(sa[i]).expect("suffix index must be non-negative");
14952            let rhs = if i == 0 {
14953                sa.len()
14954            } else {
14955                usize::try_from(sa[i - 1]).expect("suffix index must be non-negative")
14956            };
14957            if rhs == sa.len() {
14958                lcp[i] = 0;
14959                continue;
14960            }
14961
14962            let mut l = 0usize;
14963            while lhs + l < t.len() && rhs + l < t.len() && t[lhs + l] == t[rhs + l] {
14964                l += 1;
14965            }
14966            lcp[i] = l as SaSint;
14967        }
14968        lcp
14969    }
14970
14971    #[test]
14972    fn libsais_matches_bruteforce_suffix_array_for_small_text() {
14973        let t = b"banana";
14974        let mut sa = vec![0; t.len()];
14975        let mut freq = vec![0; ALPHABET_SIZE];
14976
14977        let result = libsais(t, &mut sa, 0, Some(&mut freq));
14978
14979        assert_eq!(result, 0);
14980        assert_eq!(sa, brute_force_suffix_array_u8(t));
14981        assert_eq!(freq[b'a' as usize], 3);
14982        assert_eq!(freq[b'b' as usize], 1);
14983        assert_eq!(freq[b'n' as usize], 2);
14984    }
14985
14986    #[test]
14987    fn public_libsais_frequency_outputs_match_upstream_c() {
14988        let text = b"banana";
14989        let gsa_text = b"ban\0ana\0";
14990        let mut rust_sa = vec![0; text.len()];
14991        let mut c_sa = vec![0; text.len()];
14992        let mut rust_freq = vec![-1; ALPHABET_SIZE];
14993        let mut c_freq = vec![-1; ALPHABET_SIZE];
14994
14995        let rust_rc = libsais(text, &mut rust_sa, 0, Some(&mut rust_freq));
14996        let c_rc = unsafe {
14997            probe_public_libsais_freq(
14998                text.as_ptr(),
14999                c_sa.as_mut_ptr(),
15000                text.len() as SaSint,
15001                0,
15002                c_freq.as_mut_ptr(),
15003            )
15004        };
15005        assert_eq!(rust_rc, c_rc);
15006        assert_eq!(rust_sa, c_sa);
15007        assert_eq!(rust_freq, c_freq);
15008
15009        let mut rust_gsa = vec![0; gsa_text.len()];
15010        let mut c_gsa = vec![0; gsa_text.len()];
15011        rust_freq.fill(-1);
15012        c_freq.fill(-1);
15013        let rust_rc = libsais_gsa(gsa_text, &mut rust_gsa, 0, Some(&mut rust_freq));
15014        let c_rc = unsafe {
15015            probe_public_libsais_gsa_freq(
15016                gsa_text.as_ptr(),
15017                c_gsa.as_mut_ptr(),
15018                gsa_text.len() as SaSint,
15019                0,
15020                c_freq.as_mut_ptr(),
15021            )
15022        };
15023        assert_eq!(rust_rc, c_rc);
15024        assert_eq!(rust_gsa, c_gsa);
15025        assert_eq!(rust_freq, c_freq);
15026
15027        let mut rust_u = vec![0; text.len()];
15028        let mut rust_a = vec![0; text.len()];
15029        let mut c_u = vec![0; text.len()];
15030        let mut c_a = vec![0; text.len()];
15031        rust_freq.fill(-1);
15032        c_freq.fill(-1);
15033        let rust_rc = libsais_bwt(text, &mut rust_u, &mut rust_a, 0, Some(&mut rust_freq));
15034        let c_rc = unsafe {
15035            probe_public_libsais_bwt_freq(
15036                text.as_ptr(),
15037                c_u.as_mut_ptr(),
15038                c_a.as_mut_ptr(),
15039                text.len() as SaSint,
15040                0,
15041                c_freq.as_mut_ptr(),
15042            )
15043        };
15044        assert_eq!(rust_rc, c_rc);
15045        assert_eq!(rust_u, c_u);
15046        assert_eq!(rust_freq, c_freq);
15047
15048        let r = 4;
15049        let mut rust_i = vec![0; (text.len() - 1) / r as usize + 1];
15050        let mut c_i = vec![0; rust_i.len()];
15051        rust_freq.fill(-1);
15052        c_freq.fill(-1);
15053        let rust_rc = libsais_bwt_aux(
15054            text,
15055            &mut rust_u,
15056            &mut rust_a,
15057            0,
15058            Some(&mut rust_freq),
15059            r,
15060            &mut rust_i,
15061        );
15062        let c_rc = unsafe {
15063            probe_public_libsais_bwt_aux_freq(
15064                text.as_ptr(),
15065                c_u.as_mut_ptr(),
15066                c_a.as_mut_ptr(),
15067                text.len() as SaSint,
15068                0,
15069                c_freq.as_mut_ptr(),
15070                r,
15071                c_i.as_mut_ptr(),
15072            )
15073        };
15074        assert_eq!(rust_rc, c_rc);
15075        assert_eq!(rust_u, c_u);
15076        assert_eq!(rust_i, c_i);
15077        assert_eq!(rust_freq, c_freq);
15078    }
15079
/// Builds the BWT of `"abracadabra"` with the Rust implementation, then checks
/// that `libsais_unbwt` and `libsais_unbwt_aux` (both given a frequency table)
/// return the same code and the same decoded bytes as the upstream C probes,
/// and that the decoded bytes equal the original text.
#[test]
fn public_libsais_unbwt_with_frequency_matches_upstream_c() {
    let text = b"abracadabra";
    let n = text.len();

    // `freq` is passed mutably to the forward transform and then read-only to
    // every inverse call below.
    let mut freq = vec![0; ALPHABET_SIZE];
    let mut bwt = vec![0; n];
    let mut work = vec![0; n];
    let primary = libsais_bwt(text, &mut bwt, &mut work, 0, Some(&mut freq));
    assert!(primary >= 0);

    // Separate output and scratch buffers for the Rust and the C side.
    let mut rust_out = vec![0; n];
    let mut rust_scratch = vec![0; n + 1];
    let mut c_out = vec![0; n];
    let mut c_scratch = vec![0; n + 1];

    // --- primary-index variant ---
    let rust_rc = libsais_unbwt(
        &bwt,
        &mut rust_out,
        &mut rust_scratch,
        Some(&freq),
        primary,
    );
    // SAFETY: every pointer comes from a live `Vec` declared above and the
    // length argument is `bwt.len()`. NOTE(review): assumes the probe touches
    // at most n (+1 for the scratch buffer) elements -- confirm against the C shim.
    let c_rc = unsafe {
        probe_public_libsais_unbwt_freq(
            bwt.as_ptr(),
            c_out.as_mut_ptr(),
            c_scratch.as_mut_ptr(),
            bwt.len() as SaSint,
            freq.as_ptr(),
            primary,
        )
    };
    assert_eq!(rust_rc, c_rc);
    assert_eq!(rust_out, c_out);
    assert_eq!(rust_out, text);

    // --- auxiliary-index variant: recompute the BWT with sampling rate `r` ---
    let r = 4;
    let aux_len = (n - 1) / r as usize + 1;
    let mut aux = vec![0; aux_len];
    let bwt_rc = libsais_bwt_aux(text, &mut bwt, &mut work, 0, Some(&mut freq), r, &mut aux);
    assert_eq!(bwt_rc, 0);

    // Clear all outputs so results from the first round cannot leak through.
    rust_out.fill(0);
    rust_scratch.fill(0);
    c_out.fill(0);
    c_scratch.fill(0);

    let rust_rc = libsais_unbwt_aux(
        &bwt,
        &mut rust_out,
        &mut rust_scratch,
        Some(&freq),
        r,
        &aux,
    );
    // SAFETY: same buffers as above, plus `aux`, which stays alive for the call.
    let c_rc = unsafe {
        probe_public_libsais_unbwt_aux_freq(
            bwt.as_ptr(),
            c_out.as_mut_ptr(),
            c_scratch.as_mut_ptr(),
            bwt.len() as SaSint,
            freq.as_ptr(),
            r,
            aux.as_ptr(),
        )
    };
    assert_eq!(rust_rc, c_rc);
    assert_eq!(rust_out, c_out);
    assert_eq!(rust_out, text);
}
15133
/// Runs the SA, GSA, BWT and BWT-aux entry points both directly and through
/// their `_omp` variants (final argument `2`) and checks that return codes,
/// outputs and frequency tables agree.
#[test]
fn libsais_omp_frequency_wrappers_match_direct_calls() {
    let text = b"banana";
    let gsa_text = b"ban\0ana\0";
    let n = text.len();

    // Suffix array. Frequency tables are pre-filled with -1 before every
    // round so that an untouched table would differ from a written one.
    let mut direct_sa = vec![0; n];
    let mut omp_sa = vec![0; n];
    let mut direct_freq = vec![-1; ALPHABET_SIZE];
    let mut omp_freq = vec![-1; ALPHABET_SIZE];
    let direct_rc = libsais(text, &mut direct_sa, 0, Some(&mut direct_freq));
    assert_eq!(direct_rc, 0);
    let omp_rc = libsais_omp(text, &mut omp_sa, 0, Some(&mut omp_freq), 2);
    assert_eq!(omp_rc, 0);
    assert_eq!(omp_sa, direct_sa);
    assert_eq!(omp_freq, direct_freq);

    // Generalized suffix array over the `\0`-separated input.
    let gsa_len = gsa_text.len();
    let mut direct_gsa = vec![0; gsa_len];
    let mut omp_gsa = vec![0; gsa_len];
    direct_freq.fill(-1);
    omp_freq.fill(-1);
    let direct_rc = libsais_gsa(gsa_text, &mut direct_gsa, 0, Some(&mut direct_freq));
    assert_eq!(direct_rc, 0);
    let omp_rc = libsais_gsa_omp(gsa_text, &mut omp_gsa, 0, Some(&mut omp_freq), 2);
    assert_eq!(omp_rc, 0);
    assert_eq!(omp_gsa, direct_gsa);
    assert_eq!(omp_freq, direct_freq);

    // BWT: here the return values are compared with each other, not with 0.
    let mut direct_bwt = vec![0; n];
    let mut direct_work = vec![0; n];
    let mut omp_bwt = vec![0; n];
    let mut omp_work = vec![0; n];
    direct_freq.fill(-1);
    omp_freq.fill(-1);
    let direct_rc = libsais_bwt(
        text,
        &mut direct_bwt,
        &mut direct_work,
        0,
        Some(&mut direct_freq),
    );
    let omp_rc = libsais_bwt_omp(text, &mut omp_bwt, &mut omp_work, 0, Some(&mut omp_freq), 2);
    assert_eq!(direct_rc, omp_rc);
    assert_eq!(omp_bwt, direct_bwt);
    assert_eq!(omp_freq, direct_freq);

    // BWT with auxiliary indexes, sampling rate 4, two aux slots.
    let mut direct_aux = vec![0; 2];
    let mut omp_aux = vec![0; 2];
    direct_freq.fill(-1);
    omp_freq.fill(-1);
    let direct_rc = libsais_bwt_aux(
        text,
        &mut direct_bwt,
        &mut direct_work,
        0,
        Some(&mut direct_freq),
        4,
        &mut direct_aux,
    );
    let omp_rc = libsais_bwt_aux_omp(
        text,
        &mut omp_bwt,
        &mut omp_work,
        0,
        Some(&mut omp_freq),
        4,
        &mut omp_aux,
        2,
    );
    assert_eq!(direct_rc, omp_rc);
    assert_eq!(omp_bwt, direct_bwt);
    assert_eq!(omp_aux, direct_aux);
    assert_eq!(omp_freq, direct_freq);
}
15211
/// Unpacks the forward strand (one 2-bit code per output byte) from the bytes
/// of an `L2B` fixture file.
///
/// Layout as read by this function: a 64-byte header starting with the magic
/// `L2B\x01`, with little-endian `u64` fields at byte offsets 8 (`n_ctg`),
/// 16 (`tot_len`), 24 (`n_ambi`), 32 (`n_mask`) and 56 (`n_pac`); then
/// `8 * n_ctg + 16 * n_ambi + 16 * n_mask` bytes that are skipped; then
/// `n_pac` little-endian `u64` words holding 32 bases each, lowest bits first.
/// NOTE(review): the meaning of the skipped tables is inferred from the field
/// names only -- confirm against the minibwa writer.
///
/// `l2b_path` is used only in panic messages.
///
/// # Panics
/// Panics when the header is short, the magic is wrong, the packed section is
/// truncated, or `tot_len` exceeds the number of bases the packed section holds.
fn minibwa_yeast_decode_l2b(bytes: &[u8], l2b_path: &str) -> Vec<u8> {
    assert!(bytes.len() >= 64, "short l2b fixture: {l2b_path}");
    assert_eq!(&bytes[..4], b"L2B\x01", "bad l2b magic in {l2b_path}");

    // Reads one little-endian u64 header field; the 64-byte check above
    // guarantees every offset used below is in range.
    let header_word = |offset: usize| {
        let raw: [u8; 8] = bytes[offset..offset + 8]
            .try_into()
            .expect("header field is exactly 8 bytes");
        u64::from_le_bytes(raw) as usize
    };
    let n_ctg = header_word(8);
    let tot_len = header_word(16);
    let n_ambi = header_word(24);
    let n_mask = header_word(32);
    let n_pac = header_word(56);

    let pac_start = 64 + 8 * n_ctg + 16 * n_ambi + 16 * n_mask;
    assert!(
        bytes.len() >= pac_start + 8 * n_pac,
        "truncated l2b pac in {l2b_path}"
    );
    // Without this check a corrupt header would surface as an anonymous
    // index-out-of-bounds panic in the unpacking loop below.
    assert!(
        tot_len <= 32 * n_pac,
        "l2b pac shorter than tot_len in {l2b_path}"
    );

    let pac: Vec<u64> = bytes[pac_start..pac_start + 8 * n_pac]
        .chunks_exact(8)
        .map(|chunk| {
            u64::from_le_bytes(chunk.try_into().expect("chunks_exact yields 8-byte chunks"))
        })
        .collect();

    (0..tot_len)
        .map(|i| ((pac[i >> 5] >> ((i & 31) << 1)) & 3) as u8)
        .collect()
}

/// Converts FASTA text into 2-bit base codes (`A`=0, `C`=1, `G`=2, `T`/`U`=3,
/// case-insensitive), skipping header lines that start with `>`.
///
/// Every other byte is replaced by a pseudo-random code in `0..=3` drawn from
/// a SplitMix64 sequence seeded with 11, so the output is deterministic.
fn minibwa_yeast_encode_fasta(fasta: &str) -> Vec<u8> {
    let mut rng = 11u64;
    let mut forward = Vec::with_capacity(fasta.len());
    for line in fasta.lines().filter(|line| !line.starts_with('>')) {
        forward.extend(line.bytes().map(|b| match b {
            b'A' | b'a' => 0,
            b'C' | b'c' => 1,
            b'G' | b'g' => 2,
            b'T' | b't' | b'U' | b'u' => 3,
            _ => {
                // One SplitMix64 step; only the low two bits are kept.
                rng = rng.wrapping_add(0x9e3779b97f4a7c15);
                let mut z = rng;
                z = (z ^ (z >> 30)).wrapping_mul(0xbf58476d1ce4e5b9);
                z = (z ^ (z >> 27)).wrapping_mul(0x94d049bb133111eb);
                ((z ^ (z >> 31)) & 3) as u8
            }
        }));
    }
    forward
}

/// Loads the local minibwa yeast fixture and returns the two-strand text:
/// the forward strand followed by its reverse complement (`3 - code`, reversed).
///
/// The packed `.l2b` file is preferred; the sanitized FASTA is the fallback.
/// Returns `None`, after printing a note to stderr, when neither file can be
/// read, so callers can skip the test.
///
/// # Panics
/// Panics when a fixture is malformed or the forward strand is not longer
/// than 12,000,000 bases.
fn minibwa_yeast_two_strand_text() -> Option<Vec<u8>> {
    let l2b_path =
        "/data/henriksson/github/claude/minibwa/.tmp/compare-yeast-now/ref.split.rust.l2b";
    let fasta_path =
        "/data/henriksson/github/claude/minibwa/.tmp/large-real/yeast/ref.sanitized.fa";

    let forward = if let Ok(bytes) = std::fs::read(l2b_path) {
        minibwa_yeast_decode_l2b(&bytes, l2b_path)
    } else if let Ok(fasta) = std::fs::read_to_string(fasta_path) {
        minibwa_yeast_encode_fasta(&fasta)
    } else {
        eprintln!("skipping missing fixtures: {l2b_path} and {fasta_path}");
        return None;
    };
    assert!(
        forward.len() > 12_000_000,
        "fixture should exercise the minibwa yeast index workload"
    );

    let mut text = Vec::with_capacity(forward.len() * 2);
    text.extend_from_slice(&forward);
    text.extend(forward.iter().rev().map(|&c| 3 - c));
    Some(text)
}

/// Runs `libsais_omp` (4 threads, 10,000 extra slots) on the two-strand yeast
/// text and checks that every suffix-array entry is a valid text position.
/// Silently returns when the local fixtures are missing.
#[test]
#[ignore = "large real-data regression; requires local minibwa yeast fixture"]
fn public_libsais_omp_handles_minibwa_yeast_two_strand_index_input() {
    let text = match minibwa_yeast_two_strand_text() {
        Some(text) => text,
        None => return,
    };

    const FS: SaSint = 10_000;
    // One leading slot is left unused; the suffix array proper starts at index 1.
    let mut sa = vec![0; text.len() + FS as usize + 1];
    assert_eq!(libsais_omp(&text, &mut sa[1..], FS, None, 4), 0);
    if let Some((i, &value)) = sa[1..1 + text.len()]
        .iter()
        .enumerate()
        .find(|&(_, &value)| value < 0 || value as usize >= text.len())
    {
        panic!("invalid suffix-array entry at {i}: {value}");
    }
}

/// Runs `libsais` and `libsais_omp` (4 threads) on the two-strand yeast text
/// and reports the first position at which the two suffix arrays differ.
/// Silently returns when the local fixtures are missing.
#[test]
#[ignore = "large real-data regression; requires local minibwa yeast fixture"]
fn public_libsais_omp_matches_plain_on_minibwa_yeast_two_strand_index_input() {
    let text = match minibwa_yeast_two_strand_text() {
        Some(text) => text,
        None => return,
    };

    const FS: SaSint = 10_000;
    let mut plain_sa = vec![0; text.len() + FS as usize + 1];
    let mut omp_sa = vec![0; text.len() + FS as usize + 1];
    assert_eq!(libsais(&text, &mut plain_sa[1..], FS, None), 0);
    assert_eq!(libsais_omp(&text, &mut omp_sa[1..], FS, None, 4), 0);
    // Slot 0 is set to the text length in both arrays before comparing.
    plain_sa[0] = text.len() as SaSint;
    omp_sa[0] = text.len() as SaSint;
    if let Some(i) = plain_sa[..=text.len()]
        .iter()
        .zip(&omp_sa[..=text.len()])
        .position(|(plain, omp)| plain != omp)
    {
        panic!(
            "first suffix-array diff at {i}: plain={} omp={}",
            plain_sa[i], omp_sa[i]
        );
    }
}
15378
/// Checks that `libsais_unbwt_omp` and `libsais_unbwt_aux_omp` (final argument
/// `2`) agree with the direct inverse-BWT calls on return code and decoded
/// output, and that the output equals the original text.
#[test]
fn libsais_unbwt_omp_frequency_wrappers_match_direct_calls() {
    let text = b"abracadabra";
    let n = text.len();

    // Forward BWT provides the input and the frequency table for both rounds.
    let mut freq = vec![0; ALPHABET_SIZE];
    let mut bwt = vec![0; n];
    let mut work = vec![0; n];
    let primary = libsais_bwt(text, &mut bwt, &mut work, 0, Some(&mut freq));
    assert!(primary >= 0);

    let mut direct = vec![0; n];
    let mut direct_work = vec![0; n + 1];
    let mut omp = vec![0; n];
    let mut omp_work = vec![0; n + 1];

    // Round 1: inversion driven by the primary index.
    let direct_rc = libsais_unbwt(&bwt, &mut direct, &mut direct_work, Some(&freq), primary);
    let omp_rc = libsais_unbwt_omp(&bwt, &mut omp, &mut omp_work, Some(&freq), primary, 2);
    assert_eq!(direct_rc, omp_rc);
    assert_eq!(omp, direct);
    assert_eq!(omp, text);

    // Round 2: recompute the BWT with auxiliary indexes at sampling rate 4.
    let aux_len = (n - 1) / 4 + 1;
    let mut aux = vec![0; aux_len];
    let aux_rc = libsais_bwt_aux(text, &mut bwt, &mut work, 0, Some(&mut freq), 4, &mut aux);
    assert_eq!(aux_rc, 0);

    // Wipe the outputs so round 1 results cannot mask a failure.
    direct.fill(0);
    direct_work.fill(0);
    omp.fill(0);
    omp_work.fill(0);

    let direct_rc = libsais_unbwt_aux(&bwt, &mut direct, &mut direct_work, Some(&freq), 4, &aux);
    let omp_rc = libsais_unbwt_aux_omp(&bwt, &mut omp, &mut omp_work, Some(&freq), 4, &aux, 2);
    assert_eq!(direct_rc, omp_rc);
    assert_eq!(omp, direct);
    assert_eq!(omp, text);
}
15415
/// Checks that `libsais_ctx`, using a context from `create_ctx`, succeeds and
/// yields the same suffix array as plain `libsais` for `"mississippi"`.
#[test]
fn libsais_ctx_matches_plain_entry_point_for_small_text() {
    let text = b"mississippi";
    let n = text.len();
    let mut plain_sa = vec![0; n];
    let mut ctx_sa = vec![0; n];

    let plain_rc = libsais(text, &mut plain_sa, 0, None);

    let mut ctx = create_ctx().expect("context");
    let ctx_rc = libsais_ctx(&mut ctx, text, &mut ctx_sa, 0, None);

    assert_eq!(plain_rc, 0);
    assert_eq!(ctx_rc, 0);
    assert_eq!(ctx_sa, plain_sa);
}
15430
/// Runs the SA, GSA, BWT and BWT-aux entry points both directly and through
/// their `_ctx` variants, reusing one context for all four rounds, and checks
/// that return codes, outputs and frequency tables agree.
#[test]
fn libsais_ctx_frequency_wrappers_match_direct_calls() {
    let text = b"banana";
    let gsa_text = b"ban\0ana\0";
    let n = text.len();
    let mut ctx = create_ctx().expect("context");

    // Suffix array. Frequency tables are pre-filled with -1 before every
    // round so that an untouched table would differ from a written one.
    let mut direct_sa = vec![0; n];
    let mut ctx_sa = vec![0; n];
    let mut direct_freq = vec![-1; ALPHABET_SIZE];
    let mut ctx_freq = vec![-1; ALPHABET_SIZE];
    let direct_rc = libsais(text, &mut direct_sa, 0, Some(&mut direct_freq));
    assert_eq!(direct_rc, 0);
    let ctx_rc = libsais_ctx(&mut ctx, text, &mut ctx_sa, 0, Some(&mut ctx_freq));
    assert_eq!(ctx_rc, 0);
    assert_eq!(ctx_sa, direct_sa);
    assert_eq!(ctx_freq, direct_freq);

    // Generalized suffix array over the `\0`-separated input.
    let gsa_len = gsa_text.len();
    let mut direct_gsa = vec![0; gsa_len];
    let mut ctx_gsa = vec![0; gsa_len];
    direct_freq.fill(-1);
    ctx_freq.fill(-1);
    let direct_rc = libsais_gsa(gsa_text, &mut direct_gsa, 0, Some(&mut direct_freq));
    assert_eq!(direct_rc, 0);
    let ctx_rc = libsais_gsa_ctx(&mut ctx, gsa_text, &mut ctx_gsa, 0, Some(&mut ctx_freq));
    assert_eq!(ctx_rc, 0);
    assert_eq!(ctx_gsa, direct_gsa);
    assert_eq!(ctx_freq, direct_freq);

    // BWT: the two return values are compared with each other, not with 0.
    let mut direct_bwt = vec![0; n];
    let mut direct_work = vec![0; n];
    let mut ctx_bwt = vec![0; n];
    let mut ctx_work = vec![0; n];
    direct_freq.fill(-1);
    ctx_freq.fill(-1);
    let direct_rc = libsais_bwt(
        text,
        &mut direct_bwt,
        &mut direct_work,
        0,
        Some(&mut direct_freq),
    );
    let ctx_rc = libsais_bwt_ctx(
        &mut ctx,
        text,
        &mut ctx_bwt,
        &mut ctx_work,
        0,
        Some(&mut ctx_freq),
    );
    assert_eq!(direct_rc, ctx_rc);
    assert_eq!(ctx_bwt, direct_bwt);
    assert_eq!(ctx_freq, direct_freq);

    // BWT with auxiliary indexes, sampling rate 4, two aux slots.
    let mut direct_aux = vec![0; 2];
    let mut ctx_aux = vec![0; 2];
    direct_freq.fill(-1);
    ctx_freq.fill(-1);
    let direct_rc = libsais_bwt_aux(
        text,
        &mut direct_bwt,
        &mut direct_work,
        0,
        Some(&mut direct_freq),
        4,
        &mut direct_aux,
    );
    let ctx_rc = libsais_bwt_aux_ctx(
        &mut ctx,
        text,
        &mut ctx_bwt,
        &mut ctx_work,
        0,
        Some(&mut ctx_freq),
        4,
        &mut ctx_aux,
    );
    assert_eq!(direct_rc, ctx_rc);
    assert_eq!(ctx_bwt, direct_bwt);
    assert_eq!(ctx_aux, direct_aux);
    assert_eq!(ctx_freq, direct_freq);
}
15519
/// Checks that `libsais_unbwt_ctx` and `libsais_unbwt_aux_ctx`, sharing one
/// context from `unbwt_create_ctx`, agree with the direct inverse-BWT calls on
/// return code and decoded output, and that the output equals the original text.
#[test]
fn libsais_unbwt_ctx_frequency_wrappers_match_direct_calls() {
    let text = b"abracadabra";
    let n = text.len();

    // Forward BWT provides the input and the frequency table for both rounds.
    let mut freq = vec![0; ALPHABET_SIZE];
    let mut bwt = vec![0; n];
    let mut work = vec![0; n];
    let primary = libsais_bwt(text, &mut bwt, &mut work, 0, Some(&mut freq));
    assert!(primary >= 0);

    let mut ctx = unbwt_create_ctx().expect("unbwt context");
    let mut direct = vec![0; n];
    let mut direct_work = vec![0; n + 1];
    let mut via_ctx = vec![0; n];
    let mut ctx_work = vec![0; n + 1];

    // Round 1: inversion driven by the primary index.
    let direct_rc = libsais_unbwt(&bwt, &mut direct, &mut direct_work, Some(&freq), primary);
    let ctx_rc = libsais_unbwt_ctx(
        &mut ctx,
        &bwt,
        &mut via_ctx,
        &mut ctx_work,
        Some(&freq),
        primary,
    );
    assert_eq!(direct_rc, ctx_rc);
    assert_eq!(via_ctx, direct);
    assert_eq!(via_ctx, text);

    // Round 2: recompute the BWT with auxiliary indexes at sampling rate 4.
    let aux_len = (n - 1) / 4 + 1;
    let mut aux = vec![0; aux_len];
    let aux_rc = libsais_bwt_aux(text, &mut bwt, &mut work, 0, Some(&mut freq), 4, &mut aux);
    assert_eq!(aux_rc, 0);

    // Wipe the outputs so round 1 results cannot mask a failure.
    direct.fill(0);
    direct_work.fill(0);
    via_ctx.fill(0);
    ctx_work.fill(0);

    let direct_rc = libsais_unbwt_aux(&bwt, &mut direct, &mut direct_work, Some(&freq), 4, &aux);
    let ctx_rc = libsais_unbwt_aux_ctx(
        &mut ctx,
        &bwt,
        &mut via_ctx,
        &mut ctx_work,
        Some(&freq),
        4,
        &aux,
    );
    assert_eq!(direct_rc, ctx_rc);
    assert_eq!(via_ctx, direct);
    assert_eq!(via_ctx, text);
}
15572
/// Compares `libsais_int` on the integer text `[2, 1, 3, 1, 0]` (alphabet size
/// argument 4, no extra space) with a suffix array obtained by sorting all
/// suffix start positions lexicographically.
#[test]
fn libsais_int_matches_bruteforce_suffix_array_for_small_integer_text() {
    let mut t = vec![2, 1, 3, 1, 0];

    // Reference answer: every start index, ordered by comparing the suffixes
    // as slices. Computed before the call because `libsais_int` takes the
    // text mutably.
    let expected = {
        let mut order: Vec<SaSint> = (0..t.len())
            .map(|index| SaSint::try_from(index).expect("index must fit SaSint"))
            .collect();
        order.sort_by(|&a, &b| {
            t[usize::try_from(a).expect("non-negative")..]
                .cmp(&t[usize::try_from(b).expect("non-negative")..])
        });
        order
    };

    let mut sa = vec![0; t.len()];
    let rc = libsais_int(&mut t, &mut sa, 4, 0);

    assert_eq!(rc, 0);
    assert_eq!(sa, expected);
}
15593
/// Checks `libsais_plcp` on `"banana"` against the brute-force PLCP helper,
/// both driven by the brute-force suffix array.
#[test]
fn libsais_plcp_matches_bruteforce_for_small_text() {
    let text = b"banana";
    let suffix_array = brute_force_suffix_array_u8(text);
    let reference = brute_force_plcp_u8(text, &suffix_array);

    let mut plcp = vec![0; text.len()];
    let rc = libsais_plcp(text, &suffix_array, &mut plcp);

    assert_eq!(rc, 0);
    assert_eq!(plcp, reference);
}
15606
/// Checks that `libsais_plcp_gsa` reports a PLCP value of 0 at both `\0`
/// positions (indexes 2 and 4) of `"ab\0b\0"`.
#[test]
fn libsais_plcp_gsa_stops_at_separator() {
    let text = b"ab\0b\0";
    let suffix_array = brute_force_suffix_array_u8(text);

    let mut plcp = vec![0; text.len()];
    let rc = libsais_plcp_gsa(text, &suffix_array, &mut plcp);

    assert_eq!(rc, 0);
    // Indexes of the two separator bytes in `text`.
    for &separator_pos in &[2usize, 4] {
        assert_eq!(plcp[separator_pos], 0);
    }
}
15619
/// Checks `libsais_lcp` on `"banana"`: fed the brute-force PLCP and suffix
/// array, it must reproduce the brute-force LCP array.
#[test]
fn libsais_lcp_matches_bruteforce_for_small_text() {
    let text = b"banana";
    let suffix_array = brute_force_suffix_array_u8(text);
    let plcp = brute_force_plcp_u8(text, &suffix_array);
    let reference = brute_force_lcp_from_sa_u8(text, &suffix_array);

    let mut lcp = vec![0; text.len()];
    let rc = libsais_lcp(&plcp, &suffix_array, &mut lcp);

    assert_eq!(rc, 0);
    assert_eq!(lcp, reference);
}
15633
/// Feeds undersized buffers, unusual sampling rates and hand-built contexts to
/// the `*_ctx` entry points and pins the error code each call returns:
/// `-1` for the argument cases and `-2` for the context cases.
#[test]
fn libsais_ctx_rejects_invalid_public_arguments() {
    let text = b"banana";
    let n = text.len();
    let mut ctx = create_ctx().expect("context");

    // One-element-short and exactly-sized variants of each buffer.
    let mut short_sa = vec![0; n - 1];
    let mut full_sa = vec![0; n];
    let mut short_freq = vec![0; ALPHABET_SIZE - 1];
    let mut short_u = vec![0; n - 1];
    let mut full_u = vec![0; n];
    let mut short_a = vec![0; n - 1];
    let mut full_a = vec![0; n];
    let mut aux = vec![0; 2];

    // Suffix array shorter than the text.
    let rc = libsais_ctx(&mut ctx, text, &mut short_sa, 0, None);
    assert_eq!(rc, -1);

    // Frequency table one entry short of ALPHABET_SIZE.
    let rc = libsais_ctx(&mut ctx, text, &mut full_sa, 0, Some(&mut short_freq));
    assert_eq!(rc, -1);

    // GSA input that does not end with a `\0` byte.
    let rc = libsais_gsa_ctx(&mut ctx, b"banana", &mut full_sa, 0, None);
    assert_eq!(rc, -1);

    // GSA input with a terminator but an output shorter than the input.
    let rc = libsais_gsa_ctx(&mut ctx, b"banana\0", &mut short_sa, 0, None);
    assert_eq!(rc, -1);

    // BWT: short output buffer.
    let rc = libsais_bwt_ctx(&mut ctx, text, &mut short_u, &mut full_a, 0, None);
    assert_eq!(rc, -1);

    // BWT: short work buffer.
    let rc = libsais_bwt_ctx(&mut ctx, text, &mut full_u, &mut short_a, 0, None);
    assert_eq!(rc, -1);

    // BWT: short frequency table.
    let rc = libsais_bwt_ctx(
        &mut ctx,
        text,
        &mut full_u,
        &mut full_a,
        0,
        Some(&mut short_freq),
    );
    assert_eq!(rc, -1);

    // BWT-aux: sampling rate 0.
    let rc = libsais_bwt_aux_ctx(&mut ctx, text, &mut full_u, &mut full_a, 0, None, 0, &mut aux);
    assert_eq!(rc, -1);

    // BWT-aux: sampling rate 3.
    let rc = libsais_bwt_aux_ctx(&mut ctx, text, &mut full_u, &mut full_a, 0, None, 3, &mut aux);
    assert_eq!(rc, -1);

    // BWT-aux: sampling rate 4 but an empty aux slice.
    let rc = libsais_bwt_aux_ctx(&mut ctx, text, &mut full_u, &mut full_a, 0, None, 4, &mut []);
    assert_eq!(rc, -1);

    // Context that claims 2 threads but carries no per-thread state.
    let mut missing_thread_state_ctx = Context {
        buckets: vec![0; 8 * ALPHABET_SIZE],
        thread_state: None,
        threads: 2,
    };
    let rc = libsais_ctx(&mut missing_thread_state_ctx, text, &mut full_sa, 0, None);
    assert_eq!(rc, -2);

    // Context with a thread count of 0.
    let mut zero_thread_ctx = Context {
        buckets: vec![0; 8 * ALPHABET_SIZE],
        thread_state: None,
        threads: 0,
    };
    let rc = libsais_ctx(&mut zero_thread_ctx, text, &mut full_sa, 0, None);
    assert_eq!(rc, -2);

    // Context built for 2 threads whose thread-state list is cut to 1 entry.
    let mut short_thread_state_ctx = create_ctx_main(2).expect("context");
    short_thread_state_ctx
        .thread_state
        .as_mut()
        .expect("thread state")
        .truncate(1);
    let rc = libsais_ctx(&mut short_thread_state_ctx, text, &mut full_sa, 0, None);
    assert_eq!(rc, -2);
}
15750
/// Feeds undersized buffers, a zero primary index, unusual aux arguments and
/// hand-built contexts to the inverse-BWT `*_ctx` entry points and pins the
/// error code each call returns: `-1` for the argument cases and `-2` for the
/// context cases.
#[test]
fn libsais_unbwt_ctx_rejects_invalid_public_arguments() {
    let text = b"banana";
    let n = text.len();

    // A real BWT and primary index, so only the argument under test is wrong.
    let mut bwt = vec![0; n];
    let mut work = vec![0; n];
    let primary = libsais_bwt(text, &mut bwt, &mut work, 0, None);
    let mut ctx = unbwt_create_ctx().expect("context");

    let mut short_u = vec![0; n - 1];
    let mut full_u = vec![0; n];
    let mut short_a = vec![0; n - 1];
    let mut full_a = vec![0; n];
    let short_freq = vec![0; ALPHABET_SIZE - 1];
    let good_aux = vec![primary, 4];

    // Output buffer shorter than the input.
    let rc = libsais_unbwt_ctx(&mut ctx, &bwt, &mut short_u, &mut full_a, None, primary);
    assert_eq!(rc, -1);

    // Work buffer of n - 1 elements.
    let rc = libsais_unbwt_ctx(&mut ctx, &bwt, &mut full_u, &mut short_a, None, primary);
    assert_eq!(rc, -1);

    // Frequency table one entry short of ALPHABET_SIZE.
    let rc = libsais_unbwt_ctx(
        &mut ctx,
        &bwt,
        &mut full_u,
        &mut full_a,
        Some(&short_freq),
        primary,
    );
    assert_eq!(rc, -1);

    // Primary index 0.
    let rc = libsais_unbwt_ctx(&mut ctx, &bwt, &mut full_u, &mut full_a, None, 0);
    assert_eq!(rc, -1);

    // Aux variant: sampling rate 3.
    let rc = libsais_unbwt_aux_ctx(&mut ctx, &bwt, &mut full_u, &mut full_a, None, 3, &good_aux);
    assert_eq!(rc, -1);

    // Aux variant: sampling rate 4 with a single-entry aux slice.
    let rc = libsais_unbwt_aux_ctx(
        &mut ctx,
        &bwt,
        &mut full_u,
        &mut full_a,
        None,
        4,
        &[primary],
    );
    assert_eq!(rc, -1);

    // Context whose lookup tables are empty.
    let mut malformed_ctx = UnbwtContext {
        bucket2: Vec::new(),
        fastbits: Vec::new(),
        buckets: None,
        threads: 1,
    };
    let rc = libsais_unbwt_ctx(
        &mut malformed_ctx,
        &bwt,
        &mut full_u,
        &mut full_a,
        None,
        primary,
    );
    assert_eq!(rc, -2);

    // Context with full-size tables and 2 threads but no `buckets` storage.
    let mut missing_parallel_buckets_ctx = UnbwtContext {
        bucket2: vec![0; ALPHABET_SIZE * ALPHABET_SIZE],
        fastbits: vec![0; 1 + (1 << UNBWT_FASTBITS)],
        buckets: None,
        threads: 2,
    };
    let rc = libsais_unbwt_ctx(
        &mut missing_parallel_buckets_ctx,
        &bwt,
        &mut full_u,
        &mut full_a,
        None,
        primary,
    );
    assert_eq!(rc, -2);
}
15842
/// Checks the buffer lengths and thread count of a context created by
/// `unbwt_create_ctx_main(3)`.
#[test]
fn unbwt_create_ctx_main_allocates_expected_buffers() {
    let ctx = unbwt_create_ctx_main(3).expect("context");

    let pair_table_len = ALPHABET_SIZE * ALPHABET_SIZE;
    assert_eq!(ctx.bucket2.len(), pair_table_len);
    assert_eq!(ctx.fastbits.len(), 1 + (1 << UNBWT_FASTBITS));

    // One (ALPHABET_SIZE + ALPHABET_SIZE^2)-sized region per requested thread.
    let parallel_buckets = ctx.buckets.as_ref().expect("parallel buckets");
    assert_eq!(
        parallel_buckets.len(),
        3 * (ALPHABET_SIZE + ALPHABET_SIZE * ALPHABET_SIZE)
    );
    assert_eq!(ctx.threads, 3);
}
15854
/// Checks that `unbwt_compute_histogram` counts each byte of `"banana"`:
/// three `a`, one `b`, two `n`.
#[test]
fn unbwt_compute_histogram_counts_bytes() {
    let text = b"banana";
    let mut count = vec![0u32; ALPHABET_SIZE];

    unbwt_compute_histogram(text, text.len() as FastSint, &mut count);

    for &(symbol, occurrences) in &[(b'a', 3u32), (b'b', 1), (b'n', 2)] {
        assert_eq!(count[symbol as usize], occurrences);
    }
}
15864
/// Checks that `unbwt_transpose_bucket2` exchanges the entries at (row 2,
/// column 1) and (row 1, column 2) of the 256-wide `bucket2` table.
#[test]
fn unbwt_transpose_bucket2_swaps_matrix_entries() {
    // Flat indexes of the two mirrored cells; a row is 1 << 8 entries wide.
    let row2_col1: usize = (2 << 8) + 1;
    let row1_col2: usize = (1 << 8) + 2;

    let mut bucket2 = vec![0u32; ALPHABET_SIZE * ALPHABET_SIZE];
    bucket2[row2_col1] = 7;
    bucket2[row1_col2] = 9;

    unbwt_transpose_bucket2(&mut bucket2);

    assert_eq!(bucket2[row1_col2], 7);
    assert_eq!(bucket2[row2_col1], 9);
}
15874
/// Smoke test for `unbwt_init_single` on a 7-byte input with index list `[4]`:
/// afterwards `fastbits` and `p` must each hold at least one non-zero entry.
#[test]
fn unbwt_init_single_builds_monotone_fastbits_and_writes_psi() {
    let input = b"annb\x00aa";
    let n = input.len();
    let mut p = vec![0u32; n + 1];
    let mut bucket2 = vec![0u32; ALPHABET_SIZE * ALPHABET_SIZE];
    let mut fastbits = vec![0u16; 1 + (1 << UNBWT_FASTBITS)];
    let indexes = vec![4u32];

    unbwt_init_single(
        input,
        &mut p,
        n as SaSint,
        None,
        &indexes,
        &mut bucket2,
        &mut fastbits,
    );

    // NOTE(review): with ALPHABET_SIZE = 256 this bound is 65536, which every
    // u16 satisfies, so the check cannot fail; the test name promises a
    // monotonicity check that is not performed here.
    assert!(fastbits
        .iter()
        .all(|&value| usize::from(value) < ALPHABET_SIZE * ALPHABET_SIZE));
    assert!(fastbits.iter().any(|&value| value != 0));
    assert!(p.iter().any(|&value| value != 0));
}
15899
15900    #[test]
15901    fn unbwt_init_parallel_currently_matches_single_initializer() {
15902        let t = b"annb\x00aa";
15903        let mut p_single = vec![0u32; t.len() + 1];
15904        let mut p_parallel = vec![0u32; t.len() + 1];
15905        let mut bucket2_single = vec![0u32; ALPHABET_SIZE * ALPHABET_SIZE];
15906        let mut bucket2_parallel = vec![0u32; ALPHABET_SIZE * ALPHABET_SIZE];
15907        let mut fastbits_single = vec![0u16; 1 + (1 << UNBWT_FASTBITS)];
15908        let mut fastbits_parallel = vec![0u16; 1 + (1 << UNBWT_FASTBITS)];
15909        let i = vec![4u32];
15910        let mut scratch = vec![0u32; 2 * (ALPHABET_SIZE + ALPHABET_SIZE * ALPHABET_SIZE)];
15911
15912        unbwt_init_single(
15913            t,
15914            &mut p_single,
15915            t.len() as SaSint,
15916            None,
15917            &i,
15918            &mut bucket2_single,
15919            &mut fastbits_single,
15920        );
15921        unbwt_init_parallel(
15922            t,
15923            &mut p_parallel,
15924            t.len() as SaSint,
15925            None,
15926            &i,
15927            &mut bucket2_parallel,
15928            &mut fastbits_parallel,
15929            Some(&mut scratch),
15930            2,
15931        );
15932
15933        assert_eq!(p_parallel, p_single);
15934        assert_eq!(bucket2_parallel, bucket2_single);
15935        assert_eq!(fastbits_parallel, fastbits_single);
15936    }
15937
15938    #[test]
15939    fn unbwt_init_parallel_uses_block_partition_for_large_inputs() {
15940        let n = 70_003usize;
15941        let t: Vec<u8> = (0..n)
15942            .map(|i| i.wrapping_mul(37).wrapping_add(i >> 3) as u8)
15943            .collect();
15944        let i = [12_345u32];
15945
15946        let mut single_p = vec![0u32; n + 1];
15947        let mut threaded_p = vec![0u32; n + 1];
15948        let mut single_bucket2 = vec![0u32; ALPHABET_SIZE * ALPHABET_SIZE];
15949        let mut threaded_bucket2 = vec![0u32; ALPHABET_SIZE * ALPHABET_SIZE];
15950        let mut single_fastbits = vec![0u16; 1 + (1 << UNBWT_FASTBITS)];
15951        let mut threaded_fastbits = vec![0u16; 1 + (1 << UNBWT_FASTBITS)];
15952        let mut buckets = vec![0u32; 4 * (ALPHABET_SIZE + ALPHABET_SIZE * ALPHABET_SIZE)];
15953
15954        unbwt_init_single(
15955            &t,
15956            &mut single_p,
15957            n as SaSint,
15958            None,
15959            &i,
15960            &mut single_bucket2,
15961            &mut single_fastbits,
15962        );
15963        unbwt_init_parallel(
15964            &t,
15965            &mut threaded_p,
15966            n as SaSint,
15967            None,
15968            &i,
15969            &mut threaded_bucket2,
15970            &mut threaded_fastbits,
15971            Some(&mut buckets),
15972            4,
15973        );
15974
15975        assert_eq!(threaded_p, single_p);
15976        assert_eq!(threaded_bucket2, single_bucket2);
15977        assert_eq!(threaded_fastbits, single_fastbits);
15978    }
15979
15980    #[test]
15981    fn unbwt_decode_1_writes_big_endian_symbol_words() {
15982        let mut u = vec![0u8; 4];
15983        let p = vec![1u32, 0u32];
15984        let mut bucket2 = vec![0u32; ALPHABET_SIZE * ALPHABET_SIZE];
15985        bucket2[0x1234] = 0;
15986        bucket2[0x1235] = 2;
15987        let mut fastbits = vec![0u16; 1 + (1 << UNBWT_FASTBITS)];
15988        fastbits[0] = 0x1234;
15989        let mut i0 = 0usize;
15990
15991        unbwt_decode_1(&mut u, &p, &bucket2, &fastbits, 0, &mut i0, 2);
15992
15993        assert_eq!(u, vec![0x12, 0x35, 0x12, 0x35]);
15994        assert_eq!(i0, 0);
15995    }
15996
15997    #[test]
15998    fn unbwt_decode_dispatches_two_block_tail_shape() {
15999        let mut u = vec![0u8; 8];
16000        let p = vec![1u32, 0u32];
16001        let mut bucket2 = vec![0u32; ALPHABET_SIZE * ALPHABET_SIZE];
16002        bucket2[0x1234] = 0;
16003        bucket2[0x1235] = 2;
16004        let mut fastbits = vec![0u16; 1 + (1 << UNBWT_FASTBITS)];
16005        fastbits[0] = 0x1234;
16006        let i = vec![0u32, 0u32];
16007
16008        unbwt_decode(&mut u, &p, 4, 2, &i, &bucket2, &fastbits, 2, 2);
16009
16010        assert_eq!(u, vec![0x12, 0x35, 0x12, 0x35, 0x00, 0x00, 0x00, 0x00]);
16011    }
16012
16013    #[test]
16014    fn libsais_unbwt_aux_rejects_invalid_sampling_range() {
16015        let t = b"abc";
16016        let mut u = vec![0u8; t.len()];
16017        let mut a = vec![0i32; t.len()];
16018
16019        let result = libsais_unbwt_aux(t, &mut u, &mut a, None, 2, &[0, 4]);
16020
16021        assert_eq!(result, -1);
16022
16023        assert_eq!(libsais_unbwt_aux(t, &mut u, &mut a, None, 0, &[1]), -1);
16024
16025        let mut ctx = unbwt_create_ctx().expect("context");
16026        assert_eq!(
16027            libsais_unbwt_aux_ctx(&mut ctx, t, &mut u, &mut a, None, 0, &[1]),
16028            -1
16029        );
16030        assert_eq!(
16031            libsais_unbwt_aux_omp(t, &mut u, &mut a, None, 0, &[1], 2),
16032            -1
16033        );
16034    }
16035
16036    #[test]
16037    fn libsais_bwt_and_unbwt_round_trip_small_text() {
16038        let t = b"banana";
16039        let mut bwt = vec![0u8; t.len()];
16040        let mut a = vec![0i32; t.len()];
16041
16042        let primary = libsais_bwt(t, &mut bwt, &mut a, 0, None);
16043        assert!(primary > 0);
16044
16045        let mut restored = vec![0u8; t.len()];
16046        let result = libsais_unbwt(&bwt, &mut restored, &mut a, None, primary);
16047
16048        assert_eq!(result, 0);
16049        assert_eq!(restored, t);
16050    }
16051
16052    #[test]
16053    fn libsais_bwt_aux_and_unbwt_aux_round_trip_small_text() {
16054        let t = b"mississippi";
16055        let mut bwt = vec![0u8; t.len()];
16056        let mut a = vec![0i32; t.len()];
16057        let mut samples = vec![0i32; 4];
16058
16059        let result = libsais_bwt_aux(t, &mut bwt, &mut a, 0, None, 4, &mut samples);
16060        assert_eq!(result, 0);
16061
16062        let mut restored = vec![0u8; t.len()];
16063        let result = libsais_unbwt_aux(&bwt, &mut restored, &mut a, None, 4, &samples);
16064
16065        assert_eq!(result, 0);
16066        assert_eq!(restored, t);
16067    }
16068
16069    #[test]
16070    fn libsais_bwt_aux_and_unbwt_aux_omp_round_trip_small_text() {
16071        let t = b"mississippi";
16072        let mut bwt = vec![0u8; t.len()];
16073        let mut a = vec![0i32; t.len()];
16074        let mut samples = vec![0i32; 4];
16075
16076        let result = libsais_bwt_aux(t, &mut bwt, &mut a, 0, None, 4, &mut samples);
16077        assert_eq!(result, 0);
16078
16079        let mut restored = vec![0u8; t.len()];
16080        let result = libsais_unbwt_aux_omp(&bwt, &mut restored, &mut a, None, 4, &samples, 2);
16081
16082        assert_eq!(result, 0);
16083        assert_eq!(restored, t);
16084    }
16085
16086    #[test]
16087    fn real_world_round_trip_on_upstream_readme() {
16088        let t = include_bytes!("../libsais/README.md");
16089        let mut bwt = vec![0u8; t.len()];
16090        let mut a = vec![0i32; t.len()];
16091
16092        let primary = libsais_bwt(t, &mut bwt, &mut a, 0, None);
16093        assert!(primary > 0);
16094
16095        let mut restored = vec![0u8; t.len()];
16096        let result = libsais_unbwt(&bwt, &mut restored, &mut a, None, primary);
16097
16098        assert_eq!(result, 0);
16099        assert_eq!(restored, t);
16100    }
16101
16102    #[test]
16103    fn real_world_aux_omp_round_trip_on_upstream_c_source() {
16104        let t = include_bytes!("../libsais/src/libsais.c");
16105        let mut bwt = vec![0u8; t.len()];
16106        let mut a = vec![0i32; t.len()];
16107        let r = 128i32;
16108        let mut samples = vec![0i32; (t.len() - 1) / usize::try_from(r).expect("fits") + 1];
16109
16110        let result = libsais_bwt_aux(t, &mut bwt, &mut a, 0, None, r, &mut samples);
16111        assert_eq!(result, 0);
16112
16113        let mut restored = vec![0u8; t.len()];
16114        let result = libsais_unbwt_aux_omp(&bwt, &mut restored, &mut a, None, r, &samples, 2);
16115
16116        assert_eq!(result, 0);
16117        assert_eq!(restored, t);
16118    }
16119
16120    #[test]
16121    fn libsais_bwt_aux_rejects_undersized_sampling_array() {
16122        let t = b"upstream source text";
16123        let mut bwt = vec![0u8; t.len()];
16124        let mut a = vec![0i32; t.len()];
16125        let mut samples = vec![0i32; 1];
16126
16127        let result = libsais_bwt_aux(t, &mut bwt, &mut a, 0, None, 2, &mut samples);
16128
16129        assert_eq!(result, -1);
16130
16131        let result = libsais_bwt_aux(t, &mut bwt, &mut a, 0, None, 0, &mut samples);
16132
16133        assert_eq!(result, -1);
16134    }
16135
16136    #[test]
16137    fn libsais_bwt_aux_omp_rejects_invalid_sampling_rate_without_panicking() {
16138        let t = b"upstream source text";
16139        let mut bwt = vec![0u8; t.len()];
16140        let mut a = vec![0i32; t.len()];
16141        let mut samples = vec![0i32; 4];
16142
16143        let result = libsais_bwt_aux_omp(t, &mut bwt, &mut a, 0, None, 0, &mut samples, 2);
16144
16145        assert_eq!(result, -1);
16146    }
16147
16148    #[test]
16149    fn count_helpers_match_c_predicates() {
16150        let sa = [1, -1, 0, -3, 4, 0, -9];
16151        assert_eq!(
16152            count_negative_marked_suffixes(&sa, 0, sa.len() as FastSint),
16153            3
16154        );
16155        assert_eq!(count_zero_marked_suffixes(&sa, 0, sa.len() as FastSint), 2);
16156        assert_eq!(count_negative_marked_suffixes(&sa, 2, 3), 1);
16157        assert_eq!(count_zero_marked_suffixes(&sa, 2, 3), 1);
16158    }
16159
16160    #[test]
16161    fn flip_suffix_markers_omp_toggles_saint_min_bits() {
16162        let mut sa = vec![1, -2, 3, -4];
16163        flip_suffix_markers_omp(&mut sa, 4, 1);
16164        assert_eq!(
16165            sa,
16166            vec![1 ^ SAINT_MIN, -2 ^ SAINT_MIN, 3 ^ SAINT_MIN, -4 ^ SAINT_MIN]
16167        );
16168    }
16169
16170    #[test]
16171    fn flip_suffix_markers_omp_uses_block_partition_for_large_inputs() {
16172        let n = 65_600usize;
16173        let mut single: Vec<SaSint> = (0..n).map(|i| (i as SaSint) ^ SAINT_MIN).collect();
16174        let mut threaded = single.clone();
16175
16176        flip_suffix_markers_omp(&mut single, n as SaSint, 1);
16177        flip_suffix_markers_omp(&mut threaded, n as SaSint, 4);
16178
16179        assert_eq!(threaded, single);
16180    }
16181
16182    #[test]
16183    fn place_cached_suffixes_writes_indices_to_symbol_slots() {
16184        let mut sa = vec![0; 8];
16185        let cache = vec![
16186            ThreadCache {
16187                symbol: 2,
16188                index: 10,
16189            },
16190            ThreadCache {
16191                symbol: 5,
16192                index: 20,
16193            },
16194            ThreadCache {
16195                symbol: 1,
16196                index: 30,
16197            },
16198        ];
16199
16200        place_cached_suffixes(&mut sa, &cache, 0, cache.len() as FastSint);
16201
16202        assert_eq!(sa[2], 10);
16203        assert_eq!(sa[5], 20);
16204        assert_eq!(sa[1], 30);
16205    }
16206
16207    #[test]
16208    fn compact_and_place_cached_suffixes_discards_negative_symbols() {
16209        let mut sa = vec![0; 8];
16210        let mut cache = vec![
16211            ThreadCache {
16212                symbol: 2,
16213                index: 10,
16214            },
16215            ThreadCache {
16216                symbol: -1,
16217                index: 99,
16218            },
16219            ThreadCache {
16220                symbol: 5,
16221                index: 20,
16222            },
16223            ThreadCache {
16224                symbol: -4,
16225                index: 77,
16226            },
16227            ThreadCache {
16228                symbol: 1,
16229                index: 30,
16230            },
16231        ];
16232        let cache_len = cache.len() as FastSint;
16233
16234        compact_and_place_cached_suffixes(&mut sa, &mut cache, 0, cache_len);
16235
16236        assert_eq!(sa[2], 10);
16237        assert_eq!(sa[5], 20);
16238        assert_eq!(sa[1], 30);
16239        assert_eq!(
16240            cache[0],
16241            ThreadCache {
16242                symbol: 2,
16243                index: 10
16244            }
16245        );
16246        assert_eq!(
16247            cache[1],
16248            ThreadCache {
16249                symbol: 5,
16250                index: 20
16251            }
16252        );
16253        assert_eq!(
16254            cache[2],
16255            ThreadCache {
16256                symbol: 1,
16257                index: 30
16258            }
16259        );
16260    }
16261
16262    #[test]
16263    fn gather_lms_suffixes_32s_collects_expected_suffix_starts() {
16264        let t = vec![2, 1, 3, 1, 0];
16265        let mut sa = vec![0; t.len()];
16266        let m = gather_lms_suffixes_32s(&t, &mut sa, t.len() as SaSint);
16267        assert!(m >= 0);
16268        assert!(sa
16269            .iter()
16270            .all(|&value| value >= 0 && value <= t.len() as SaSint));
16271        assert!(sa[t.len() - 1] >= 1 && sa[t.len() - 1] <= t.len() as SaSint - 1);
16272    }
16273
16274    #[test]
16275    fn gather_compacted_lms_suffixes_32s_skips_negative_marked_symbols() {
16276        let t = vec![2, -1, 3, 1, 0];
16277        let mut sa = vec![0; t.len()];
16278        let m = gather_compacted_lms_suffixes_32s(&t, &mut sa, t.len() as SaSint);
16279        assert!(m >= 0);
16280        assert!(sa
16281            .iter()
16282            .all(|&value| value >= 0 && value <= t.len() as SaSint));
16283    }
16284
16285    #[test]
16286    fn count_lms_suffixes_32s_2k_counts_two_bucket_categories() {
16287        let t = vec![2, 1, 3, 1, 0];
16288        let mut buckets = vec![0; 2 * 4];
16289        count_lms_suffixes_32s_2k(&t, t.len() as SaSint, 4, &mut buckets);
16290        assert_eq!(buckets.iter().sum::<SaSint>(), t.len() as SaSint);
16291    }
16292
16293    #[test]
16294    fn count_lms_suffixes_32s_4k_counts_four_bucket_categories() {
16295        let t = vec![2, 1, 3, 1, 0];
16296        let mut buckets = vec![0; 4 * 4];
16297        count_lms_suffixes_32s_4k(&t, t.len() as SaSint, 4, &mut buckets);
16298        assert_eq!(buckets.iter().sum::<SaSint>(), t.len() as SaSint);
16299    }
16300
16301    #[test]
16302    fn count_compacted_lms_suffixes_32s_2k_masks_saint_bits() {
16303        let t = vec![2, SAINT_MIN | 1, 3, 1, 0];
16304        let mut buckets = vec![0; 2 * 4];
16305        count_compacted_lms_suffixes_32s_2k(&t, t.len() as SaSint, 4, &mut buckets);
16306        assert_eq!(buckets.iter().sum::<SaSint>(), t.len() as SaSint);
16307    }
16308
16309    #[test]
16310    fn count_and_gather_lms_suffixes_8u_updates_sa_and_buckets() {
16311        let t = vec![2_u8, 1, 3, 1, 0];
16312        let mut sa = vec![0; t.len()];
16313        let mut buckets = vec![0; 4 * ALPHABET_SIZE];
16314        let m = count_and_gather_lms_suffixes_8u(
16315            &t,
16316            &mut sa,
16317            t.len() as SaSint,
16318            &mut buckets,
16319            0,
16320            t.len() as FastSint,
16321        );
16322        assert_eq!(m, 1);
16323        assert_eq!(sa[t.len() - 1], 1);
16324        assert_eq!(buckets.iter().sum::<SaSint>(), t.len() as SaSint);
16325    }
16326
16327    #[test]
16328    fn get_bucket_stride_prefers_aligned_sizes_when_space_allows() {
16329        assert_eq!(get_bucket_stride(8192, 1000, 2), 1024);
16330        assert_eq!(get_bucket_stride(256, 17, 2), 32);
16331        assert_eq!(get_bucket_stride(8, 17, 2), 17);
16332    }
16333
16334    #[test]
16335    fn count_suffixes_32s_counts_symbol_histogram() {
16336        let t = vec![2, 1, 2, 3, 1, 0, 2];
16337        let mut buckets = vec![0; 4];
16338        count_suffixes_32s(&t, t.len() as SaSint, 4, &mut buckets);
16339        assert_eq!(buckets, vec![1, 2, 3, 1]);
16340    }
16341
16342    #[test]
16343    fn initialize_buckets_start_and_end_8u_sets_ranges_and_freq() {
16344        let mut buckets = vec![0; 8 * ALPHABET_SIZE];
16345        buckets[buckets_index4(0, 0)] = 1;
16346        buckets[buckets_index4(1, 1)] = 2;
16347        buckets[buckets_index4(2, 3)] = 3;
16348        let mut freq = vec![0; ALPHABET_SIZE];
16349        let k = initialize_buckets_start_and_end_8u(&mut buckets, Some(&mut freq));
16350        assert_eq!(k, 3);
16351        assert_eq!(freq[0], 1);
16352        assert_eq!(freq[1], 2);
16353        assert_eq!(freq[2], 3);
16354        assert_eq!(buckets[6 * ALPHABET_SIZE], 0);
16355        assert_eq!(buckets[7 * ALPHABET_SIZE], 1);
16356        assert_eq!(buckets[6 * ALPHABET_SIZE + 1], 1);
16357        assert_eq!(buckets[7 * ALPHABET_SIZE + 1], 3);
16358    }
16359
16360    #[test]
16361    fn initialize_buckets_start_and_end_32s_6k_sets_prefix_ranges() {
16362        let k = 3;
16363        let mut buckets = vec![0; 6 * k];
16364        buckets[buckets_index4(0, 0)] = 1;
16365        buckets[buckets_index4(0, 1)] = 2;
16366        buckets[buckets_index4(1, 2)] = 3;
16367        buckets[buckets_index4(2, 3)] = 4;
16368        initialize_buckets_start_and_end_32s_6k(k as SaSint, &mut buckets);
16369        assert_eq!(&buckets[4 * k..5 * k], &[0, 3, 6]);
16370        assert_eq!(&buckets[5 * k..6 * k], &[3, 6, 10]);
16371    }
16372
16373    #[test]
16374    fn initialize_buckets_start_and_end_32s_4k_sets_prefix_ranges() {
16375        let k = 3;
16376        let mut buckets = vec![0; 4 * k];
16377        buckets[buckets_index2(0, 0)] = 1;
16378        buckets[buckets_index2(0, 1)] = 2;
16379        buckets[buckets_index2(1, 0)] = 3;
16380        buckets[buckets_index2(2, 1)] = 4;
16381        initialize_buckets_start_and_end_32s_4k(k as SaSint, &mut buckets);
16382        assert_eq!(&buckets[2 * k..3 * k], &[0, 3, 6]);
16383        assert_eq!(&buckets[3 * k..4 * k], &[3, 6, 10]);
16384    }
16385
16386    #[test]
16387    fn initialize_buckets_end_32s_2k_rewrites_first_lanes_to_end_positions() {
16388        let k = 3;
16389        let mut buckets = vec![1, 2, 3, 4, 5, 6];
16390        initialize_buckets_end_32s_2k(k as SaSint, &mut buckets);
16391        assert_eq!(buckets[0], 3);
16392        assert_eq!(buckets[2], 10);
16393        assert_eq!(buckets[4], 21);
16394    }
16395
16396    #[test]
16397    fn initialize_buckets_start_and_end_32s_2k_copies_start_positions() {
16398        let k = 3;
16399        let mut buckets = vec![3, 2, 10, 4, 21, 6];
16400        initialize_buckets_start_and_end_32s_2k(k as SaSint, &mut buckets);
16401        assert_eq!(&buckets[..k], &[3, 10, 21]);
16402        assert_eq!(&buckets[k..2 * k], &[0, 3, 10]);
16403    }
16404
16405    #[test]
16406    fn initialize_buckets_start_32s_1k_builds_prefix_starts() {
16407        let mut buckets = vec![1, 2, 3];
16408        initialize_buckets_start_32s_1k(3, &mut buckets);
16409        assert_eq!(buckets, vec![0, 1, 3]);
16410    }
16411
16412    #[test]
16413    fn initialize_buckets_end_32s_1k_builds_prefix_ends() {
16414        let mut buckets = vec![1, 2, 3];
16415        initialize_buckets_end_32s_1k(3, &mut buckets);
16416        assert_eq!(buckets, vec![1, 3, 6]);
16417    }
16418
16419    #[test]
16420    fn initialize_buckets_for_lms_suffixes_radix_sort_8u_returns_total_lms_slots() {
16421        let t = vec![2_u8, 1, 3, 1, 0];
16422        let mut buckets = vec![0; 6 * ALPHABET_SIZE];
16423        buckets[buckets_index4(0, 1)] = 1;
16424        buckets[buckets_index4(1, 3)] = 2;
16425        let sum = initialize_buckets_for_lms_suffixes_radix_sort_8u(&t, &mut buckets, 4);
16426        assert!(sum >= 0);
16427    }
16428
16429    #[test]
16430    fn initialize_buckets_for_lms_suffixes_radix_sort_32s_2k_rewrites_two_lane_prefixes() {
16431        let t = vec![2, 1, 3, 1, 0];
16432        let mut buckets = vec![0; 2 * 4];
16433        initialize_buckets_for_lms_suffixes_radix_sort_32s_2k(&t, 4, &mut buckets, 4);
16434        assert!(buckets.iter().any(|&v| v != 0));
16435    }
16436
16437    #[test]
16438    fn initialize_buckets_for_lms_suffixes_radix_sort_32s_6k_returns_total_lms_slots() {
16439        let t = vec![2, 1, 3, 1, 0];
16440        let mut buckets = vec![0; 6 * 4];
16441        buckets[buckets_index4(0, 1)] = 1;
16442        buckets[buckets_index4(1, 3)] = 2;
16443        let sum = initialize_buckets_for_lms_suffixes_radix_sort_32s_6k(&t, 4, &mut buckets, 4);
16444        assert!(sum >= 0);
16445    }
16446
16447    #[test]
16448    fn initialize_buckets_for_radix_and_partial_sorting_32s_4k_sets_start_end_views() {
16449        let t = vec![2, 1, 3, 1, 0];
16450        let k = 4usize;
16451        let mut buckets = vec![0; 4 * k];
16452        buckets[buckets_index2(0, 0)] = 1;
16453        buckets[buckets_index2(0, 1)] = 2;
16454        buckets[buckets_index2(1, 0)] = 3;
16455        initialize_buckets_for_radix_and_partial_sorting_32s_4k(&t, k as SaSint, &mut buckets, 4);
16456        assert_eq!(buckets[2 * k], 0);
16457        assert!(buckets[3 * k] >= buckets[2 * k]);
16458    }
16459
16460    #[test]
16461    fn radix_sort_lms_suffixes_8u_places_suffixes_by_bucket() {
16462        let t = vec![1_u8, 0, 1, 0];
16463        let mut sa = vec![9, 9, 9, 9, 0, 1, 2, 3];
16464        let mut induction_bucket = vec![0; 2 * ALPHABET_SIZE];
16465        induction_bucket[buckets_index2(0, 0)] = 2;
16466        induction_bucket[buckets_index2(1, 0)] = 4;
16467        radix_sort_lms_suffixes_8u(&t, &mut sa, &mut induction_bucket, 4, 4);
16468        assert_eq!(&sa[..4], &[1, 3, 0, 2]);
16469    }
16470
16471    #[test]
16472    fn radix_sort_lms_suffixes_8u_omp_wraps_sequential_version() {
16473        let t = vec![9_u8, 1, 0, 1, 0];
16474        let mut sa = vec![9, 9, 9, 9, 9, 1, 2, 3, 4];
16475        let mut buckets = vec![0; 6 * ALPHABET_SIZE];
16476        buckets[4 * ALPHABET_SIZE + buckets_index2(0, 0)] = 2;
16477        buckets[4 * ALPHABET_SIZE + buckets_index2(1, 0)] = 4;
16478        let mut thread_state = alloc_thread_state(2).unwrap();
16479        radix_sort_lms_suffixes_8u_omp(&t, &mut sa, 9, 5, 0, &mut buckets, 2, &mut thread_state);
16480        assert_eq!(&sa[..4], &[2, 4, 1, 3]);
16481    }
16482
16483    #[test]
16484    fn radix_sort_lms_suffixes_8u_omp_uses_thread_state_for_large_inputs() {
16485        let m = 65_600usize;
16486        let n = 2 * m + 16;
16487        let start = n - m + 1;
16488        let t: Vec<u8> = (0..n).map(|i| (i % 4) as u8).collect();
16489        let suffixes: Vec<SaSint> = (0..m - 1).map(|i| i as SaSint).collect();
16490
16491        let mut buckets = vec![0; 6 * ALPHABET_SIZE];
16492        for &suffix in &suffixes {
16493            buckets[4 * ALPHABET_SIZE + buckets_index2(t[suffix as usize] as usize, 0)] += 1;
16494        }
16495        let mut sum = 0;
16496        for symbol in 0..ALPHABET_SIZE {
16497            let bucket = 4 * ALPHABET_SIZE + buckets_index2(symbol, 0);
16498            sum += buckets[bucket];
16499            buckets[bucket] = sum;
16500        }
16501
16502        let mut sa_single = vec![0; n];
16503        sa_single[start..start + suffixes.len()].copy_from_slice(&suffixes);
16504        let mut sa_threaded = sa_single.clone();
16505        let mut buckets_single = buckets.clone();
16506        let mut buckets_threaded = buckets;
16507        let mut thread_state = alloc_thread_state(4).unwrap();
16508        thread_state[3].m = m as FastSint;
16509
16510        radix_sort_lms_suffixes_8u_omp(
16511            &t,
16512            &mut sa_single,
16513            n as SaSint,
16514            m as SaSint,
16515            0,
16516            &mut buckets_single,
16517            1,
16518            &mut [],
16519        );
16520        radix_sort_lms_suffixes_8u_omp(
16521            &t,
16522            &mut sa_threaded,
16523            n as SaSint,
16524            m as SaSint,
16525            0,
16526            &mut buckets_threaded,
16527            4,
16528            &mut thread_state,
16529        );
16530
16531        assert_eq!(sa_threaded, sa_single);
16532    }
16533
16534    #[test]
16535    fn radix_sort_lms_suffixes_32s_6k_places_suffixes_by_bucket() {
16536        let t = vec![1, 0, 1, 0];
16537        let mut sa = vec![9, 9, 9, 9, 0, 1, 2, 3];
16538        let mut induction_bucket = vec![2, 4];
16539        radix_sort_lms_suffixes_32s_6k(&t, &mut sa, &mut induction_bucket, 4, 4);
16540        assert_eq!(&sa[..4], &[1, 3, 0, 2]);
16541    }
16542
16543    #[test]
16544    fn radix_sort_lms_suffixes_32s_2k_places_suffixes_by_bucket() {
16545        let t = vec![1, 0, 1, 0];
16546        let mut sa = vec![9, 9, 9, 9, 0, 1, 2, 3];
16547        let mut induction_bucket = vec![2, 0, 4, 0];
16548        radix_sort_lms_suffixes_32s_2k(&t, &mut sa, &mut induction_bucket, 4, 4);
16549        assert_eq!(&sa[..4], &[1, 3, 0, 2]);
16550    }
16551
16552    #[test]
16553    fn radix_sort_lms_suffixes_32s_6k_omp_wraps_sequential_version() {
16554        let t = vec![9, 1, 0, 1, 0];
16555        let mut sa = vec![9, 9, 9, 9, 9, 1, 2, 3, 4];
16556        let mut induction_bucket = vec![2, 4];
16557        let mut thread_state = alloc_thread_state(2).unwrap();
16558        radix_sort_lms_suffixes_32s_6k_omp(
16559            &t,
16560            &mut sa,
16561            9,
16562            5,
16563            &mut induction_bucket,
16564            2,
16565            &mut thread_state,
16566        );
16567        assert_eq!(&sa[..4], &[2, 4, 1, 3]);
16568    }
16569
16570    #[test]
16571    fn radix_sort_lms_suffixes_32s_2k_omp_wraps_sequential_version() {
16572        let t = vec![9, 1, 0, 1, 0];
16573        let mut sa = vec![9, 9, 9, 9, 9, 1, 2, 3, 4];
16574        let mut induction_bucket = vec![2, 0, 4, 0];
16575        let mut thread_state = alloc_thread_state(2).unwrap();
16576        radix_sort_lms_suffixes_32s_2k_omp(
16577            &t,
16578            &mut sa,
16579            9,
16580            5,
16581            &mut induction_bucket,
16582            2,
16583            &mut thread_state,
16584        );
16585        assert_eq!(&sa[..4], &[2, 4, 1, 3]);
16586    }
16587
16588    #[test]
16589    fn radix_sort_lms_suffixes_32s_block_omp_runs_cache_pipeline() {
16590        let t = vec![9, 1, 0, 1, 0];
16591        let mut sa_6k = vec![9, 9, 9, 9, 9, 1, 2, 3, 4];
16592        let mut bucket_6k = vec![2, 4];
16593        let mut cache = vec![ThreadCache::default(); 9];
16594        radix_sort_lms_suffixes_32s_6k_block_omp(
16595            &t,
16596            &mut sa_6k,
16597            &mut bucket_6k,
16598            &mut cache,
16599            5,
16600            4,
16601            2,
16602        );
16603        assert_eq!(&sa_6k[..4], &[2, 4, 1, 3]);
16604
16605        let mut sa_2k = vec![9, 9, 9, 9, 9, 1, 2, 3, 4];
16606        let mut bucket_2k = vec![2, 0, 4, 0];
16607        cache.fill(ThreadCache::default());
16608        radix_sort_lms_suffixes_32s_2k_block_omp(
16609            &t,
16610            &mut sa_2k,
16611            &mut bucket_2k,
16612            &mut cache,
16613            5,
16614            4,
16615            2,
16616        );
16617        assert_eq!(&sa_2k[..4], &[2, 4, 1, 3]);
16618    }
16619
16620    #[test]
16621    fn radix_sort_lms_suffixes_32s_omp_uses_block_pipeline_for_large_inputs() {
16622        let m = 65_600usize;
16623        let n = 2 * m + 16;
16624        let start = n - m + 1;
16625        let t: Vec<SaSint> = (0..n).map(|i| (i % 4) as SaSint).collect();
16626        let suffixes: Vec<SaSint> = (0..m - 1).map(|i| i as SaSint).collect();
16627
16628        let mut bucket_ends = vec![0; 4];
16629        for &suffix in &suffixes {
16630            bucket_ends[t[suffix as usize] as usize] += 1;
16631        }
16632        let mut sum = 0;
16633        for bucket in &mut bucket_ends {
16634            sum += *bucket;
16635            *bucket = sum;
16636        }
16637
16638        let mut sa_single = vec![0; n];
16639        sa_single[start..start + suffixes.len()].copy_from_slice(&suffixes);
16640        let mut sa_threaded = sa_single.clone();
16641        let mut bucket_single = bucket_ends.clone();
16642        let mut bucket_threaded = bucket_ends.clone();
16643        let mut thread_state = alloc_thread_state(4).unwrap();
16644
16645        radix_sort_lms_suffixes_32s_6k_omp(
16646            &t,
16647            &mut sa_single,
16648            n as SaSint,
16649            m as SaSint,
16650            &mut bucket_single,
16651            1,
16652            &mut [],
16653        );
16654        radix_sort_lms_suffixes_32s_6k_omp(
16655            &t,
16656            &mut sa_threaded,
16657            n as SaSint,
16658            m as SaSint,
16659            &mut bucket_threaded,
16660            4,
16661            &mut thread_state,
16662        );
16663        assert_eq!(sa_threaded, sa_single);
16664        assert_eq!(bucket_threaded, bucket_single);
16665
16666        let mut bucket_2k = vec![0; 8];
16667        for (symbol, &end) in bucket_ends.iter().enumerate() {
16668            bucket_2k[buckets_index2(symbol, 0)] = end;
16669        }
16670        let mut sa_single = vec![0; n];
16671        sa_single[start..start + suffixes.len()].copy_from_slice(&suffixes);
16672        let mut sa_threaded = sa_single.clone();
16673        let mut bucket_single = bucket_2k.clone();
16674        let mut bucket_threaded = bucket_2k;
16675
16676        radix_sort_lms_suffixes_32s_2k_omp(
16677            &t,
16678            &mut sa_single,
16679            n as SaSint,
16680            m as SaSint,
16681            &mut bucket_single,
16682            1,
16683            &mut [],
16684        );
16685        radix_sort_lms_suffixes_32s_2k_omp(
16686            &t,
16687            &mut sa_threaded,
16688            n as SaSint,
16689            m as SaSint,
16690            &mut bucket_threaded,
16691            4,
16692            &mut thread_state,
16693        );
16694        assert_eq!(sa_threaded, sa_single);
16695        assert_eq!(bucket_threaded, bucket_single);
16696    }
16697
16698    #[test]
16699    fn radix_sort_lms_suffixes_32s_1k_collects_lms_suffixes() {
16700        let t = vec![2, 1, 3, 1, 0];
16701        let mut sa = vec![0; t.len()];
16702        let mut buckets = vec![0, 2, 4, 5];
16703        let m = radix_sort_lms_suffixes_32s_1k(&t, &mut sa, t.len() as SaSint, &mut buckets);
16704        assert!(m >= 0);
16705    }
16706
16707    #[test]
16708    fn radix_sort_set_markers_32s_6k_marks_target_suffixes() {
16709        let mut sa = vec![0; 6];
16710        let induction_bucket = vec![1, 3, 5];
16711        radix_sort_set_markers_32s_6k(&mut sa, &induction_bucket, 0, 3);
16712        assert_eq!(sa[1], SAINT_MIN);
16713        assert_eq!(sa[3], SAINT_MIN);
16714        assert_eq!(sa[5], SAINT_MIN);
16715    }
16716
/// After `radix_sort_set_markers_32s_4k` runs over a bucket table holding
/// 1, 3 and 5 at its even positions, those suffix-array slots must equal
/// `SUFFIX_GROUP_MARKER`.
#[test]
fn radix_sort_set_markers_32s_4k_marks_target_suffixes() {
    let targets = vec![1, 0, 3, 0, 5, 0];
    let mut suffix_array = vec![0; 6];

    radix_sort_set_markers_32s_4k(&mut suffix_array, &targets, 0, 3);

    for &slot in &[1usize, 3, 5] {
        assert_eq!(suffix_array[slot], SUFFIX_GROUP_MARKER);
    }
}
16726
/// The `_omp` entry point, called with `4` and a trailing `2` (presumably `k`
/// and the thread count), must leave `SAINT_MIN` in slots 1, 3 and 5.
#[test]
fn radix_sort_set_markers_32s_6k_omp_wraps_sequential_version() {
    let targets = vec![1, 3, 5];
    let mut suffix_array = vec![0; 6];

    radix_sort_set_markers_32s_6k_omp(&mut suffix_array, 4, &targets, 2);

    for &slot in &[1usize, 3, 5] {
        assert_eq!(suffix_array[slot], SAINT_MIN);
    }
}
16736
/// The `_omp` entry point, called with `4` and a trailing `2` (presumably `k`
/// and the thread count), must leave `SUFFIX_GROUP_MARKER` in slots 1, 3 and 5.
#[test]
fn radix_sort_set_markers_32s_4k_omp_wraps_sequential_version() {
    let targets = vec![1, 0, 3, 0, 5, 0];
    let mut suffix_array = vec![0; 6];

    radix_sort_set_markers_32s_4k_omp(&mut suffix_array, 4, &targets, 2);

    for &slot in &[1usize, 3, 5] {
        assert_eq!(suffix_array[slot], SUFFIX_GROUP_MARKER);
    }
}
16746
/// For a 65,600-entry input, the 6k and 4k `_omp` marker setters must produce
/// the same suffix array whether their last argument is 1 or 4.
#[test]
fn radix_sort_set_markers_32s_omp_partitions_large_inputs() {
    let k = 65_600usize;

    // 6k layout: entry i of the bucket table is simply i.
    {
        let table: Vec<SaSint> = (0..k).map(|i| i as SaSint).collect();
        let mut reference = vec![0; k];
        let mut parallel = vec![0; k];
        radix_sort_set_markers_32s_6k_omp(&mut reference, k as SaSint, &table, 1);
        radix_sort_set_markers_32s_6k_omp(&mut parallel, k as SaSint, &table, 4);
        assert_eq!(parallel, reference);
    }

    // 4k layout: value i lives at `buckets_index2(i, 0)` in a table of 2 * k slots.
    {
        let mut table = vec![0; 2 * k];
        for symbol in 0..k {
            table[buckets_index2(symbol, 0)] = symbol as SaSint;
        }
        let mut reference = vec![0; k];
        let mut parallel = vec![0; k];
        radix_sort_set_markers_32s_4k_omp(&mut reference, k as SaSint, &table, 1);
        radix_sort_set_markers_32s_4k_omp(&mut parallel, k as SaSint, &table, 4);
        assert_eq!(parallel, reference);
    }
}
16767
/// Seeds two `buckets_index4` slots, runs
/// `initialize_buckets_for_partial_sorting_8u`, and checks lower bounds on
/// entries 0, 1 and `4 * ALPHABET_SIZE` of the bucket table.
#[test]
fn initialize_buckets_for_partial_sorting_8u_sets_start_and_distinct_views() {
    let text = vec![2_u8, 1, 3, 1, 0];
    let mut table = vec![0; 6 * ALPHABET_SIZE];
    table[buckets_index4(0, 0)] = 1;
    table[buckets_index4(0, 2)] = 2;

    initialize_buckets_for_partial_sorting_8u(&text, &mut table, 4, 3);

    let upper_offset = 4 * ALPHABET_SIZE;
    assert!(table[0] >= 4);
    assert!(table[1] >= 0);
    assert!(table[upper_offset] >= 4);
}
16779
/// Seeds three `buckets_index4` slots for `k = 4`, runs
/// `initialize_buckets_for_partial_sorting_32s_6k`, and checks that entries 0
/// and `4 * k` end up at least 4.
#[test]
fn initialize_buckets_for_partial_sorting_32s_6k_rewrites_bucket_views() {
    let k = 4usize;
    let text = vec![2, 1, 3, 1, 0];
    let mut table = vec![0; 6 * k];
    table[buckets_index4(0, 0)] = 1;
    table[buckets_index4(0, 1)] = 2;
    table[buckets_index4(1, 2)] = 3;

    initialize_buckets_for_partial_sorting_32s_6k(&text, k as SaSint, &mut table, 4, 3);

    let upper_offset = 4 * k;
    assert!(table[0] >= 4);
    assert!(table[upper_offset] >= 4);
}
16792
/// Smoke test: after a left-to-right 8u scan over two seeded entries, the
/// returned value is non-negative and the suffix array is not all zeros.
#[test]
fn partial_sorting_scan_left_to_right_8u_emits_induced_suffixes() {
    let text = vec![2_u8, 1, 3, 1, 0];
    let mut suffix_array = vec![2 | SAINT_MIN, 4, 0, 0, 0, 0];
    let mut table = vec![0; 6 * ALPHABET_SIZE];
    table[4 * ALPHABET_SIZE + buckets_index2(1, 0)] = 2;

    let result =
        partial_sorting_scan_left_to_right_8u(&text, &mut suffix_array, &mut table, 0, 0, 2);

    assert!(result >= 0);
    assert!(!suffix_array.iter().all(|&entry| entry == 0));
}
16803
/// Runs the `_omp` left-to-right 8u scan on an empty suffix array with a
/// two-entry thread state and expects a return value of at least 1.
#[test]
fn partial_sorting_scan_left_to_right_8u_omp_wraps_sequential_version() {
    let text = vec![2_u8, 1, 3, 1, 0];
    let mut suffix_array = vec![0; 8];
    let mut table = vec![0; 6 * ALPHABET_SIZE];
    table[4 * ALPHABET_SIZE + buckets_index2(0, 0)] = 1;
    let mut workers = alloc_thread_state(2).unwrap();

    let result = partial_sorting_scan_left_to_right_8u_omp(
        &text,
        &mut suffix_array,
        5,
        4,
        &mut table,
        0,
        0,
        2,
        &mut workers,
    );

    assert!(result >= 1);
}
16824
/// Smoke test: after a left-to-right 32s/6k scan over two seeded entries, the
/// returned value is non-negative and the suffix array is not all zeros.
#[test]
fn partial_sorting_scan_left_to_right_32s_6k_emits_induced_suffixes() {
    let text = vec![2, 1, 3, 1, 0];
    let mut suffix_array = vec![2 | SAINT_MIN, 4, 0, 0, 0, 0];
    let mut table = vec![0; 4 * 4];
    table[buckets_index4(1, 0)] = 2;

    let result =
        partial_sorting_scan_left_to_right_32s_6k(&text, &mut suffix_array, &mut table, 0, 0, 2);

    assert!(result >= 0);
    assert!(!suffix_array.iter().all(|&entry| entry == 0));
}
16835
/// Smoke test: after a left-to-right 32s/4k scan (`k = 4`) over two seeded
/// entries, the returned value is non-negative and the suffix array is not all
/// zeros.
#[test]
fn partial_sorting_scan_left_to_right_32s_4k_emits_induced_suffixes() {
    let k = 4usize;
    let text = vec![2, 1, 3, 1, 0];
    let mut suffix_array = vec![2 | SUFFIX_GROUP_MARKER, 4, 0, 0, 0, 0];
    let mut table = vec![0; 4 * k];
    table[2 * k + 1] = 2;

    let result = partial_sorting_scan_left_to_right_32s_4k(
        &text,
        &mut suffix_array,
        k as SaSint,
        &mut table,
        0,
        0,
        2,
    );

    assert!(result >= 0);
    assert!(!suffix_array.iter().all(|&entry| entry == 0));
}
16855
/// Smoke test: after a left-to-right 32s/1k scan over two seeded entries, the
/// suffix array is not all zeros.
#[test]
fn partial_sorting_scan_left_to_right_32s_1k_emits_induced_suffixes() {
    let text = vec![2, 1, 3, 1, 0];
    let mut suffix_array = vec![2, 4, 0, 0, 0, 0];
    let mut table = vec![0; 4];
    table[1] = 2;

    partial_sorting_scan_left_to_right_32s_1k(&text, &mut suffix_array, &mut table, 0, 2);

    assert!(!suffix_array.iter().all(|&entry| entry == 0));
}
16865
/// Runs the `_omp` left-to-right 32s/6k scan on zeroed inputs with a two-entry
/// thread state and expects a return value of at least 1.
#[test]
fn partial_sorting_scan_left_to_right_32s_6k_omp_wraps_sequential_version() {
    let text = vec![2, 1, 3, 1, 0];
    let mut suffix_array = vec![0; 8];
    let mut table = vec![0; 4 * 4];
    let mut workers = alloc_thread_state(2).unwrap();

    let result = partial_sorting_scan_left_to_right_32s_6k_omp(
        &text,
        &mut suffix_array,
        5,
        &mut table,
        0,
        0,
        2,
        &mut workers,
    );

    assert!(result >= 1);
}
16884
/// Runs the `_omp` left-to-right 32s/4k scan (`k = 4`) on zeroed inputs with a
/// two-entry thread state and expects a return value of at least 1.
#[test]
fn partial_sorting_scan_left_to_right_32s_4k_omp_wraps_sequential_version() {
    let k = 4usize;
    let text = vec![2, 1, 3, 1, 0];
    let mut suffix_array = vec![0; 8];
    let mut table = vec![0; 4 * k];
    let mut workers = alloc_thread_state(2).unwrap();

    let result = partial_sorting_scan_left_to_right_32s_4k_omp(
        &text,
        &mut suffix_array,
        5,
        k as SaSint,
        &mut table,
        0,
        2,
        &mut workers,
    );

    assert!(result >= 1);
}
16904
/// Runs the `_omp` left-to-right 32s/1k scan on zeroed inputs with a two-entry
/// thread state and expects the suffix array to be no longer all zeros.
#[test]
fn partial_sorting_scan_left_to_right_32s_1k_omp_wraps_sequential_version() {
    let text = vec![2, 1, 3, 1, 0];
    let mut suffix_array = vec![0; 8];
    let mut table = vec![0; 4];
    let mut workers = alloc_thread_state(2).unwrap();

    partial_sorting_scan_left_to_right_32s_1k_omp(
        &text,
        &mut suffix_array,
        5,
        &mut table,
        2,
        &mut workers,
    );

    assert!(!suffix_array.iter().all(|&entry| entry == 0));
}
16921
/// Gathering a one-entry block holding `2 | SAINT_MIN` must fill cache slot 0
/// with that index and the symbol `buckets_index4(1, 1)`.
#[test]
fn partial_sorting_scan_left_to_right_32s_6k_block_gather_records_bucket_symbols() {
    let text = vec![3, 1, 2, 0];
    let mut suffix_array = vec![2 | SAINT_MIN, 0, 0, 0];
    let mut scratch = vec![ThreadCache::default(); 1];

    partial_sorting_scan_left_to_right_32s_6k_block_gather(
        &text,
        &mut suffix_array,
        &mut scratch,
        0,
        1,
    );

    // `ThreadCache` has only these two fields, so comparing the whole struct
    // checks exactly the index and the symbol.
    let expected = ThreadCache {
        symbol: buckets_index4(1, 1) as SaSint,
        index: 2 | SAINT_MIN,
    };
    assert_eq!(scratch[0], expected);
}
16933
/// Gathering a one-entry block holding the positive value 2 must record
/// `symbol == 1`, `index == 1` in cache slot 0 and zero the suffix-array slot.
#[test]
fn partial_sorting_scan_left_to_right_32s_1k_block_gather_zeroes_positive_entries() {
    let text = vec![3, 1, 2, 0];
    let mut suffix_array = vec![2, 0, 0, 0];
    let mut scratch = vec![ThreadCache::default(); 1];

    partial_sorting_scan_left_to_right_32s_1k_block_gather(
        &text,
        &mut suffix_array,
        &mut scratch,
        0,
        1,
    );

    let expected = ThreadCache {
        symbol: 1,
        index: 1,
    };
    assert_eq!(scratch[0], expected);
    assert_eq!(suffix_array[0], 0);
}
16946
/// Compares the sequential left-to-right 32s/1k scan with the `_block_omp`
/// variant (last argument 4) on a 16,384-entry block starting at offset
/// 20,000; suffix array and bucket table must match afterwards.
#[test]
fn partial_sorting_scan_left_to_right_32s_1k_block_omp_uses_relative_cache() {
    let first = 20_000usize;
    let len = 16_384usize;
    let total = first + len + 8;
    let text = vec![1; total];

    // The block holds the consecutive values 2, 3, ..., len + 1.
    let block: Vec<SaSint> = (2..2 + len).map(|value| value as SaSint).collect();
    let mut sa_reference = vec![0; total];
    sa_reference[first..first + len].copy_from_slice(&block);
    let mut sa_parallel = sa_reference.clone();

    let mut table_reference = vec![0, 0];
    let mut table_parallel = table_reference.clone();
    let mut scratch = vec![ThreadCache::default(); 4 * LIBSAIS_PER_THREAD_CACHE_SIZE];

    partial_sorting_scan_left_to_right_32s_1k(
        &text,
        &mut sa_reference,
        &mut table_reference,
        first as FastSint,
        len as FastSint,
    );
    partial_sorting_scan_left_to_right_32s_1k_block_omp(
        &text,
        &mut sa_parallel,
        &mut table_parallel,
        &mut scratch,
        first as FastSint,
        len as FastSint,
        4,
    );

    assert_eq!(sa_parallel, sa_reference);
    assert_eq!(table_parallel, table_reference);
}
16982
/// Runs the 8u block-prepare step over two entries, stores the returned
/// `(position, count)` pair in a `ThreadState`, and checks that the count is at
/// least 1 and that one of the first `count` cache entries has a non-negative
/// symbol.
#[test]
fn partial_sorting_scan_left_to_right_8u_block_prepare_records_cache_and_counts() {
    let text = vec![2_u8, 1, 3, 1, 0];
    let suffix_array = vec![2 | SAINT_MIN, 4, 0, 0, 0, 0];
    let k = 4;
    let mut table = vec![0; 4 * k];
    let mut scratch = vec![ThreadCache::default(); 8];
    let mut worker = ThreadState::new();

    let (start, emitted) = partial_sorting_scan_left_to_right_8u_block_prepare(
        &text,
        &suffix_array,
        k as SaSint,
        &mut table,
        &mut scratch,
        0,
        2,
    );
    worker.position = start;
    worker.count = emitted;

    assert!(worker.count >= 1);
    let filled = &scratch[..scratch.len().min(worker.count as usize)];
    assert!(filled.iter().any(|slot| slot.symbol >= 0));
}
17008
/// Places two prepared cache entries and checks that at least one of the first
/// two suffix-array slots becomes non-zero.
#[test]
fn partial_sorting_scan_left_to_right_8u_block_place_writes_induced_values() {
    let mut suffix_array = vec![0; 8];
    let mut table = vec![0; 8];
    // Explicitly seed the two bucket entries the cache symbols refer to.
    table[..2].copy_from_slice(&[0, 1]);
    let prepared = vec![
        ThreadCache {
            symbol: 0,
            index: 3 | SAINT_MIN,
        },
        ThreadCache {
            symbol: 1,
            index: 5,
        },
    ];

    partial_sorting_scan_left_to_right_8u_block_place(
        &mut suffix_array,
        &mut table,
        2,
        &prepared,
        2,
        0,
    );

    assert!(suffix_array[..2].iter().any(|&entry| entry != 0));
}
17028
/// Smoke test: runs the 8u `_block_omp` scan over two seeded entries with a
/// two-entry thread state and expects a non-negative return value.
#[test]
fn partial_sorting_scan_left_to_right_8u_block_omp_wraps_sequential_version() {
    let text = vec![2_u8, 1, 3, 1, 0];
    let mut suffix_array = vec![2 | SAINT_MIN, 4, 0, 0, 0, 0];
    let mut table = vec![0; 6 * ALPHABET_SIZE];
    let mut workers = alloc_thread_state(2).unwrap();

    let result = partial_sorting_scan_left_to_right_8u_block_omp(
        &text,
        &mut suffix_array,
        4,
        &mut table,
        0,
        0,
        2,
        2,
        &mut workers,
    );

    assert!(result >= 0);
}
17048
/// After `partial_sorting_shift_markers_8u_omp` runs, at least one
/// suffix-array entry must have its `SAINT_MIN` bit clear.
#[test]
fn partial_sorting_shift_markers_8u_omp_toggles_segment_markers() {
    let mut suffix_array = vec![1 | SAINT_MIN, 2 | SAINT_MIN, 3, 4 | SAINT_MIN, 5];
    let n = suffix_array.len() as SaSint;
    let mut table = vec![0; 6 * ALPHABET_SIZE];
    table[4 * ALPHABET_SIZE + buckets_index2(1, 0)] = 5;
    table[buckets_index2(0, 0)] = 0;

    partial_sorting_shift_markers_8u_omp(&mut suffix_array, n, &table, 1);

    assert!(!suffix_array.iter().all(|&entry| (entry & SAINT_MIN) != 0));
}
17059
/// After `partial_sorting_shift_markers_32s_6k_omp` runs with `k = 3`, at
/// least one suffix-array entry must have its `SAINT_MIN` bit clear.
#[test]
fn partial_sorting_shift_markers_32s_6k_omp_toggles_segment_markers() {
    let k = 3usize;
    let mut suffix_array = vec![1 | SAINT_MIN, 2 | SAINT_MIN, 3, 4 | SAINT_MIN, 5];
    let mut table = vec![0; 6 * k];
    table[buckets_index4(1, 0)] = 5;
    table[4 * k + buckets_index2(0, 0)] = 0;

    partial_sorting_shift_markers_32s_6k_omp(&mut suffix_array, k as SaSint, &table, 1);

    assert!(!suffix_array.iter().all(|&entry| (entry & SAINT_MIN) != 0));
}
17070
/// After `partial_sorting_shift_markers_32s_4k` runs, at least one
/// suffix-array entry must have its `SUFFIX_GROUP_MARKER` bit clear.
#[test]
fn partial_sorting_shift_markers_32s_4k_toggles_group_markers() {
    let marked = |suffix: SaSint| suffix | SUFFIX_GROUP_MARKER;
    let mut suffix_array = vec![marked(1), marked(2), 3, marked(4)];
    let n = suffix_array.len() as SaSint;

    partial_sorting_shift_markers_32s_4k(&mut suffix_array, n);

    assert!(!suffix_array
        .iter()
        .all(|&entry| (entry & SUFFIX_GROUP_MARKER) != 0));
}
17083
/// Seeds the four entries starting at `4 * k` with 10..=13 and checks that
/// `partial_sorting_shift_buckets_32s_6k` leaves them in slots 0, 1, 4 and 5.
///
/// The seed is written with one `copy_from_slice` rather than four indexed
/// stores, which also removes the `4 * k + 0` no-op addition.
#[test]
fn partial_sorting_shift_buckets_32s_6k_moves_temp_bucket_view_into_main_slots() {
    let k = 3usize;
    let mut buckets = vec![0; 6 * k];
    let seed_start = 4 * k;
    buckets[seed_start..seed_start + 4].copy_from_slice(&[10, 11, 12, 13]);

    partial_sorting_shift_buckets_32s_6k(k as SaSint, &mut buckets);

    assert_eq!(buckets[0], 10);
    assert_eq!(buckets[1], 11);
    assert_eq!(buckets[4], 12);
    assert_eq!(buckets[5], 13);
}
17098
/// A right-to-left 8u scan over a single marked entry (suffix 4) must return
/// 1, write `3 | SAINT_MIN` into slot 1, lower the seeded bucket from 2 to 1,
/// and set the entry `2 * ALPHABET_SIZE` above it to 1.
#[test]
fn partial_sorting_scan_right_to_left_8u_emits_induced_suffixes() {
    let text = vec![0_u8, 1, 2, 1, 0];
    let mut suffix_array = vec![0, 0, 4 | SAINT_MIN];
    let mut table = vec![0; 4 * ALPHABET_SIZE];
    let slot = buckets_index2(1, 1);
    table[slot] = 2;

    let result =
        partial_sorting_scan_right_to_left_8u(&text, &mut suffix_array, &mut table, 0, 2, 1);

    assert_eq!(result, 1);
    assert_eq!(suffix_array[1], 3 | SAINT_MIN);
    assert_eq!(table[slot], 1);
    assert_eq!(table[2 * ALPHABET_SIZE + slot], 1);
}
17113
/// The GSA right-to-left 8u scan over a marked entry for suffix 2 of
/// `[1, 0, 0]` must return 1 and leave both the suffix array and the seeded
/// bucket untouched.
#[test]
fn partial_gsa_scan_right_to_left_8u_skips_separator_bucket() {
    let text = vec![1_u8, 0, 0];
    let mut suffix_array = vec![0, 2 | SAINT_MIN];
    let untouched = suffix_array.clone();
    let mut table = vec![0; 4 * ALPHABET_SIZE];
    let slot = buckets_index2(0, 1);
    table[slot] = 2;

    let result = partial_gsa_scan_right_to_left_8u(&text, &mut suffix_array, &mut table, 0, 1, 1);

    assert_eq!(result, 1);
    assert_eq!(suffix_array, untouched);
    assert_eq!(table[slot], 2);
}
17127
/// A right-to-left 32s/6k scan over a single marked entry (suffix 4) must
/// return 1, write `3 | SAINT_MIN` into slot 1, lower the seeded bucket from 2
/// to 1, and set the entry two slots after it to 1.
#[test]
fn partial_sorting_scan_right_to_left_32s_6k_emits_induced_suffixes() {
    let text = vec![0, 1, 2, 1, 0];
    let mut suffix_array = vec![0, 0, 4 | SAINT_MIN];
    let mut table = vec![0; 4 * 3];
    let slot = buckets_index4(1, 1);
    table[slot] = 2;

    let result =
        partial_sorting_scan_right_to_left_32s_6k(&text, &mut suffix_array, &mut table, 0, 2, 1);

    assert_eq!(result, 1);
    assert_eq!(suffix_array[1], 3 | SAINT_MIN);
    assert_eq!(table[slot], 1);
    assert_eq!(table[slot + 2], 1);
}
17142
/// The `_omp` right-to-left 32s/1k scan with a two-entry thread state must
/// write `3 | SAINT_MIN` into slot 1 and lower bucket 1 from 2 to 1.
#[test]
fn partial_sorting_scan_right_to_left_32s_1k_omp_wraps_sequential_version() {
    let text = vec![0, 1, 2, 1, 0];
    let mut suffix_array = vec![0, 0, 4];
    let mut table = vec![0; 3];
    table[1] = 2;
    let mut workers = alloc_thread_state(2).unwrap();

    partial_sorting_scan_right_to_left_32s_1k_omp(
        &text,
        &mut suffix_array,
        3,
        &mut table,
        2,
        &mut workers,
    );

    assert_eq!(suffix_array[1], 3 | SAINT_MIN);
    assert_eq!(table[1], 1);
}
17163
/// Gathering the one-entry block at offset 1 (`4 | SAINT_MIN`) must fill cache
/// slot 0 with that index and the symbol `buckets_index4(1, 1)`.
#[test]
fn partial_sorting_scan_right_to_left_32s_6k_block_gather_records_symbols() {
    let text = vec![0, 1, 2, 1, 0];
    let suffix_array = vec![0, 4 | SAINT_MIN, 0];
    let mut scratch = vec![ThreadCache::default(); suffix_array.len()];

    partial_sorting_scan_right_to_left_32s_6k_block_gather(
        &text,
        &suffix_array,
        &mut scratch,
        1,
        1,
    );

    // `ThreadCache` has only these two fields, so comparing the whole struct
    // checks exactly the index and the symbol.
    let expected = ThreadCache {
        symbol: buckets_index4(1, 1) as SaSint,
        index: 4 | SAINT_MIN,
    };
    assert_eq!(scratch[0], expected);
}
17175
/// Gathering the one-entry block at offset 1 (`4 | SUFFIX_GROUP_MARKER`) must
/// zero that suffix-array slot and fill cache slot 0 with the original value
/// and the symbol `buckets_index2(1, 1)`.
#[test]
fn partial_sorting_scan_right_to_left_32s_4k_block_gather_zeroes_positive_entries() {
    let text = vec![0, 1, 2, 1, 0];
    let mut suffix_array = vec![0, 4 | SUFFIX_GROUP_MARKER, 0];
    let mut scratch = vec![ThreadCache::default(); suffix_array.len()];

    partial_sorting_scan_right_to_left_32s_4k_block_gather(
        &text,
        &mut suffix_array,
        &mut scratch,
        1,
        1,
    );

    assert_eq!(suffix_array[1], 0);
    let expected = ThreadCache {
        symbol: buckets_index2(1, 1) as SaSint,
        index: 4 | SUFFIX_GROUP_MARKER,
    };
    assert_eq!(scratch[0], expected);
}
17188
/// Gathering the one-entry block at offset 1 (value 4) must zero that
/// suffix-array slot and fill cache slot 0 with index `3 | SAINT_MIN` and
/// symbol 1.
#[test]
fn partial_sorting_scan_right_to_left_32s_1k_block_gather_stores_preinduced_entries() {
    let text = vec![0, 1, 2, 1, 0];
    let mut suffix_array = vec![0, 4, 0];
    let mut scratch = vec![ThreadCache::default(); suffix_array.len()];

    partial_sorting_scan_right_to_left_32s_1k_block_gather(
        &text,
        &mut suffix_array,
        &mut scratch,
        1,
        1,
    );

    assert_eq!(suffix_array[1], 0);
    let expected = ThreadCache {
        symbol: 1,
        index: 3 | SAINT_MIN,
    };
    assert_eq!(scratch[0], expected);
}
17201
/// Block-sorting one pre-filled cache entry must return 1, rewrite the entry's
/// index to `3 | SAINT_MIN`, lower the seeded bucket from 2 to 1, and set the
/// entry two slots after it to 1.
#[test]
fn partial_sorting_scan_right_to_left_32s_6k_block_sort_updates_bucket_and_marker_state() {
    let text = vec![0, 1, 2, 1, 0];
    let slot = buckets_index4(1, 1);

    let mut scratch = vec![ThreadCache::default(); 3];
    scratch[0].index = 4 | SAINT_MIN;
    scratch[0].symbol = slot as SaSint;

    let mut table = vec![0; 4 * 3];
    table[slot] = 2;

    let result = partial_sorting_scan_right_to_left_32s_6k_block_sort(
        &text,
        &mut table,
        0,
        &mut scratch,
        1,
        1,
    );

    assert_eq!(result, 1);
    assert_eq!(scratch[0].index, 3 | SAINT_MIN);
    assert_eq!(table[slot], 1);
    assert_eq!(table[slot + 2], 1);
}
17225
/// The 32s/1k right-to-left `_block_omp` scan over the one-entry block at
/// offset 1 must leave `3 | SAINT_MIN` in slot 1 and lower bucket 1 from 2 to 1.
#[test]
fn partial_sorting_scan_right_to_left_32s_1k_block_omp_places_cached_suffixes() {
    let text = vec![0, 1, 2, 1, 0];
    let mut suffix_array = vec![0, 4, 0];
    let mut scratch = vec![ThreadCache::default(); suffix_array.len()];
    let mut table = vec![0; 3];
    table[1] = 2;

    partial_sorting_scan_right_to_left_32s_1k_block_omp(
        &text,
        &mut suffix_array,
        &mut table,
        &mut scratch,
        1,
        1,
        2,
    );

    assert_eq!(suffix_array[1], 3 | SAINT_MIN);
    assert_eq!(table[1], 1);
}
17247
/// Compares the sequential right-to-left 32s/1k scan with the `_block_omp`
/// variant (last argument 4) on a 16,384-entry block starting at offset
/// 20,000; suffix array and bucket table must match afterwards.
#[test]
fn partial_sorting_scan_right_to_left_32s_1k_block_omp_uses_relative_cache() {
    let first = 20_000usize;
    let len = 16_384usize;
    let total = first + len + 8;
    let text = vec![1; total];

    // The block holds the consecutive values 2, 3, ..., len + 1.
    let block: Vec<SaSint> = (2..2 + len).map(|value| value as SaSint).collect();
    let mut sa_reference = vec![0; total];
    sa_reference[first..first + len].copy_from_slice(&block);
    let mut sa_parallel = sa_reference.clone();

    let mut table_reference = vec![0, len as SaSint];
    let mut table_parallel = table_reference.clone();
    let mut scratch = vec![ThreadCache::default(); 4 * LIBSAIS_PER_THREAD_CACHE_SIZE];

    partial_sorting_scan_right_to_left_32s_1k(
        &text,
        &mut sa_reference,
        &mut table_reference,
        first as FastSint,
        len as FastSint,
    );
    partial_sorting_scan_right_to_left_32s_1k_block_omp(
        &text,
        &mut sa_parallel,
        &mut table_parallel,
        &mut scratch,
        first as FastSint,
        len as FastSint,
        4,
    );

    assert_eq!(sa_parallel, sa_reference);
    assert_eq!(table_parallel, table_reference);
}
17283
/// Gathering over `[1|M, -3, 5|M, -7]` (M = `SUFFIX_GROUP_MARKER`) must return
/// 2 and leave the two negative entries, each reduced by M, at the front.
#[test]
fn partial_sorting_gather_lms_suffixes_32s_4k_compacts_negative_marked_entries() {
    let mut suffix_array = vec![1 | SUFFIX_GROUP_MARKER, -3, 5 | SUFFIX_GROUP_MARKER, -7];
    let len = suffix_array.len() as FastSint;

    let end = partial_sorting_gather_lms_suffixes_32s_4k(&mut suffix_array, 0, len);

    assert_eq!(end, 2);
    // With SUFFIX_GROUP_MARKER == 1 << 30 these are -1073741827 and -1073741831.
    assert_eq!(suffix_array[0], -3 - SUFFIX_GROUP_MARKER);
    assert_eq!(suffix_array[1], -7 - SUFFIX_GROUP_MARKER);
}
17295
/// Gathering over `[1, -3, 5, -7]` must return 2 and leave `SAINT_MAX - 2` and
/// `SAINT_MAX - 6` (i.e. -3 and -7 with the sign bit cleared) at the front.
#[test]
fn partial_sorting_gather_lms_suffixes_32s_1k_compacts_negative_marked_entries() {
    let mut suffix_array = vec![1, -3, 5, -7];
    let len = suffix_array.len() as FastSint;

    let end = partial_sorting_gather_lms_suffixes_32s_1k(&mut suffix_array, 0, len);

    assert_eq!(end, 2);
    assert_eq!(suffix_array[..2], [SAINT_MAX - 2, SAINT_MAX - 6]);
}
17307
/// The `_omp` 4k gather with a two-entry thread state must leave the two
/// negative inputs, each reduced by `SUFFIX_GROUP_MARKER`, at the front.
#[test]
fn partial_sorting_gather_lms_suffixes_32s_4k_omp_wraps_sequential_version() {
    let mut suffix_array = vec![1 | SUFFIX_GROUP_MARKER, -3, 5 | SUFFIX_GROUP_MARKER, -7];
    let mut workers = alloc_thread_state(2).unwrap();

    partial_sorting_gather_lms_suffixes_32s_4k_omp(&mut suffix_array, 4, 2, &mut workers);

    // With SUFFIX_GROUP_MARKER == 1 << 30 these are -1073741827 and -1073741831.
    assert_eq!(suffix_array[0], -3 - SUFFIX_GROUP_MARKER);
    assert_eq!(suffix_array[1], -7 - SUFFIX_GROUP_MARKER);
}
17318
/// The `_omp` 1k gather with a two-entry thread state must leave
/// `SAINT_MAX - 2` and `SAINT_MAX - 6` at the front.
#[test]
fn partial_sorting_gather_lms_suffixes_32s_1k_omp_wraps_sequential_version() {
    let mut suffix_array = vec![1, -3, 5, -7];
    let mut workers = alloc_thread_state(2).unwrap();

    partial_sorting_gather_lms_suffixes_32s_1k_omp(&mut suffix_array, 4, 2, &mut workers);

    assert_eq!(suffix_array[..2], [SAINT_MAX - 2, SAINT_MAX - 6]);
}
17329
/// For 65,600-entry inputs, the 4k and 1k `_omp` gathers must produce the same
/// leading `count` entries (count = number of negative inputs) whether called
/// with 1 thread and no thread state or with 4 threads and a shared state.
#[test]
fn partial_sorting_gather_lms_suffixes_32s_omp_uses_block_partition() {
    let n = 65_600usize;
    let mut workers = alloc_thread_state(4).unwrap();

    // 4k variant: every value carries the group marker; every fifth is negative.
    let source_4k: Vec<SaSint> = (0..n)
        .map(|i| {
            let sign = if i % 5 == 0 { SAINT_MIN } else { 0 };
            (i as SaSint) | SUFFIX_GROUP_MARKER | sign
        })
        .collect();
    let negatives_4k = source_4k.iter().filter(|&&entry| entry < 0).count();

    let mut reference = source_4k.clone();
    let mut parallel = source_4k;
    partial_sorting_gather_lms_suffixes_32s_4k_omp(&mut reference, n as SaSint, 1, &mut []);
    partial_sorting_gather_lms_suffixes_32s_4k_omp(&mut parallel, n as SaSint, 4, &mut workers);
    assert_eq!(&parallel[..negatives_4k], &reference[..negatives_4k]);

    // 1k variant: plain indices; every seventh is negative.
    let source_1k: Vec<SaSint> = (0..n)
        .map(|i| {
            let sign = if i % 7 == 0 { SAINT_MIN } else { 0 };
            (i as SaSint) | sign
        })
        .collect();
    let negatives_1k = source_1k.iter().filter(|&&entry| entry < 0).count();

    let mut reference = source_1k.clone();
    let mut parallel = source_1k;
    partial_sorting_gather_lms_suffixes_32s_1k_omp(&mut reference, n as SaSint, 1, &mut []);
    partial_sorting_gather_lms_suffixes_32s_1k_omp(&mut parallel, n as SaSint, 4, &mut workers);
    assert_eq!(&parallel[..negatives_1k], &reference[..negatives_1k]);
}
17380
/// Renumbering `[1 | SAINT_MIN, 3]` with `m = 2` must return 1 and write
/// `SAINT_MIN` and `SAINT_MIN | 1` into slots 2 and 3.
#[test]
fn renumber_lms_suffixes_8u_writes_names_into_second_half() {
    let mut suffix_array = vec![1 | SAINT_MIN, 3, 0, 0];

    let last_name = renumber_lms_suffixes_8u(&mut suffix_array, 2, 0, 0, 2);

    assert_eq!(last_name, 1);
    assert_eq!(suffix_array[2..], [SAINT_MIN, SAINT_MIN | 1]);
}
17391
/// Differential test: the Rust `renumber_lms_suffixes_8u` and the C probe must
/// return the same name and leave identical arrays for the same input.
#[test]
fn renumber_lms_suffixes_8u_matches_upstream_c_helper() {
    let mut rust_sa = vec![1 | SAINT_MIN, 3, 0, 0];
    let mut c_sa = rust_sa.clone();

    let rust_result = renumber_lms_suffixes_8u(&mut rust_sa, 2, 0, 0, 2);
    // SAFETY: `c_sa` is a live four-element Vec for the whole call; presumably
    // the probe only touches indices below 2 * m = 4 — confirm against the C side.
    let c_result = unsafe { probe_renumber_lms_suffixes_8u(c_sa.as_mut_ptr(), 2, 0, 0, 2) };

    assert_eq!(rust_result, c_result);
    assert_eq!(rust_sa, c_sa);
}
17403
/// Gathering `[.., 1 | SAINT_MIN, 3]` must return 3 and leave the unmarked
/// value 1 in the last slot.
#[test]
fn gather_marked_lms_suffixes_moves_negative_marked_entries_to_tail() {
    let mut suffix_array = vec![0, 0, 1 | SAINT_MIN, 3];

    let tail_start = gather_marked_lms_suffixes(&mut suffix_array, 2, 4, 0, 2);

    assert_eq!(tail_start, 3);
    assert_eq!(suffix_array[3], 1);
}
17413
/// Differential test: the Rust `gather_marked_lms_suffixes` and the C probe
/// must return the same value and leave identical arrays for the same input.
#[test]
fn gather_marked_lms_suffixes_matches_upstream_c_helper() {
    let mut rust_sa = vec![0, 0, 1 | SAINT_MIN, 3];
    let mut c_sa = rust_sa.clone();

    let rust_result = gather_marked_lms_suffixes(&mut rust_sa, 2, 4, 0, 2);
    // SAFETY: `c_sa` is a live four-element Vec for the whole call; presumably
    // the probe stays within those four slots — confirm against the C side.
    let c_result = unsafe { probe_gather_marked_lms_suffixes(c_sa.as_mut_ptr(), 2, 4, 0, 2) };

    assert_eq!(rust_result, c_result);
    assert_eq!(rust_sa, c_sa);
}
17425
/// The `_omp` renumbering with a two-entry thread state must return 1 and
/// write `SAINT_MIN` into slot 2.
#[test]
fn renumber_lms_suffixes_8u_omp_wraps_sequential_version() {
    let mut suffix_array = vec![1 | SAINT_MIN, 3, 0, 0];
    let mut workers = alloc_thread_state(2).unwrap();

    let last_name = renumber_lms_suffixes_8u_omp(&mut suffix_array, 2, 2, &mut workers);

    assert_eq!(last_name, 1);
    assert_eq!(suffix_array[2], SAINT_MIN);
}
17436
/// For `m = 65,600`, the sequential renumbering and the 4-thread `_omp`
/// variant must return the same name and leave identical arrays.
#[test]
fn renumber_lms_suffixes_8u_omp_uses_block_partition_for_large_inputs() {
    let m = 65_600usize;

    // First half: odd suffixes 1, 3, 5, ...; every fifth carries SAINT_MIN.
    let mut source = vec![0; 2 * m];
    for (i, entry) in source[..m].iter_mut().enumerate() {
        let sign = if i % 5 == 0 { SAINT_MIN } else { 0 };
        *entry = ((2 * i + 1) as SaSint) | sign;
    }

    let mut reference = source.clone();
    let mut parallel = source;
    let mut workers = alloc_thread_state(4).unwrap();

    let reference_name = renumber_lms_suffixes_8u(&mut reference, m as SaSint, 0, 0, m as FastSint);
    let parallel_name = renumber_lms_suffixes_8u_omp(&mut parallel, m as SaSint, 4, &mut workers);

    assert_eq!(parallel_name, reference_name);
    assert_eq!(parallel, reference);
}
17460
/// For `n = 131,200`, the sequential gather and the 4-thread `_omp` variant
/// must agree on the last `marked` entries, where `marked` is the number of
/// negative values in the first half of the input.
#[test]
fn gather_marked_lms_suffixes_omp_uses_block_partition_for_large_inputs() {
    let n = 131_200usize;
    let half = n >> 1;

    // First half: suffixes 1, 4, 7, ...; every seventh carries SAINT_MIN.
    // The second half keeps the -77 filler.
    let mut source = vec![-77; n];
    for (i, entry) in source[..half].iter_mut().enumerate() {
        let sign = if i % 7 == 0 { SAINT_MIN } else { 0 };
        *entry = ((3 * i + 1) as SaSint) | sign;
    }
    let marked = source[..half].iter().filter(|&&entry| entry < 0).count();

    let mut reference = source.clone();
    let mut parallel = source;
    let mut workers = alloc_thread_state(4).unwrap();

    let _ = gather_marked_lms_suffixes(&mut reference, 0, n as FastSint, 0, half as FastSint);
    gather_marked_lms_suffixes_omp(&mut parallel, n as SaSint, 0, 0, 4, &mut workers);

    let tail = n - marked;
    assert_eq!(&parallel[tail..], &reference[tail..]);
}
17484
/// For `m = 65,600` and `n = 2 * m`, `renumber_and_gather_lms_suffixes_omp`
/// must return the same name and leave identical arrays with 1 and 4 threads.
#[test]
fn renumber_and_gather_lms_suffixes_omp_uses_large_input_paths() {
    let m = 65_600usize;
    let n = 2 * m;

    // First half: odd suffixes 1, 3, 5, ...; every fifth carries SAINT_MIN.
    let mut source = vec![0; n];
    for (i, entry) in source[..m].iter_mut().enumerate() {
        let sign = if i % 5 == 0 { SAINT_MIN } else { 0 };
        *entry = ((2 * i + 1) as SaSint) | sign;
    }

    let mut reference = source.clone();
    let mut parallel = source;
    let mut one_worker = alloc_thread_state(1).unwrap();
    let mut four_workers = alloc_thread_state(4).unwrap();

    let reference_name = renumber_and_gather_lms_suffixes_omp(
        &mut reference,
        n as SaSint,
        m as SaSint,
        0,
        1,
        &mut one_worker,
    );
    let parallel_name = renumber_and_gather_lms_suffixes_omp(
        &mut parallel,
        n as SaSint,
        m as SaSint,
        0,
        4,
        &mut four_workers,
    );

    assert_eq!(parallel_name, reference_name);
    assert_eq!(parallel, reference);
}
17523
/// On `[1 | SAINT_MIN, 3, 0, 0]` with `n = 4`, `m = 2`, the combined
/// renumber-and-gather must return 1 and leave 1 in the last slot.
#[test]
fn renumber_and_gather_lms_suffixes_omp_gathers_when_names_are_not_distinct() {
    let mut suffix_array = vec![1 | SAINT_MIN, 3, 0, 0];
    let mut workers = alloc_thread_state(2).unwrap();

    let last_name =
        renumber_and_gather_lms_suffixes_omp(&mut suffix_array, 4, 2, 0, 2, &mut workers);

    assert_eq!(last_name, 1);
    assert_eq!(suffix_array[3], 1);
}
17534
/// Differential test: the Rust `renumber_and_gather_lms_suffixes_omp` and the
/// C probe must return the same name and leave identical arrays.
#[test]
fn renumber_and_gather_lms_suffixes_omp_matches_upstream_c_helper() {
    let mut rust_sa = vec![1 | SAINT_MIN, 3, 0, 0];
    let mut c_sa = rust_sa.clone();
    let mut workers = alloc_thread_state(2).unwrap();

    let rust_result = renumber_and_gather_lms_suffixes_omp(&mut rust_sa, 4, 2, 0, 2, &mut workers);
    // SAFETY: `c_sa` is a live four-element Vec for the whole call; presumably
    // the probe stays within n = 4 slots — confirm against the C side.
    let c_result =
        unsafe { probe_renumber_and_gather_lms_suffixes_omp(c_sa.as_mut_ptr(), 4, 2, 0, 2) };

    assert_eq!(rust_result, c_result);
    assert_eq!(rust_sa, c_sa);
}
17549
/// Starting from two sign-tagged entries followed by two zeros, the call must
/// return 3, strip the tags from the first half and fill the second half with
/// `1` and a sign-tagged `2`.
#[test]
fn renumber_distinct_lms_suffixes_32s_4k_masks_sources_and_writes_second_half() {
    let mut suffixes = vec![SAINT_MIN | 1, SAINT_MIN | 3, 0, 0];

    let next_name = renumber_distinct_lms_suffixes_32s_4k(&mut suffixes, 2, 1, 0, 2);

    assert_eq!(next_name, 3);
    // All four slots are pinned, so one whole-array comparison covers them.
    assert_eq!(suffixes, [1, 3, 1, SAINT_MIN | 2]);
}
17562
/// Runs the Rust function and the C probe on identical copies of the same
/// array and requires matching return values and array contents.
#[test]
fn renumber_distinct_lms_suffixes_32s_4k_matches_upstream_c_helper() {
    let seed: [SaSint; 4] = [SAINT_MIN | 1, SAINT_MIN | 3, 0, 0];
    let mut rust_sa = seed.to_vec();
    let mut c_sa = seed.to_vec();

    let from_rust = renumber_distinct_lms_suffixes_32s_4k(&mut rust_sa, 2, 1, 0, 2);
    // SAFETY: `c_sa` is a live, uniquely borrowed Vec of four elements.
    // NOTE(review): the probe's contract is declared outside this excerpt —
    // confirm it stays within those four elements for these arguments.
    let from_c =
        unsafe { probe_renumber_distinct_lms_suffixes_32s_4k(c_sa.as_mut_ptr(), 2, 1, 0, 2) };

    assert_eq!(from_rust, from_c);
    assert_eq!(rust_sa, c_sa);
}
17575
/// After the call, slots 2..5 must read `5`, `0` and a sign-tagged `7`.
#[test]
fn mark_distinct_lms_suffixes_32s_propagates_previous_nonzero_marker() {
    let mut suffixes = vec![0, 0, SAINT_MIN | 5, 0, SAINT_MIN | 7];

    mark_distinct_lms_suffixes_32s(&mut suffixes, 2, 0, 3);

    // Only the last three slots are pinned by this test.
    assert_eq!(&suffixes[2..], &[5, 0, SAINT_MIN | 7]);
}
17586
/// After the call, the sign-tagged entries keep their low bits (`5`, `3`)
/// while the untagged `7` becomes `0`.
#[test]
fn clamp_lms_suffixes_length_32s_keeps_only_negative_lengths() {
    let mut lengths = vec![0, 0, SAINT_MIN | 5, 7, SAINT_MIN | 3];

    clamp_lms_suffixes_length_32s(&mut lengths, 2, 0, 3);

    // Only the last three slots are pinned by this test.
    assert_eq!(&lengths[2..], &[5, 0, 3]);
}
17597
/// The call must return 2 and leave `1` and a sign-tagged `2` in the second
/// half of the four-entry array.
#[test]
fn renumber_and_mark_distinct_lms_suffixes_32s_4k_omp_marks_second_half_when_names_repeat() {
    let mut state = alloc_thread_state(2).unwrap();
    let mut suffixes = vec![SAINT_MIN | 1, SAINT_MIN | 3, 0, 0];

    let names =
        renumber_and_mark_distinct_lms_suffixes_32s_4k_omp(&mut suffixes, 4, 2, 2, &mut state);

    assert_eq!(names, 2);
    assert_eq!(&suffixes[2..], &[1, SAINT_MIN | 2]);
}
17610
/// Runs the Rust function and the C probe on identical copies of the same
/// array and requires matching return values and array contents.
#[test]
fn renumber_and_mark_distinct_lms_suffixes_32s_4k_omp_matches_upstream_c_helper() {
    let seed: [SaSint; 4] = [SAINT_MIN | 1, SAINT_MIN | 3, 0, 0];
    let mut rust_sa = seed.to_vec();
    let mut c_sa = seed.to_vec();
    let mut state = alloc_thread_state(2).unwrap();

    let from_rust =
        renumber_and_mark_distinct_lms_suffixes_32s_4k_omp(&mut rust_sa, 4, 2, 2, &mut state);
    // SAFETY: `c_sa` is a live, uniquely borrowed Vec of four elements.
    // NOTE(review): the probe's declaration is outside this excerpt; the `4`
    // is presumably the element count it may access — confirm.
    let from_c = unsafe {
        probe_renumber_and_mark_distinct_lms_suffixes_32s_4k_omp(c_sa.as_mut_ptr(), 4, 2, 2)
    };

    assert_eq!(from_rust, from_c);
    assert_eq!(rust_sa, c_sa);
}
17631
/// The first three slots hold indices 0, 1, 2; after the call they must hold
/// the values stored in the last three slots (7, 11, 13).
#[test]
fn reconstruct_lms_suffixes_maps_indices_from_tail_interval() {
    let mut suffixes = vec![0, 1, 2, 7, 11, 13];

    reconstruct_lms_suffixes(&mut suffixes, 6, 3, 0, 3);

    let head = &suffixes[..3];
    assert_eq!(head, &[7, 11, 13]);
}
17640
/// The two-thread entry point must turn the leading indices 0, 1, 2 into the
/// values 7, 11, 13 taken from the tail of the array.
#[test]
fn reconstruct_lms_suffixes_omp_wraps_sequential_version() {
    let mut suffixes = vec![0, 1, 2, 7, 11, 13];

    reconstruct_lms_suffixes_omp(&mut suffixes, 6, 3, 2);

    let head = &suffixes[..3];
    assert_eq!(head, &[7, 11, 13]);
}
17649
/// Compares the four-thread entry point against the sequential function on a
/// 131,200-entry array; both must produce the same array.
#[test]
fn reconstruct_lms_suffixes_omp_uses_block_partition_for_large_inputs() {
    let lms_count = 65_600usize;
    let total = 2 * lms_count;

    // First half: indices in descending order (lms_count - 1 down to 0).
    // Second half: the values 3, 20, 37, ... (17 * i + 3).
    let descending = (0..lms_count).rev().map(|index| index as SaSint);
    let strided = (0..lms_count).map(|i| (i * 17 + 3) as SaSint);
    let mut sequential: Vec<SaSint> = descending.chain(strided).collect();
    let mut parallel = sequential.clone();

    reconstruct_lms_suffixes(
        &mut sequential,
        total as SaSint,
        lms_count as SaSint,
        0,
        lms_count as FastSint,
    );
    reconstruct_lms_suffixes_omp(&mut parallel, total as SaSint, lms_count as SaSint, 4);

    assert_eq!(parallel, sequential);
}
17669
/// For the three-symbol text `[2, 1, 0]` and a zeroed array, the call must
/// return 1 and store a sign-tagged `1` in slot 1.
#[test]
fn renumber_and_mark_distinct_lms_suffixes_32s_1k_omp_handles_single_lms_suffix() {
    let text = vec![2, 1, 0];
    let mut suffixes = vec![0; text.len()];

    let names = renumber_and_mark_distinct_lms_suffixes_32s_1k_omp(&text, &mut suffixes, 3, 1, 1);

    assert_eq!(names, 1);
    assert_eq!(suffixes[1], SAINT_MIN | 1);
}
17680
/// Delegates to the shared branch helper with the value 300.
// NOTE(review): per the test name, 300 presumably selects the 6k branch; the
// helper is defined outside this excerpt — confirm.
#[test]
fn libsais_main_32s_entry_matches_upstream_c_on_6k_branch() {
    let branch_value = 300;
    assert_main_32s_entry_matches_upstream_c_for_branch(branch_value);
}
17685
/// Delegates to the shared branch helper with the value 400.
// NOTE(review): per the test name, 400 presumably selects the 4k branch; the
// helper is defined outside this excerpt — confirm.
#[test]
fn libsais_main_32s_entry_matches_upstream_c_on_4k_branch() {
    let branch_value = 400;
    assert_main_32s_entry_matches_upstream_c_for_branch(branch_value);
}
17690
/// Delegates to the shared branch helper with the value 700.
// NOTE(review): per the test name, 700 presumably selects the 2k branch; the
// helper is defined outside this excerpt — confirm.
#[test]
fn libsais_main_32s_entry_matches_upstream_c_on_2k_branch() {
    let branch_value = 700;
    assert_main_32s_entry_matches_upstream_c_for_branch(branch_value);
}
17695
/// Delegates to the shared branch helper with the value 1501.
// NOTE(review): per the test name, 1501 presumably selects the 1k branch; the
// helper is defined outside this excerpt — confirm.
#[test]
fn libsais_main_32s_entry_matches_upstream_c_on_1k_branch() {
    let branch_value = 1501;
    assert_main_32s_entry_matches_upstream_c_for_branch(branch_value);
}
17700
/// Builds a text with `make_recursive_main_32s_text(24)` and hands it to the
/// shared comparison helper with the arguments `300, 0, true`.
// NOTE(review): the meaning of the helper's trailing arguments is not visible
// in this excerpt.
#[test]
fn libsais_main_32s_entry_matches_upstream_c_on_recursive_repetitive_6k_case() {
    let text = make_recursive_main_32s_text(24);
    assert_main_32s_entry_matches_upstream_c(text, 300, 0, true);
}
17705
/// Builds a text with `make_recursive_main_32s_text(24)` and hands it to the
/// shared comparison helper with the arguments `1501, 0, true`.
// NOTE(review): the meaning of the helper's trailing arguments is not visible
// in this excerpt.
#[test]
fn libsais_main_32s_entry_matches_upstream_c_on_recursive_repetitive_1k_case() {
    let text = make_recursive_main_32s_text(24);
    assert_main_32s_entry_matches_upstream_c(text, 1501, 0, true);
}
17710
/// Generates a stress text with `make_large_main_32s_stress_text(1024, 300)`
/// and passes it to the shared comparison helper with `300, 0, true`.
// NOTE(review): the meaning of the helper's trailing arguments is not visible
// in this excerpt.
#[test]
fn libsais_main_32s_entry_matches_upstream_c_on_large_generated_6k_case() {
    let text = make_large_main_32s_stress_text(1024, 300);
    assert_main_32s_entry_matches_upstream_c(text, 300, 0, true);
}
17720
/// Generates a stress text with `make_large_main_32s_stress_text(1024, 300)`
/// and passes it to the shared comparison helper with `300, 2048, false`.
// NOTE(review): 2048 is presumably the "fs" amount named in the test; the
// helper's signature is not visible in this excerpt — confirm.
#[test]
fn libsais_main_32s_entry_matches_upstream_c_on_large_generated_6k_case_with_fs() {
    let text = make_large_main_32s_stress_text(1024, 300);
    assert_main_32s_entry_matches_upstream_c(text, 300, 2048, false);
}
17730
/// Generates a stress text with `make_large_main_32s_stress_text(1024, 400)`
/// and passes it to the shared comparison helper with `400, 0, true`.
// NOTE(review): the meaning of the helper's trailing arguments is not visible
// in this excerpt.
#[test]
fn libsais_main_32s_entry_matches_upstream_c_on_large_generated_4k_case() {
    let text = make_large_main_32s_stress_text(1024, 400);
    assert_main_32s_entry_matches_upstream_c(text, 400, 0, true);
}
17740
/// Generates a stress text with `make_large_main_32s_stress_text(1024, 400)`
/// and passes it to the shared comparison helper with `400, 2048, false`.
// NOTE(review): 2048 is presumably the "fs" amount named in the test; the
// helper's signature is not visible in this excerpt — confirm.
#[test]
fn libsais_main_32s_entry_matches_upstream_c_on_large_generated_4k_case_with_fs() {
    let text = make_large_main_32s_stress_text(1024, 400);
    assert_main_32s_entry_matches_upstream_c(text, 400, 2048, false);
}
17750
/// Generates a stress text with `make_large_main_32s_stress_text(1024, 700)`
/// and passes it to the shared comparison helper with `700, 0, true`.
// NOTE(review): the meaning of the helper's trailing arguments is not visible
// in this excerpt.
#[test]
fn libsais_main_32s_entry_matches_upstream_c_on_large_generated_2k_case() {
    let text = make_large_main_32s_stress_text(1024, 700);
    assert_main_32s_entry_matches_upstream_c(text, 700, 0, true);
}
17760
/// Generates a stress text with `make_large_main_32s_stress_text(1024, 700)`
/// and passes it to the shared comparison helper with `700, 2048, false`.
// NOTE(review): 2048 is presumably the "fs" amount named in the test; the
// helper's signature is not visible in this excerpt — confirm.
#[test]
fn libsais_main_32s_entry_matches_upstream_c_on_large_generated_2k_case_with_fs() {
    let text = make_large_main_32s_stress_text(1024, 700);
    assert_main_32s_entry_matches_upstream_c(text, 700, 2048, false);
}
17770
/// Generates a stress text with `make_large_main_32s_stress_text(1024, 1501)`
/// and passes it to the shared comparison helper with `1501, 2048, false`.
// NOTE(review): 2048 is presumably the "fs" amount named in the test; the
// helper's signature is not visible in this excerpt — confirm.
#[test]
fn libsais_main_32s_entry_matches_upstream_c_on_large_generated_1k_case_with_fs() {
    let text = make_large_main_32s_stress_text(1024, 1501);
    assert_main_32s_entry_matches_upstream_c(text, 1501, 2048, false);
}
17780
/// With a 4*k bucket table (k = 3) filled as below, the call must turn
/// `[10, 11, 12, 13, 14]` into `[0, 0, 0, 0, 14]`.
#[test]
fn place_lms_suffixes_interval_32s_4k_moves_suffixes_into_bucket_intervals() {
    let alphabet = 3usize;
    let mut bucket_table = vec![0; 4 * alphabet];
    // Entries addressed through `buckets_index2(symbol, 1)`.
    bucket_table[buckets_index2(0, 1)] = 0;
    bucket_table[buckets_index2(1, 1)] = 2;
    bucket_table[buckets_index2(2, 1)] = 3;
    // Entries in the last k-sized quarter of the table.
    let last_quarter = 3 * alphabet;
    bucket_table[last_quarter] = 2;
    bucket_table[last_quarter + 1] = 5;

    let mut suffixes = vec![10, 11, 12, 13, 14];
    place_lms_suffixes_interval_32s_4k(&mut suffixes, 5, alphabet as SaSint, 5, &bucket_table);

    assert_eq!(suffixes, vec![0, 0, 0, 0, 14]);
}
17796
/// With a 2*3 bucket table filled as below, the call must turn
/// `[10, 11, 12, 13, 14]` into `[0, 0, 0, 0, 14]`.
#[test]
fn place_lms_suffixes_interval_32s_2k_moves_suffixes_into_bucket_intervals() {
    let mut bucket_table = vec![0; 2 * 3];
    // Slot 0 of each symbol's pair.
    bucket_table[buckets_index2(0, 0)] = 2;
    bucket_table[buckets_index2(1, 0)] = 5;
    bucket_table[buckets_index2(2, 0)] = 5;
    // Slot 1 of each symbol's pair.
    bucket_table[buckets_index2(0, 1)] = 0;
    bucket_table[buckets_index2(1, 1)] = 2;
    bucket_table[buckets_index2(2, 1)] = 3;

    let mut suffixes = vec![10, 11, 12, 13, 14];
    place_lms_suffixes_interval_32s_2k(&mut suffixes, 5, 3, 5, &bucket_table);

    assert_eq!(suffixes, vec![0, 0, 0, 0, 14]);
}
17812
/// For text `[0, 1, 1, 2, 2]` and bucket table `[0, 2, 5]`, the call must turn
/// `[1, 2, 3, 4, 99]` into `[1, 2, 0, 3, 4]`.
#[test]
fn place_lms_suffixes_interval_32s_1k_places_suffixes_by_symbol_bucket() {
    let text = vec![0, 1, 1, 2, 2];
    let bucket_table = vec![0, 2, 5];
    let mut suffixes = vec![1, 2, 3, 4, 99];

    place_lms_suffixes_interval_32s_1k(&text, &mut suffixes, 3, 4, &bucket_table);

    let expected = vec![1, 2, 0, 3, 4];
    assert_eq!(suffixes, expected);
}
17823
/// Scanning one entry starting at index 0 must leave `0` in `sa[0]` and `1`
/// in the first induction bucket.
#[test]
fn final_bwt_scan_left_to_right_8u_rewrites_sa_and_induces_suffixes() {
    let text = vec![0_u8, 1, 2, 1, 0];
    let mut bucket_heads = vec![0, 1, 3];
    let mut suffixes = vec![1, 0, 0];

    final_bwt_scan_left_to_right_8u(&text, &mut suffixes, &mut bucket_heads, 0, 1);

    assert_eq!(suffixes[0], 0);
    assert_eq!(bucket_heads[0], 1);
}
17835
/// Scanning one entry starting at index 0 must write `1` into the first slot
/// of the auxiliary output array.
#[test]
fn final_bwt_aux_scan_left_to_right_8u_updates_sampling_array() {
    let text = vec![0_u8, 1, 2, 1, 0];
    let mut bucket_heads = vec![0, 1, 3];
    let mut suffixes = vec![1, 0, 0];
    let mut samples = vec![0; 2];

    // NOTE(review): the third argument (0) sits where the block_omp variant
    // in this file also passes 0; its meaning is not visible in this excerpt.
    final_bwt_aux_scan_left_to_right_8u(
        &text,
        &mut suffixes,
        0,
        &mut samples,
        &mut bucket_heads,
        0,
        1,
    );

    assert_eq!(samples[0], 1);
}
17855
/// Scanning one entry starting at index 0 must leave `0` in `sa[0]` and `1`
/// in the first induction bucket.
#[test]
fn final_sorting_scan_left_to_right_8u_clears_marker_and_places_suffix() {
    let text = vec![0_u8, 1, 2, 1, 0];
    let mut bucket_heads = vec![0, 1, 3];
    let mut suffixes = vec![1, 0, 0];

    final_sorting_scan_left_to_right_8u(&text, &mut suffixes, &mut bucket_heads, 0, 1);

    assert_eq!(suffixes[0], 0);
    assert_eq!(bucket_heads[0], 1);
}
17867
/// Same scenario as the 8-bit variant but with an integer text: scanning one
/// entry must leave `0` in `sa[0]` and `1` in the first induction bucket.
#[test]
fn final_sorting_scan_left_to_right_32s_clears_marker_and_places_suffix() {
    let text = vec![0, 1, 2, 1, 0];
    let mut bucket_heads = vec![0, 1, 3];
    let mut suffixes = vec![1, 0, 0];

    final_sorting_scan_left_to_right_32s(&text, &mut suffixes, &mut bucket_heads, 0, 1);

    assert_eq!(suffixes[0], 0);
    assert_eq!(bucket_heads[0], 1);
}
17879
/// Preparing a two-entry block must return a count of 2, rewrite both SA
/// entries, set the first two bucket counters to 1 (the table starts at 99
/// everywhere) and record symbol/index pairs in the first two cache slots.
#[test]
fn final_bwt_scan_left_to_right_8u_block_prepare_records_cache_and_counts() {
    let text = vec![0_u8, 1, 2, 1, 0];
    let mut suffixes = vec![1, 2, 0];
    let mut counters = vec![99; ALPHABET_SIZE];
    let mut cache_slots = vec![ThreadCache::default(); 4];

    let cached = final_bwt_scan_left_to_right_8u_block_prepare(
        &text,
        &mut suffixes,
        ALPHABET_SIZE as SaSint,
        &mut counters,
        &mut cache_slots,
        0,
        2,
    );

    assert_eq!(cached, 2);

    // SA entries: the first is compared with the sign bit masked off, the
    // second must carry it.
    assert_eq!(suffixes[0] & SAINT_MAX, 0);
    assert_eq!(suffixes[1], SAINT_MIN | 1);

    // Bucket counters for symbols 0 and 1.
    assert_eq!(counters[0], 1);
    assert_eq!(counters[1], 1);

    // Cache entries: symbol exact, index compared with the sign bit masked.
    let (first, second) = (cache_slots[0], cache_slots[1]);
    assert_eq!(first.symbol, 0);
    assert_eq!(first.index & SAINT_MAX, 0);
    assert_eq!(second.symbol, 1);
    assert_eq!(second.index & SAINT_MAX, 1);
}
17907
/// Processing a two-entry block with two threads must leave 0 and 1 (sign bit
/// masked) in the first two SA slots and advance the first two buckets to 1
/// and 2.
#[test]
fn final_sorting_scan_left_to_right_32s_block_omp_places_cached_suffixes() {
    let text = vec![0, 1, 2, 1, 0];
    let mut bucket_heads = vec![0, 1, 3];
    let mut suffixes = vec![1, 2, 0, 0];
    let mut cache_slots = vec![ThreadCache::default(); LIBSAIS_PER_THREAD_CACHE_SIZE];

    final_sorting_scan_left_to_right_32s_block_omp(
        &text,
        &mut suffixes,
        &mut bucket_heads,
        &mut cache_slots,
        0,
        2,
        2,
    );

    assert_eq!(suffixes[0] & SAINT_MAX, 0);
    assert_eq!(suffixes[1] & SAINT_MAX, 1);
    assert_eq!(bucket_heads[0], 1);
    assert_eq!(bucket_heads[1], 2);
}
17930
/// Runs the entry point with one thread (and an empty thread-state slice) and
/// with two threads on identical inputs; SA and buckets must match.
#[test]
fn final_sorting_scan_left_to_right_8u_omp_wraps_sequential_behavior() {
    let text = vec![0_u8, 1, 2, 1, 0];
    let text_len = text.len() as FastSint;
    let alphabet = ALPHABET_SIZE as SaSint;

    // Reference run: one thread, no thread state.
    let mut reference_sa = vec![0; text.len()];
    let mut reference_bucket = vec![0, 1, 3];
    final_sorting_scan_left_to_right_8u_omp(
        &text,
        &mut reference_sa,
        text_len,
        alphabet,
        &mut reference_bucket,
        1,
        &mut [],
    );

    // Run under test: two threads with allocated thread state.
    let mut threaded_sa = vec![0; text.len()];
    let mut threaded_bucket = vec![0, 1, 3];
    let mut state = alloc_thread_state(2).unwrap();
    final_sorting_scan_left_to_right_8u_omp(
        &text,
        &mut threaded_sa,
        text_len,
        alphabet,
        &mut threaded_bucket,
        2,
        &mut state,
    );

    assert_eq!(threaded_sa, reference_sa);
    assert_eq!(threaded_bucket, reference_bucket);
}
17964
/// Compares the four-thread block variant with the sequential scan over a
/// 16,384-entry block that starts at index 20,000; SA and buckets must match.
#[test]
fn final_sorting_scan_left_to_right_8u_block_omp_uses_thread_buckets() {
    let first = 20_000usize;
    let span = 16_384usize;
    let total = first + span + 8;
    let text = vec![1_u8; total];

    // The block holds the consecutive values 2, 3, 4, ...; the rest is zero.
    let mut reference_sa: Vec<SaSint> = vec![0; total];
    for (offset, slot) in reference_sa[first..first + span].iter_mut().enumerate() {
        *slot = (offset + 2) as SaSint;
    }
    let mut parallel_sa = reference_sa.clone();

    let mut reference_bucket = vec![0; ALPHABET_SIZE];
    let mut parallel_bucket = reference_bucket.clone();

    final_sorting_scan_left_to_right_8u(
        &text,
        &mut reference_sa,
        &mut reference_bucket,
        first as FastSint,
        span as FastSint,
    );

    let mut state = alloc_thread_state(4).unwrap();
    final_sorting_scan_left_to_right_8u_block_omp(
        &text,
        &mut parallel_sa,
        ALPHABET_SIZE as SaSint,
        &mut parallel_bucket,
        first as FastSint,
        span as FastSint,
        4,
        &mut state,
    );

    assert_eq!(parallel_sa, reference_sa);
    assert_eq!(parallel_bucket, reference_bucket);
}
18001
/// Compares the four-thread BWT block variant with the sequential BWT scan
/// over a 16,384-entry block that starts at index 20,000; SA and buckets must
/// match.
#[test]
fn final_bwt_left_to_right_8u_block_omp_uses_thread_buckets() {
    let first = 20_000usize;
    let span = 16_384usize;
    let total = first + span + 8;
    let text = vec![1_u8; total];

    // The block holds the consecutive values 2, 3, 4, ...; the rest is zero.
    let mut reference_sa: Vec<SaSint> = vec![0; total];
    for (offset, slot) in reference_sa[first..first + span].iter_mut().enumerate() {
        *slot = (offset + 2) as SaSint;
    }
    let mut parallel_sa = reference_sa.clone();

    let mut reference_bucket = vec![0; ALPHABET_SIZE];
    let mut parallel_bucket = reference_bucket.clone();

    final_bwt_scan_left_to_right_8u(
        &text,
        &mut reference_sa,
        &mut reference_bucket,
        first as FastSint,
        span as FastSint,
    );

    let mut state = alloc_thread_state(4).unwrap();
    final_bwt_scan_left_to_right_8u_block_omp(
        &text,
        &mut parallel_sa,
        ALPHABET_SIZE as SaSint,
        &mut parallel_bucket,
        first as FastSint,
        span as FastSint,
        4,
        &mut state,
    );

    assert_eq!(parallel_sa, reference_sa);
    assert_eq!(parallel_bucket, reference_bucket);
}
18038
/// Compares the four-thread BWT-aux block variant with the sequential scan
/// over a 16,384-entry block that starts at index 20,000; SA, auxiliary
/// output and buckets must all match.
#[test]
fn final_bwt_aux_left_to_right_8u_block_omp_uses_thread_buckets() {
    let first = 20_000usize;
    let span = 16_384usize;
    let total = first + span + 8;
    let text = vec![1_u8; total];

    // The block holds the consecutive values 2, 3, 4, ...; the rest is zero.
    let mut reference_sa: Vec<SaSint> = vec![0; total];
    for (offset, slot) in reference_sa[first..first + span].iter_mut().enumerate() {
        *slot = (offset + 2) as SaSint;
    }
    let mut parallel_sa = reference_sa.clone();

    let mut reference_aux = vec![0; total];
    let mut parallel_aux = vec![0; total];
    let mut reference_bucket = vec![0; ALPHABET_SIZE];
    let mut parallel_bucket = reference_bucket.clone();

    final_bwt_aux_scan_left_to_right_8u(
        &text,
        &mut reference_sa,
        0,
        &mut reference_aux,
        &mut reference_bucket,
        first as FastSint,
        span as FastSint,
    );

    let mut state = alloc_thread_state(4).unwrap();
    final_bwt_aux_scan_left_to_right_8u_block_omp(
        &text,
        &mut parallel_sa,
        ALPHABET_SIZE as SaSint,
        0,
        &mut parallel_aux,
        &mut parallel_bucket,
        first as FastSint,
        span as FastSint,
        4,
        &mut state,
    );

    assert_eq!(parallel_sa, reference_sa);
    assert_eq!(parallel_aux, reference_aux);
    assert_eq!(parallel_bucket, reference_bucket);
}
18082
/// Scanning two entries starting at index 0 must return 0, leave `1` in
/// `sa[1]` and lower the second induction bucket to 1.
#[test]
fn final_bwt_scan_right_to_left_8u_returns_zero_index_and_induces_suffixes() {
    let text = vec![0_u8, 1, 2, 1, 0];
    let mut bucket_tails = vec![1, 2, 3];
    let mut suffixes = vec![0, 2, 0];

    let returned = final_bwt_scan_right_to_left_8u(&text, &mut suffixes, &mut bucket_tails, 0, 2);

    assert_eq!(returned, 0);
    assert_eq!(suffixes[1], 1);
    assert_eq!(bucket_tails[1], 1);
}
18095
/// Compares the two-thread cached block variant with the sequential scan on
/// the same two-entry block; SA and buckets must match.
#[test]
fn final_sorting_scan_right_to_left_32s_block_omp_runs_block_pipeline() {
    let text = vec![0, 1, 2, 1, 0];

    let mut reference_sa = vec![0, 2, 0, 0];
    let mut reference_bucket = vec![1, 2, 3];
    let mut block_sa = reference_sa.clone();
    let mut block_bucket = reference_bucket.clone();

    final_sorting_scan_right_to_left_32s(&text, &mut reference_sa, &mut reference_bucket, 0, 2);

    let mut cache_slots = vec![ThreadCache::default(); LIBSAIS_PER_THREAD_CACHE_SIZE];
    final_sorting_scan_right_to_left_32s_block_omp(
        &text,
        &mut block_sa,
        &mut block_bucket,
        &mut cache_slots,
        0,
        2,
        2,
    );

    assert_eq!(block_sa, reference_sa);
    assert_eq!(block_bucket, reference_bucket);
}
18119
/// Compares the four-thread block variant with the sequential right-to-left
/// scan over a 16,384-entry block that starts at index 20,000; SA and buckets
/// must match.
#[test]
fn final_sorting_scan_right_to_left_8u_block_omp_uses_thread_buckets() {
    let first = 20_000usize;
    let span = 16_384usize;
    let total = first + span + 8;
    let text = vec![1_u8; total];

    // The block holds the consecutive values 2, 3, 4, ...; the rest is zero.
    let mut reference_sa: Vec<SaSint> = vec![0; total];
    for (offset, slot) in reference_sa[first..first + span].iter_mut().enumerate() {
        *slot = (offset + 2) as SaSint;
    }
    let mut parallel_sa = reference_sa.clone();

    // Bucket 1 (the only symbol in the text) starts at the text length.
    let mut reference_bucket = vec![0; ALPHABET_SIZE];
    reference_bucket[1] = total as SaSint;
    let mut parallel_bucket = reference_bucket.clone();

    final_sorting_scan_right_to_left_8u(
        &text,
        &mut reference_sa,
        &mut reference_bucket,
        first as FastSint,
        span as FastSint,
    );

    let mut state = alloc_thread_state(4).unwrap();
    final_sorting_scan_right_to_left_8u_block_omp(
        &text,
        &mut parallel_sa,
        ALPHABET_SIZE as SaSint,
        &mut parallel_bucket,
        first as FastSint,
        span as FastSint,
        4,
        &mut state,
    );

    assert_eq!(parallel_sa, reference_sa);
    assert_eq!(parallel_bucket, reference_bucket);
}
18157
/// Compares the four-thread BWT block variant with the sequential
/// right-to-left BWT scan over a 16,384-entry block that starts at index
/// 20,000; SA and buckets must match.
#[test]
fn final_bwt_right_to_left_8u_block_omp_uses_thread_buckets() {
    let first = 20_000usize;
    let span = 16_384usize;
    let total = first + span + 8;
    let text = vec![1_u8; total];

    // The block holds the consecutive values 2, 3, 4, ...; the rest is zero.
    let mut reference_sa: Vec<SaSint> = vec![0; total];
    for (offset, slot) in reference_sa[first..first + span].iter_mut().enumerate() {
        *slot = (offset + 2) as SaSint;
    }
    let mut parallel_sa = reference_sa.clone();

    // Bucket 1 (the only symbol in the text) starts at the text length.
    let mut reference_bucket = vec![0; ALPHABET_SIZE];
    reference_bucket[1] = total as SaSint;
    let mut parallel_bucket = reference_bucket.clone();

    // The sequential scan's return value is intentionally not compared.
    final_bwt_scan_right_to_left_8u(
        &text,
        &mut reference_sa,
        &mut reference_bucket,
        first as FastSint,
        span as FastSint,
    );

    let mut state = alloc_thread_state(4).unwrap();
    final_bwt_scan_right_to_left_8u_block_omp(
        &text,
        &mut parallel_sa,
        ALPHABET_SIZE as SaSint,
        &mut parallel_bucket,
        first as FastSint,
        span as FastSint,
        4,
        &mut state,
    );

    assert_eq!(parallel_sa, reference_sa);
    assert_eq!(parallel_bucket, reference_bucket);
}
18195
/// Compares the four-thread BWT-aux block variant with the sequential
/// right-to-left scan over a 16,384-entry block that starts at index 20,000;
/// SA, auxiliary output and buckets must all match.
#[test]
fn final_bwt_aux_right_to_left_8u_block_omp_uses_thread_buckets() {
    let first = 20_000usize;
    let span = 16_384usize;
    let total = first + span + 8;
    let text = vec![1_u8; total];

    // The block holds the consecutive values 2, 3, 4, ...; the rest is zero.
    let mut reference_sa: Vec<SaSint> = vec![0; total];
    for (offset, slot) in reference_sa[first..first + span].iter_mut().enumerate() {
        *slot = (offset + 2) as SaSint;
    }
    let mut parallel_sa = reference_sa.clone();

    let mut reference_aux = vec![0; total];
    let mut parallel_aux = vec![0; total];

    // Bucket 1 (the only symbol in the text) starts at the text length.
    let mut reference_bucket = vec![0; ALPHABET_SIZE];
    reference_bucket[1] = total as SaSint;
    let mut parallel_bucket = reference_bucket.clone();

    final_bwt_aux_scan_right_to_left_8u(
        &text,
        &mut reference_sa,
        0,
        &mut reference_aux,
        &mut reference_bucket,
        first as FastSint,
        span as FastSint,
    );

    let mut state = alloc_thread_state(4).unwrap();
    final_bwt_aux_scan_right_to_left_8u_block_omp(
        &text,
        &mut parallel_sa,
        ALPHABET_SIZE as SaSint,
        0,
        &mut parallel_aux,
        &mut parallel_bucket,
        first as FastSint,
        span as FastSint,
        4,
        &mut state,
    );

    assert_eq!(parallel_sa, reference_sa);
    assert_eq!(parallel_aux, reference_aux);
    assert_eq!(parallel_bucket, reference_bucket);
}
18240
/// Compares the four-thread GSA block variant with the sequential
/// right-to-left GSA scan over a 16,384-entry block that starts at index
/// 20,000; SA and buckets must match.
#[test]
fn final_gsa_right_to_left_8u_block_omp_uses_thread_buckets() {
    let first = 20_000usize;
    let span = 16_384usize;
    let total = first + span + 8;
    let text = vec![1_u8; total];

    // The block holds the consecutive values 2, 3, 4, ...; the rest is zero.
    let mut reference_sa: Vec<SaSint> = vec![0; total];
    for (offset, slot) in reference_sa[first..first + span].iter_mut().enumerate() {
        *slot = (offset + 2) as SaSint;
    }
    let mut parallel_sa = reference_sa.clone();

    // Bucket 1 (the only symbol in the text) starts at the text length.
    let mut reference_bucket = vec![0; ALPHABET_SIZE];
    reference_bucket[1] = total as SaSint;
    let mut parallel_bucket = reference_bucket.clone();

    final_gsa_scan_right_to_left_8u(
        &text,
        &mut reference_sa,
        &mut reference_bucket,
        first as FastSint,
        span as FastSint,
    );

    let mut state = alloc_thread_state(4).unwrap();
    final_gsa_scan_right_to_left_8u_block_omp(
        &text,
        &mut parallel_sa,
        ALPHABET_SIZE as SaSint,
        &mut parallel_bucket,
        first as FastSint,
        span as FastSint,
        4,
        &mut state,
    );

    assert_eq!(parallel_sa, reference_sa);
    assert_eq!(parallel_bucket, reference_bucket);
}
18278
/// Runs the entry point with one thread (and an empty thread-state slice) and
/// with two threads on identical inputs; SA and buckets must match.
#[test]
fn final_sorting_scan_right_to_left_8u_omp_matches_sequential_path() {
    let text = vec![0_u8, 1, 2, 1, 0];
    let alphabet = ALPHABET_SIZE as SaSint;

    // Reference run: one thread, no thread state.
    let mut reference_sa = vec![0, 2, 0, 0];
    let mut reference_bucket = vec![1, 2, 3];
    let mut threaded_sa = reference_sa.clone();
    let mut threaded_bucket = reference_bucket.clone();
    final_sorting_scan_right_to_left_8u_omp(
        &text,
        &mut reference_sa,
        0,
        2,
        alphabet,
        &mut reference_bucket,
        1,
        &mut [],
    );

    // Run under test: two threads with allocated thread state.
    let mut state = alloc_thread_state(2).unwrap();
    final_sorting_scan_right_to_left_8u_omp(
        &text,
        &mut threaded_sa,
        0,
        2,
        alphabet,
        &mut threaded_bucket,
        2,
        &mut state,
    );

    assert_eq!(threaded_sa, reference_sa);
    assert_eq!(threaded_bucket, reference_bucket);
}
18313
/// With start/end tables `[1, 4, 5]` / `[3, 5, 5]`, the call must zero SA
/// indices 1, 2 and 4 and leave every other entry untouched.
#[test]
fn clear_lms_suffixes_omp_zeroes_requested_bucket_ranges() {
    let range_starts = vec![1, 4, 5];
    let range_ends = vec![3, 5, 5];
    let mut suffixes = vec![5, 4, 3, 2, 1, 9];
    let len = suffixes.len() as SaSint;

    clear_lms_suffixes_omp(&mut suffixes, len, 3, &range_starts, &range_ends, 2);

    let expected = vec![5, 0, 0, 2, 0, 9];
    assert_eq!(suffixes, expected);
}
18325
/// With `LIBSAIS_FLAGS_NONE`, `induce_final_order_8u_omp` must return 0 and
/// produce the same SA and bucket contents as calling the left-to-right and
/// right-to-left final sorting scans directly with the same starting buckets.
#[test]
fn induce_final_order_8u_omp_non_bwt_matches_direct_final_scans() {
    let text = vec![0_u8, 1, 2, 1, 0];
    let text_len = text.len();
    let alphabet = ALPHABET_SIZE as SaSint;

    // Offsets of the two bucket sections this test seeds and later inspects
    // inside the 8 * ALPHABET_SIZE table.
    let left_base = 6 * ALPHABET_SIZE;
    let right_base = 7 * ALPHABET_SIZE;

    let mut want_left = vec![0, 1, 3];
    let mut want_right = vec![2, 4, 5];
    let mut bucket_table = vec![0; 8 * ALPHABET_SIZE];
    bucket_table[left_base..left_base + 3].copy_from_slice(&want_left);
    bucket_table[right_base..right_base + 3].copy_from_slice(&want_right);

    let mut want_sa = vec![0, 2, 0, 0, 0];
    let mut suffixes = want_sa.clone();

    // Reference: the two single-threaded scans applied back to back.
    final_sorting_scan_left_to_right_8u_omp(
        &text,
        &mut want_sa,
        text_len as FastSint,
        alphabet,
        &mut want_left,
        1,
        &mut [],
    );
    final_sorting_scan_right_to_left_8u_omp(
        &text,
        &mut want_sa,
        0,
        text_len as FastSint,
        alphabet,
        &mut want_right,
        1,
        &mut [],
    );

    let mut state = alloc_thread_state(2).unwrap();
    let status = induce_final_order_8u_omp(
        &text,
        &mut suffixes,
        text_len as SaSint,
        alphabet,
        LIBSAIS_FLAGS_NONE,
        0,
        None,
        &mut bucket_table,
        2,
        &mut state,
    );

    assert_eq!(status, 0);
    assert_eq!(suffixes, want_sa);
    assert_eq!(
        &bucket_table[left_base..left_base + 3],
        want_left.as_slice()
    );
    assert_eq!(
        &bucket_table[right_base..right_base + 3],
        want_right.as_slice()
    );
}
18382
/// For a zeroed text and SA `[0, 2, -1, 5]`, the call must return 1, set
/// `t[0]` and `sa[2]` to `SAINT_MIN` and change `sa[3]` to 4.
#[test]
fn renumber_unique_and_nonunique_lms_suffixes_32s_marks_new_unique_names() {
    let mut text = vec![0, 0, 0, 0];
    let mut suffixes = vec![0, 2, -1, 5];

    let returned =
        renumber_unique_and_nonunique_lms_suffixes_32s(&mut text, &mut suffixes, 2, 0, 0, 2);

    assert_eq!(returned, 1);
    assert_eq!(text[0], SAINT_MIN);
    assert_eq!(&suffixes[2..], &[SAINT_MIN, 4]);
}
18395
/// Runs the Rust function and the C probe on identical copies of the same
/// text and SA, and requires matching return values and buffer contents.
#[test]
fn renumber_unique_and_nonunique_lms_suffixes_32s_matches_upstream_c_helper() {
    let text_seed: [SaSint; 4] = [0; 4];
    let sa_seed: [SaSint; 4] = [0, 2, -1, 5];
    let (mut rust_text, mut rust_sa) = (text_seed.to_vec(), sa_seed.to_vec());
    let (mut c_text, mut c_sa) = (text_seed.to_vec(), sa_seed.to_vec());

    let from_rust =
        renumber_unique_and_nonunique_lms_suffixes_32s(&mut rust_text, &mut rust_sa, 2, 0, 0, 2);
    // SAFETY: both pointers come from live, uniquely borrowed four-element
    // Vecs. NOTE(review): the probe's contract is declared outside this
    // excerpt — confirm it stays within four elements for these arguments.
    let from_c = unsafe {
        probe_renumber_unique_and_nonunique_lms_suffixes_32s(
            c_text.as_mut_ptr(),
            c_sa.as_mut_ptr(),
            2,
            0,
            0,
            2,
        )
    };

    assert_eq!(from_rust, from_c);
    assert_eq!(rust_text, c_text);
    assert_eq!(rust_sa, c_sa);
}
18420
18421    #[test]
18422    fn renumber_unique_and_nonunique_lms_suffixes_32s_omp_matches_upstream_c_helper() {
18423        let mut t_rust = vec![0, 0, 0, 0];
18424        let mut sa_rust = vec![0, 2, -1, 5];
18425        let mut t_c = t_rust.clone();
18426        let mut sa_c = sa_rust.clone();
18427        let mut thread_state = alloc_thread_state(1).unwrap();
18428
18429        let rust_f = renumber_unique_and_nonunique_lms_suffixes_32s_omp(
18430            &mut t_rust,
18431            &mut sa_rust,
18432            2,
18433            1,
18434            &mut thread_state,
18435        );
18436        let c_f = unsafe {
18437            probe_renumber_unique_and_nonunique_lms_suffixes_32s_omp(
18438                t_c.as_mut_ptr(),
18439                sa_c.as_mut_ptr(),
18440                2,
18441                1,
18442            )
18443        };
18444
18445        assert_eq!(rust_f, c_f);
18446        assert_eq!(t_rust, t_c);
18447        assert_eq!(sa_rust, sa_c);
18448    }
18449
18450    #[test]
18451    fn renumber_unique_and_nonunique_lms_suffixes_32s_omp_uses_block_partition() {
18452        let m = 65_600usize;
18453        let n = 2 * m;
18454        let t = vec![0; n];
18455        let mut sa = vec![0; n];
18456        for i in 0..m {
18457            sa[i] = (2 * i) as SaSint;
18458            sa[m + i] = if i % 5 == 0 {
18459                -((i as SaSint) + 1)
18460            } else {
18461                i as SaSint + 7
18462            };
18463        }
18464
18465        let mut single_t = t.clone();
18466        let mut single_sa = sa.clone();
18467        let mut threaded_t = t;
18468        let mut threaded_sa = sa;
18469        let mut thread_state = alloc_thread_state(4).unwrap();
18470        let single_f = renumber_unique_and_nonunique_lms_suffixes_32s(
18471            &mut single_t,
18472            &mut single_sa,
18473            m as SaSint,
18474            0,
18475            0,
18476            m as FastSint,
18477        );
18478        let threaded_f = renumber_unique_and_nonunique_lms_suffixes_32s_omp(
18479            &mut threaded_t,
18480            &mut threaded_sa,
18481            m as SaSint,
18482            4,
18483            &mut thread_state,
18484        );
18485
18486        assert_eq!(threaded_f, single_f);
18487        assert_eq!(threaded_t, single_t);
18488        assert_eq!(threaded_sa, single_sa);
18489    }
18490
18491    #[test]
18492    fn compact_unique_and_nonunique_lms_suffixes_32s_splits_unique_and_nonunique_ranges() {
18493        let mut sa = vec![0, 0, 0, 0, SAINT_MIN, 4];
18494        let mut l = 2;
18495        let mut r = 6;
18496
18497        compact_unique_and_nonunique_lms_suffixes_32s(&mut sa, 2, &mut l, &mut r, 0, 2);
18498
18499        assert_eq!(l, 2);
18500        assert_eq!(r, 6);
18501        assert_eq!(sa[2], 0);
18502        assert_eq!(sa[3] & SAINT_MAX, 0);
18503    }
18504
18505    #[test]
18506    fn compact_lms_suffixes_32s_omp_runs_renumber_then_compaction() {
18507        let mut t = vec![0, 0, 0, 0];
18508        let mut sa = vec![0, 2, -1, 5, 77, 88];
18509        let mut thread_state = alloc_thread_state(2).unwrap();
18510
18511        let f = compact_lms_suffixes_32s_omp(&mut t, &mut sa, 4, 2, 2, 2, &mut thread_state);
18512
18513        assert_eq!(f, 1);
18514        assert_eq!(sa[2] & SAINT_MAX, 0);
18515        assert_eq!(sa[5], 3);
18516    }
18517
18518    #[test]
18519    fn compact_unique_and_nonunique_lms_suffixes_32s_omp_uses_block_partition() {
18520        let n = 131_200usize;
18521        let m = 65_600usize;
18522        let fs = m + 32;
18523        let half_n = n >> 1;
18524        let f = m / 5;
18525        let mut sa = vec![0; n + fs];
18526        for i in 0..half_n {
18527            sa[m + i] = if i % 5 == 0 {
18528                SAINT_MIN | i as SaSint
18529            } else {
18530                i as SaSint + 1
18531            };
18532        }
18533        for i in 0..f {
18534            sa[m - f + i] = (10_000 + i) as SaSint;
18535        }
18536
18537        let mut single = sa.clone();
18538        let mut threaded = sa;
18539        let mut single_state = alloc_thread_state(1).unwrap();
18540        let mut threaded_state = alloc_thread_state(4).unwrap();
18541        compact_unique_and_nonunique_lms_suffixes_32s_omp(
18542            &mut single,
18543            n as SaSint,
18544            m as SaSint,
18545            fs as SaSint,
18546            f as SaSint,
18547            1,
18548            &mut single_state,
18549        );
18550        compact_unique_and_nonunique_lms_suffixes_32s_omp(
18551            &mut threaded,
18552            n as SaSint,
18553            m as SaSint,
18554            fs as SaSint,
18555            f as SaSint,
18556            4,
18557            &mut threaded_state,
18558        );
18559
18560        let unique_dst = n + fs - m;
18561        assert_eq!(
18562            &threaded[unique_dst..unique_dst + f],
18563            &single[unique_dst..unique_dst + f]
18564        );
18565    }
18566
18567    #[test]
18568    fn compact_lms_suffixes_32s_omp_uses_large_input_paths() {
18569        let n = 131_200usize;
18570        let m = 65_600usize;
18571        let fs = m + 32;
18572        let t = vec![0; n];
18573        let mut sa = vec![0; n + fs];
18574        for i in 0..m {
18575            sa[i] = (2 * i) as SaSint;
18576            sa[m + i] = if i % 5 == 0 {
18577                -((i as SaSint) + 1)
18578            } else {
18579                i as SaSint + 7
18580            };
18581        }
18582
18583        let mut single_t = t.clone();
18584        let mut single_sa = sa.clone();
18585        let mut threaded_t = t;
18586        let mut threaded_sa = sa;
18587        let mut single_state = alloc_thread_state(1).unwrap();
18588        let mut threaded_state = alloc_thread_state(4).unwrap();
18589        let single_f = compact_lms_suffixes_32s_omp(
18590            &mut single_t,
18591            &mut single_sa,
18592            n as SaSint,
18593            m as SaSint,
18594            fs as SaSint,
18595            1,
18596            &mut single_state,
18597        );
18598        let threaded_f = compact_lms_suffixes_32s_omp(
18599            &mut threaded_t,
18600            &mut threaded_sa,
18601            n as SaSint,
18602            m as SaSint,
18603            fs as SaSint,
18604            4,
18605            &mut threaded_state,
18606        );
18607
18608        assert_eq!(threaded_f, single_f);
18609        assert_eq!(threaded_t, single_t);
18610        let unique_dst = n + fs - m;
18611        let unique_len = usize::try_from(threaded_f).expect("f must be non-negative");
18612        assert_eq!(
18613            &threaded_sa[unique_dst..unique_dst + unique_len],
18614            &single_sa[unique_dst..unique_dst + unique_len]
18615        );
18616    }
18617
18618    #[test]
18619    fn merge_unique_lms_suffixes_32s_noops_for_empty_block() {
18620        let mut t = vec![1, SAINT_MIN, 2, SAINT_MIN];
18621        let mut sa = vec![0, 0, 1, 3];
18622        let before_t = t.clone();
18623        let before_sa = sa.clone();
18624
18625        merge_unique_lms_suffixes_32s(&mut t, &mut sa, 4, 1, 0, 0, 0);
18626
18627        assert_eq!(t, before_t);
18628        assert_eq!(sa, before_sa);
18629    }
18630
18631    #[test]
18632    fn merge_nonunique_lms_suffixes_32s_noops_for_empty_block() {
18633        let mut sa = vec![0, 7, 0, 13, 11];
18634        let before = sa.clone();
18635
18636        merge_nonunique_lms_suffixes_32s(&mut sa, 4, 1, 0, 0, 0);
18637
18638        assert_eq!(sa, before);
18639    }
18640
18641    #[test]
18642    fn merge_compacted_lms_suffixes_32s_omp_preserves_input_text_and_fills_zero_slots() {
18643        let mut t = vec![1, 2, 3, 4];
18644        let mut sa = vec![0, 1, 2, 3, 4, 5];
18645        let before_t = t.clone();
18646        let mut thread_state = alloc_thread_state(2).unwrap();
18647
18648        merge_compacted_lms_suffixes_32s_omp(&mut t, &mut sa, 4, 1, 1, 2, &mut thread_state);
18649
18650        assert_eq!(t, before_t);
18651        assert_eq!(sa[0], 3);
18652        assert_eq!(sa[1], 1);
18653    }
18654
18655    #[test]
18656    fn merge_unique_lms_suffixes_32s_omp_uses_block_partition_for_large_inputs() {
18657        let n = 65_600usize;
18658        let m = 1_024usize;
18659        let mut t = vec![1; n];
18660        for i in (0..n).step_by(257) {
18661            t[i] = SAINT_MIN | ((i % 251) as SaSint);
18662        }
18663        let f = t.iter().filter(|&&value| value < 0).count();
18664        let mut sa = vec![-1; n];
18665        let src = n - m - 1;
18666        for i in 0..f {
18667            sa[src + i] = i as SaSint;
18668        }
18669
18670        let mut single_t = t.clone();
18671        let mut single_sa = sa.clone();
18672        let mut threaded_t = t;
18673        let mut threaded_sa = sa;
18674        let mut thread_state = alloc_thread_state(4).unwrap();
18675        merge_unique_lms_suffixes_32s_omp(
18676            &mut single_t,
18677            &mut single_sa,
18678            n as SaSint,
18679            m as SaSint,
18680            1,
18681            &mut [],
18682        );
18683        merge_unique_lms_suffixes_32s_omp(
18684            &mut threaded_t,
18685            &mut threaded_sa,
18686            n as SaSint,
18687            m as SaSint,
18688            4,
18689            &mut thread_state,
18690        );
18691
18692        assert_eq!(threaded_t, single_t);
18693        assert_eq!(threaded_sa, single_sa);
18694    }
18695
18696    #[test]
18697    fn merge_nonunique_lms_suffixes_32s_omp_uses_block_partition_for_large_inputs() {
18698        let n = 131_200usize;
18699        let m = 65_600usize;
18700        let f = 7usize;
18701        let mut sa = vec![1; n];
18702        let zero_count = (0..m).filter(|i| i % 17 == 0).count();
18703        for i in (0..m).step_by(17) {
18704            sa[i] = 0;
18705        }
18706        let src = n - m - 1 + f;
18707        for i in 0..zero_count {
18708            sa[src + i] = 10_000 + i as SaSint;
18709        }
18710
18711        let mut single = sa.clone();
18712        let mut threaded = sa;
18713        let mut thread_state = alloc_thread_state(4).unwrap();
18714        merge_nonunique_lms_suffixes_32s_omp(
18715            &mut single,
18716            n as SaSint,
18717            m as SaSint,
18718            f as SaSint,
18719            1,
18720            &mut [],
18721        );
18722        merge_nonunique_lms_suffixes_32s_omp(
18723            &mut threaded,
18724            n as SaSint,
18725            m as SaSint,
18726            f as SaSint,
18727            4,
18728            &mut thread_state,
18729        );
18730
18731        assert_eq!(threaded, single);
18732    }
18733
18734    #[test]
18735    fn merge_compacted_lms_suffixes_32s_omp_uses_block_partition_for_large_inputs() {
18736        let n = 131_200usize;
18737        let m = 65_600usize;
18738        let mut t = vec![1; n];
18739        for i in (0..n).step_by(257) {
18740            t[i] = SAINT_MIN | ((i % 251) as SaSint);
18741        }
18742        let f = t.iter().filter(|&&value| value < 0).count();
18743
18744        let mut sa = vec![1; n];
18745        let zero_count = (0..m).filter(|i| i % 17 == 0).count();
18746        for i in (0..m).step_by(17) {
18747            sa[i] = 0;
18748        }
18749        let unique_src = n - m - 1;
18750        for i in 0..f {
18751            sa[unique_src + i] = i as SaSint;
18752        }
18753        for i in 0..zero_count {
18754            sa[unique_src + f + i] = 10_000 + i as SaSint;
18755        }
18756
18757        let mut single_t = t.clone();
18758        let mut single_sa = sa.clone();
18759        let mut threaded_t = t;
18760        let mut threaded_sa = sa;
18761        let mut single_state = alloc_thread_state(1).unwrap();
18762        let mut threaded_state = alloc_thread_state(4).unwrap();
18763        merge_compacted_lms_suffixes_32s_omp(
18764            &mut single_t,
18765            &mut single_sa,
18766            n as SaSint,
18767            m as SaSint,
18768            f as SaSint,
18769            1,
18770            &mut single_state,
18771        );
18772        merge_compacted_lms_suffixes_32s_omp(
18773            &mut threaded_t,
18774            &mut threaded_sa,
18775            n as SaSint,
18776            m as SaSint,
18777            f as SaSint,
18778            4,
18779            &mut threaded_state,
18780        );
18781
18782        assert_eq!(threaded_t, single_t);
18783        assert_eq!(threaded_sa, single_sa);
18784    }
18785
18786    #[test]
18787    fn bwt_copy_8u_copies_low_bytes_from_suffix_array_storage() {
18788        let a = vec![65, 255, 256, -1];
18789        let mut u = vec![0_u8; 4];
18790
18791        bwt_copy_8u(&mut u, &a, 4);
18792
18793        assert_eq!(u, vec![65, 255, 0, 255]);
18794    }
18795
18796    #[test]
18797    fn bwt_copy_8u_omp_matches_sequential_copy() {
18798        let a = vec![1, 2, 3, 4, 5];
18799        let mut u = vec![0_u8; 5];
18800
18801        bwt_copy_8u_omp(&mut u, &a, 5, 4);
18802
18803        assert_eq!(u, vec![1, 2, 3, 4, 5]);
18804    }
18805
18806    #[test]
18807    fn bwt_copy_8u_omp_uses_block_partition_for_large_inputs() {
18808        let n = 65_600usize;
18809        let a: Vec<SaSint> = (0..n).map(|i| (i * 17) as SaSint).collect();
18810        let mut threaded = vec![0; n];
18811        let mut sequential = vec![0; n];
18812
18813        bwt_copy_8u_omp(&mut threaded, &a, n as SaSint, 4);
18814        bwt_copy_8u(&mut sequential, &a, n as SaSint);
18815
18816        assert_eq!(threaded, sequential);
18817    }
18818
18819    #[test]
18820    fn plcp_lcp_omp_wrappers_match_single_thread_on_large_inputs() {
18821        let n = 65_600usize;
18822        let text: Vec<u8> = (0..n).map(|i| (1 + (i % 251)) as u8).collect();
18823        let sa: Vec<SaSint> = (0..n as SaSint).collect();
18824
18825        let mut plcp_single = vec![0; n];
18826        let mut plcp_threaded = vec![0; n];
18827        compute_phi_omp(&sa, &mut plcp_single, n as SaSint, 1);
18828        compute_phi_omp(&sa, &mut plcp_threaded, n as SaSint, 4);
18829        assert_eq!(plcp_threaded, plcp_single);
18830
18831        compute_plcp_omp(&text, &mut plcp_single, n as SaSint, 1);
18832        compute_plcp_omp(&text, &mut plcp_threaded, n as SaSint, 4);
18833        assert_eq!(plcp_threaded, plcp_single);
18834
18835        let mut lcp_single = vec![0; n];
18836        let mut lcp_threaded = vec![0; n];
18837        compute_lcp_omp(&plcp_single, &sa, &mut lcp_single, n as SaSint, 1);
18838        compute_lcp_omp(&plcp_threaded, &sa, &mut lcp_threaded, n as SaSint, 4);
18839        assert_eq!(lcp_threaded, lcp_single);
18840    }
18841
18842    #[test]
18843    fn count_and_gather_lms_suffixes_8u_omp_preserves_sequential_wrapper_behavior() {
18844        let t = vec![2_u8, 1, 3, 1, 0];
18845        let mut sa = vec![0; t.len()];
18846        let mut buckets = vec![0; 4 * ALPHABET_SIZE];
18847        let mut thread_state = alloc_thread_state(2).unwrap();
18848        let m = count_and_gather_lms_suffixes_8u_omp(
18849            &t,
18850            &mut sa,
18851            t.len() as SaSint,
18852            &mut buckets,
18853            2,
18854            &mut thread_state,
18855        );
18856        assert_eq!(m, 1);
18857        assert_eq!(buckets.iter().sum::<SaSint>(), t.len() as SaSint);
18858    }
18859
18860    #[test]
18861    fn count_and_gather_lms_suffixes_8u_omp_uses_block_partition_for_large_inputs() {
18862        let n = 65_600usize;
18863        let text: Vec<u8> = (0..n)
18864            .map(|i| 1 + ((i * 37 + i / 17) % 251) as u8)
18865            .collect();
18866
18867        let mut sa_threaded = vec![-99; n];
18868        let mut sa_scalar = vec![-99; n];
18869        let mut buckets_threaded = vec![0; 4 * ALPHABET_SIZE];
18870        let mut buckets_scalar = vec![0; 4 * ALPHABET_SIZE];
18871        let mut thread_state = alloc_thread_state(4).unwrap();
18872
18873        let m_threaded = count_and_gather_lms_suffixes_8u_omp(
18874            &text,
18875            &mut sa_threaded,
18876            n as SaSint,
18877            &mut buckets_threaded,
18878            4,
18879            &mut thread_state,
18880        );
18881        let m_scalar = count_and_gather_lms_suffixes_8u(
18882            &text,
18883            &mut sa_scalar,
18884            n as SaSint,
18885            &mut buckets_scalar,
18886            0,
18887            n as FastSint,
18888        );
18889
18890        assert_eq!(m_threaded, m_scalar);
18891        assert_eq!(
18892            &sa_threaded[n - m_threaded as usize..],
18893            &sa_scalar[n - m_scalar as usize..]
18894        );
18895        assert_eq!(buckets_threaded, buckets_scalar);
18896    }
18897
18898    #[test]
18899    fn gather_lms_suffixes_8u_omp_uses_thread_state_for_large_inputs() {
18900        let n = 65_600usize;
18901        let text: Vec<u8> = (0..n)
18902            .map(|i| 1 + ((i * 37 + i / 17) % 251) as u8)
18903            .collect();
18904        let mut thread_state = alloc_thread_state(4).unwrap();
18905        let mut count_sa = vec![-99; n];
18906        let mut buckets = vec![0; 4 * ALPHABET_SIZE];
18907        let m = count_and_gather_lms_suffixes_8u_omp(
18908            &text,
18909            &mut count_sa,
18910            n as SaSint,
18911            &mut buckets,
18912            4,
18913            &mut thread_state,
18914        );
18915
18916        let mut threaded = vec![-99; n];
18917        let mut scalar = vec![-99; n];
18918        gather_lms_suffixes_8u_omp(&text, &mut threaded, n as SaSint, 4, &mut thread_state);
18919        gather_lms_suffixes_8u(
18920            &text,
18921            &mut scalar,
18922            n as SaSint,
18923            n as FastSint - 1,
18924            0,
18925            n as FastSint,
18926        );
18927
18928        assert_eq!(&threaded[n - m as usize..], &scalar[n - m as usize..]);
18929    }
18930
18931    #[test]
18932    fn count_and_gather_lms_suffixes_32s_4k_updates_counts_and_suffixes() {
18933        let t = vec![2, 1, 3, 1, 0];
18934        let mut sa = vec![0; t.len()];
18935        let mut buckets = vec![0; 4 * 4];
18936        let m = count_and_gather_lms_suffixes_32s_4k(
18937            &t,
18938            &mut sa,
18939            t.len() as SaSint,
18940            4,
18941            &mut buckets,
18942            0,
18943            t.len() as FastSint,
18944        );
18945        assert!(m >= 0);
18946        assert_eq!(buckets.iter().sum::<SaSint>(), t.len() as SaSint);
18947    }
18948
18949    #[test]
18950    fn count_and_gather_lms_suffixes_32s_2k_updates_counts_and_suffixes() {
18951        let t = vec![2, 1, 3, 1, 0];
18952        let mut sa = vec![0; t.len()];
18953        let mut buckets = vec![0; 2 * 4];
18954        let m = count_and_gather_lms_suffixes_32s_2k(
18955            &t,
18956            &mut sa,
18957            t.len() as SaSint,
18958            4,
18959            &mut buckets,
18960            0,
18961            t.len() as FastSint,
18962        );
18963        assert!(m >= 0);
18964        assert_eq!(buckets.iter().sum::<SaSint>(), t.len() as SaSint);
18965    }
18966
18967    #[test]
18968    fn count_and_gather_compacted_lms_suffixes_32s_2k_updates_counts_and_suffixes() {
18969        let t = vec![2, SAINT_MIN | 1, 3, 1, 0];
18970        let mut sa = vec![0; t.len()];
18971        let mut buckets = vec![0; 2 * 4];
18972        let m = count_and_gather_compacted_lms_suffixes_32s_2k(
18973            &t,
18974            &mut sa,
18975            t.len() as SaSint,
18976            4,
18977            &mut buckets,
18978            0,
18979            t.len() as FastSint,
18980        );
18981        assert!(m >= 0);
18982        assert_eq!(buckets.iter().sum::<SaSint>(), t.len() as SaSint);
18983    }
18984
18985    #[test]
18986    fn count_and_gather_lms_suffixes_32s_4k_nofs_omp_wraps_sequential_version() {
18987        let t = vec![2, 1, 3, 1, 0];
18988        let mut sa = vec![0; t.len()];
18989        let mut buckets = vec![0; 4 * 4];
18990        let m = count_and_gather_lms_suffixes_32s_4k_nofs_omp(
18991            &t,
18992            &mut sa,
18993            t.len() as SaSint,
18994            4,
18995            &mut buckets,
18996            2,
18997        );
18998        assert!(m >= 0);
18999        assert_eq!(buckets.iter().sum::<SaSint>(), t.len() as SaSint);
19000    }
19001
19002    #[test]
19003    fn count_and_gather_lms_suffixes_32s_2k_nofs_omp_wraps_sequential_version() {
19004        let t = vec![2, 1, 3, 1, 0];
19005        let mut sa = vec![0; t.len()];
19006        let mut buckets = vec![0; 2 * 4];
19007        let m = count_and_gather_lms_suffixes_32s_2k_nofs_omp(
19008            &t,
19009            &mut sa,
19010            t.len() as SaSint,
19011            4,
19012            &mut buckets,
19013            2,
19014        );
19015        assert!(m >= 0);
19016        assert_eq!(buckets.iter().sum::<SaSint>(), t.len() as SaSint);
19017    }
19018
19019    #[test]
19020    fn count_and_gather_compacted_lms_suffixes_32s_2k_nofs_omp_wraps_sequential_version() {
19021        let t = vec![2, SAINT_MIN | 1, 3, 1, 0];
19022        let mut sa = vec![0; t.len()];
19023        let mut buckets = vec![0; 2 * 4];
19024        let m = count_and_gather_compacted_lms_suffixes_32s_2k_nofs_omp(
19025            &t,
19026            &mut sa,
19027            t.len() as SaSint,
19028            4,
19029            &mut buckets,
19030            2,
19031        );
19032        assert!(m >= 0);
19033        assert_eq!(buckets.iter().sum::<SaSint>(), t.len() as SaSint);
19034    }
19035
19036    #[test]
19037    fn count_and_gather_lms_suffixes_32s_nofs_omp_uses_large_input_paths() {
19038        let n = 65_600usize;
19039        let k = 257usize;
19040        let text: Vec<SaSint> = (0..n)
19041            .map(|i| 1 + ((i * 37 + i / 17) % (k - 1)) as SaSint)
19042            .collect();
19043
19044        let mut sa_threaded = vec![-99; n];
19045        let mut sa_scalar = vec![-99; n];
19046        let mut buckets_threaded = vec![0; 4 * k];
19047        let mut buckets_scalar = vec![0; 4 * k];
19048        let m_threaded = count_and_gather_lms_suffixes_32s_4k_nofs_omp(
19049            &text,
19050            &mut sa_threaded,
19051            n as SaSint,
19052            k as SaSint,
19053            &mut buckets_threaded,
19054            4,
19055        );
19056        let m_scalar = count_and_gather_lms_suffixes_32s_4k(
19057            &text,
19058            &mut sa_scalar,
19059            n as SaSint,
19060            k as SaSint,
19061            &mut buckets_scalar,
19062            0,
19063            n as FastSint,
19064        );
19065        assert_eq!(m_threaded, m_scalar);
19066        assert_eq!(
19067            &sa_threaded[n - m_threaded as usize..],
19068            &sa_scalar[n - m_scalar as usize..]
19069        );
19070        assert_eq!(buckets_threaded, buckets_scalar);
19071
19072        let mut sa_threaded = vec![-99; n];
19073        let mut sa_scalar = vec![-99; n];
19074        let mut buckets_threaded = vec![0; 2 * k];
19075        let mut buckets_scalar = vec![0; 2 * k];
19076        let m_threaded = count_and_gather_lms_suffixes_32s_2k_nofs_omp(
19077            &text,
19078            &mut sa_threaded,
19079            n as SaSint,
19080            k as SaSint,
19081            &mut buckets_threaded,
19082            4,
19083        );
19084        let m_scalar = count_and_gather_lms_suffixes_32s_2k(
19085            &text,
19086            &mut sa_scalar,
19087            n as SaSint,
19088            k as SaSint,
19089            &mut buckets_scalar,
19090            0,
19091            n as FastSint,
19092        );
19093        assert_eq!(m_threaded, m_scalar);
19094        assert_eq!(
19095            &sa_threaded[n - m_threaded as usize..],
19096            &sa_scalar[n - m_scalar as usize..]
19097        );
19098        assert_eq!(buckets_threaded, buckets_scalar);
19099    }
19100
19101    #[test]
19102    fn count_and_gather_lms_suffixes_32s_fs_omp_uses_large_input_paths() {
19103        let n = 65_600usize;
19104        let k = 257usize;
19105        let text: Vec<SaSint> = (0..n)
19106            .map(|i| 1 + ((i * 37 + i / 17) % (k - 1)) as SaSint)
19107            .collect();
19108        let mut thread_state = alloc_thread_state(4).unwrap();
19109
19110        let mut sa_threaded = vec![-99; n];
19111        let mut sa_scalar = vec![-99; n];
19112        let mut buckets_threaded = vec![0; 4 * k];
19113        let mut buckets_scalar = vec![0; 4 * k];
19114        let m_threaded = count_and_gather_lms_suffixes_32s_4k_fs_omp(
19115            &text,
19116            &mut sa_threaded,
19117            n as SaSint,
19118            k as SaSint,
19119            &mut buckets_threaded,
19120            0,
19121            4,
19122            &mut thread_state,
19123        );
19124        let m_scalar = count_and_gather_lms_suffixes_32s_4k(
19125            &text,
19126            &mut sa_scalar,
19127            n as SaSint,
19128            k as SaSint,
19129            &mut buckets_scalar,
19130            0,
19131            n as FastSint,
19132        );
19133        assert_eq!(m_threaded, m_scalar);
19134        assert_eq!(
19135            &sa_threaded[n - m_threaded as usize..],
19136            &sa_scalar[n - m_scalar as usize..]
19137        );
19138        assert_eq!(buckets_threaded, buckets_scalar);
19139
19140        let mut sa_threaded = vec![-99; n];
19141        let mut sa_scalar = vec![-99; n];
19142        let mut buckets_threaded = vec![0; 2 * k];
19143        let mut buckets_scalar = vec![0; 2 * k];
19144        let m_threaded = count_and_gather_lms_suffixes_32s_2k_fs_omp(
19145            &text,
19146            &mut sa_threaded,
19147            n as SaSint,
19148            k as SaSint,
19149            &mut buckets_threaded,
19150            0,
19151            4,
19152            &mut thread_state,
19153        );
19154        let m_scalar = count_and_gather_lms_suffixes_32s_2k(
19155            &text,
19156            &mut sa_scalar,
19157            n as SaSint,
19158            k as SaSint,
19159            &mut buckets_scalar,
19160            0,
19161            n as FastSint,
19162        );
19163        assert_eq!(m_threaded, m_scalar);
19164        assert_eq!(
19165            &sa_threaded[n - m_threaded as usize..],
19166            &sa_scalar[n - m_scalar as usize..]
19167        );
19168        assert_eq!(buckets_threaded, buckets_scalar);
19169    }
19170
19171    #[test]
19172    fn count_and_gather_compacted_lms_suffixes_32s_nofs_omp_uses_large_input_path() {
19173        let n = 65_600usize;
19174        let k = 257usize;
19175        let text: Vec<SaSint> = (0..n)
19176            .map(|i| {
19177                let value = 1 + ((i * 37 + i / 17) % (k - 1)) as SaSint;
19178                if i % 19 == 0 {
19179                    value | SAINT_MIN
19180                } else {
19181                    value
19182                }
19183            })
19184            .collect();
19185
19186        let mut sa_threaded = vec![-99; n];
19187        let mut sa_split = vec![-99; n];
19188        let mut buckets_threaded = vec![0; 2 * k];
19189        let mut buckets_split = vec![0; 2 * k];
19190        let m_threaded = count_and_gather_compacted_lms_suffixes_32s_2k_nofs_omp(
19191            &text,
19192            &mut sa_threaded,
19193            n as SaSint,
19194            k as SaSint,
19195            &mut buckets_threaded,
19196            4,
19197        );
19198        count_compacted_lms_suffixes_32s_2k(&text, n as SaSint, k as SaSint, &mut buckets_split);
19199        let m_split = gather_compacted_lms_suffixes_32s(&text, &mut sa_split, n as SaSint);
19200
19201        assert_eq!(m_threaded, m_split);
19202        assert_eq!(
19203            &sa_threaded[n - m_threaded as usize..],
19204            &sa_split[n - m_split as usize..]
19205        );
19206        assert_eq!(buckets_threaded, buckets_split);
19207    }
19208
19209    #[test]
19210    fn count_and_gather_compacted_lms_suffixes_32s_fs_omp_uses_large_input_path() {
19211        let n = 65_600usize;
19212        let k = 257usize;
19213        let text: Vec<SaSint> = (0..n)
19214            .map(|i| {
19215                let value = 1 + ((i * 37 + i / 17) % (k - 1)) as SaSint;
19216                if i % 19 == 0 {
19217                    value | SAINT_MIN
19218                } else {
19219                    value
19220                }
19221            })
19222            .collect();
19223
19224        let mut sa_threaded = vec![-99; 2 * n];
19225        let mut sa_scalar = vec![-99; n];
19226        let mut buckets_threaded = vec![0; 2 * k];
19227        let mut buckets_scalar = vec![0; 2 * k];
19228        let mut thread_state = alloc_thread_state(4).unwrap();
19229        count_and_gather_compacted_lms_suffixes_32s_2k_fs_omp(
19230            &text,
19231            &mut sa_threaded,
19232            n as SaSint,
19233            k as SaSint,
19234            &mut buckets_threaded,
19235            0,
19236            4,
19237            &mut thread_state,
19238        );
19239        let m_scalar = count_and_gather_compacted_lms_suffixes_32s_2k(
19240            &text,
19241            &mut sa_scalar,
19242            n as SaSint,
19243            k as SaSint,
19244            &mut buckets_scalar,
19245            0,
19246            n as FastSint,
19247        );
19248
19249        assert_eq!(
19250            &sa_threaded[n - m_scalar as usize..n],
19251            &sa_scalar[n - m_scalar as usize..]
19252        );
19253        assert_eq!(buckets_threaded, buckets_scalar);
19254    }
19255
19256    #[test]
19257    fn accumulate_counts_helpers_match_prefix_bucket_addition() {
19258        let mut bucket00 = vec![4, 5, 6];
19259        let bucket01 = vec![1, 2, 3];
19260        let bucket02 = vec![7, 8, 9];
19261        let bucket03 = vec![10, 11, 12];
19262        let bucket04 = vec![13, 14, 15];
19263        let bucket05 = vec![16, 17, 18];
19264        let bucket06 = vec![19, 20, 21];
19265        let bucket07 = vec![22, 23, 24];
19266        let bucket08 = vec![25, 26, 27];
19267
19268        accumulate_counts_s32_2(&mut bucket00, &bucket01);
19269        assert_eq!(bucket00, vec![5, 7, 9]);
19270
19271        accumulate_counts_s32_3(&mut bucket00, &bucket01, &bucket02);
19272        assert_eq!(bucket00, vec![13, 17, 21]);
19273
19274        accumulate_counts_s32_4(&mut bucket00, &bucket01, &bucket02, &bucket03);
19275        assert_eq!(bucket00, vec![31, 38, 45]);
19276
19277        accumulate_counts_s32_5(&mut bucket00, &bucket01, &bucket02, &bucket03, &bucket04);
19278        assert_eq!(bucket00, vec![62, 73, 84]);
19279
19280        accumulate_counts_s32_6(
19281            &mut bucket00,
19282            &bucket01,
19283            &bucket02,
19284            &bucket03,
19285            &bucket04,
19286            &bucket05,
19287        );
19288        assert_eq!(bucket00, vec![109, 125, 141]);
19289
19290        accumulate_counts_s32_7(
19291            &mut bucket00,
19292            &bucket01,
19293            &bucket02,
19294            &bucket03,
19295            &bucket04,
19296            &bucket05,
19297            &bucket06,
19298        );
19299        assert_eq!(bucket00, vec![175, 197, 219]);
19300
19301        accumulate_counts_s32_8(
19302            &mut bucket00,
19303            &bucket01,
19304            &bucket02,
19305            &bucket03,
19306            &bucket04,
19307            &bucket05,
19308            &bucket06,
19309            &bucket07,
19310        );
19311        assert_eq!(bucket00, vec![263, 292, 321]);
19312
19313        accumulate_counts_s32_9(
19314            &mut bucket00,
19315            &bucket01,
19316            &bucket02,
19317            &bucket03,
19318            &bucket04,
19319            &bucket05,
19320            &bucket06,
19321            &bucket07,
19322            &bucket08,
19323        );
19324        assert_eq!(bucket00, vec![376, 413, 450]);
19325    }
19326
19327    #[test]
19328    fn accumulate_counts_s32_matches_c_dispatch_for_small_bucket_counts() {
19329        let mut buckets = vec![1, 2, 3, 4, 5, 6, 7, 8];
19330        accumulate_counts_s32(&mut buckets, 2, 2, 4);
19331        assert_eq!(buckets, vec![1, 2, 3, 4, 5, 6, 16, 20]);
19332    }
19333
19334    #[test]
19335    fn accumulate_counts_s32_matches_c_dispatch_for_nine_buckets() {
19336        let mut buckets = vec![
19337            1, 10, 2, 20, 3, 30, 4, 40, 5, 50, 6, 60, 7, 70, 8, 80, 9, 90,
19338        ];
19339        accumulate_counts_s32(&mut buckets, 2, 2, 9);
19340        assert_eq!(
19341            buckets,
19342            vec![1, 10, 2, 20, 3, 30, 4, 40, 5, 50, 6, 60, 7, 70, 8, 80, 45, 450]
19343        );
19344    }
19345
19346    #[test]
19347    fn accumulate_counts_s32_matches_c_chunked_nine_then_tail_behavior() {
19348        let mut buckets = (1..=11).collect::<Vec<SaSint>>();
19349        accumulate_counts_s32(&mut buckets, 1, 1, 11);
19350        assert_eq!(buckets, vec![1, 2, 3, 4, 5, 6, 7, 8, 45, 10, 66]);
19351    }
19352}