caps_sa/lib.rs
1//! Cache-friendly, parallel, sample-sort-based suffix array construction.
2//!
3//! This crate is a Rust port of [CaPS-SA] (Khan et al., WABI 2023), a parallel
4//! and cache-friendly suffix-array constructor based on sample sort with
5//! LCP-enhanced comparison.
6//!
7//! The crate is generic over the symbol type (`u8`, `u16`, …; any `Ord + Copy`)
8//! and the index type (`u32`, `u64`; via the [`Index`] trait). It produces a
9//! standard lexicographic suffix array. Callers who need a *generalized* suffix
10//! array (multiple strings, sentinel-terminated) should rewrite their text with
11//! distinct sentinels before invoking — the resulting standard SA over the
12//! transformed text is then the generalized SA they want.
13//!
14//! Phase 1 of the port provides the **in-memory** algorithm; the external-memory
15//! variant (disk-spilling buckets) is layered on top in a later phase.
16//!
17//! [CaPS-SA]: https://github.com/jamshed/CaPS-SA
18
19mod ext_bucket;
20mod ext_mem;
21mod lcp;
22mod limits;
23mod sample_sort;
24
25pub use ext_mem::{
26 ExtMemOpts, build_ext_mem, build_ext_mem_for_positions, build_ext_mem_for_positions_with,
27 build_ext_mem_with, build_in_memory_sample_sort, build_in_memory_sample_sort_for_positions,
28 build_in_memory_sample_sort_for_positions_with, build_in_memory_sample_sort_with,
29};
30pub use lcp::{LcpDispatch, Symbol, lcp, lcp_scalar, lcp_u8, suffix_cmp};
31pub use limits::{LimitProvider, PlainText, SegmentedText};
32pub use sample_sort::{
33 Opts, build_in_memory, build_in_memory_for_positions, build_in_memory_for_positions_with,
34 build_in_memory_for_positions_with_opts, build_in_memory_with, build_in_memory_with_opts,
35};
36
37/// Trait implemented by integer types usable as suffix array indices.
38///
39/// Provided for `u32`, `u64`, and `usize`. Callers pick the narrowest type
40/// large enough to address their text.
41pub trait Index:
42 Copy
43 + Eq
44 + Ord
45 + Send
46 + Sync
47 + std::fmt::Debug
48 + std::ops::Add<Output = Self>
49 + std::ops::Sub<Output = Self>
50{
51 /// Convert from `usize`. Panics if the value does not fit.
52 fn from_usize(v: usize) -> Self;
53 /// Convert to `usize`. Lossless for `u32`/`u64`/`usize` on 64-bit targets.
54 fn to_usize(self) -> usize;
55 /// The zero value.
56 fn zero() -> Self;
57}
58
59macro_rules! impl_index {
60 ($t:ty) => {
61 impl Index for $t {
62 #[inline]
63 fn from_usize(v: usize) -> Self {
64 v as $t
65 }
66 #[inline]
67 fn to_usize(self) -> usize {
68 self as usize
69 }
70 #[inline]
71 fn zero() -> Self {
72 0
73 }
74 }
75 };
76}
77
78impl_index!(u32);
79impl_index!(u64);
80impl_index!(usize);