Skip to main content

caps_sa/
lib.rs

1//! Cache-friendly, parallel, sample-sort-based suffix array construction.
2//!
3//! This crate is a Rust port of [CaPS-SA] (Khan et al., WABI 2023), a parallel
4//! and cache-friendly suffix-array constructor based on sample sort with
5//! LCP-enhanced comparison.
6//!
7//! The crate is generic over the symbol type (`u8`, `u16`, …; any `Ord + Copy`)
8//! and the index type (`u32`, `u64`; via the [`Index`] trait). It produces a
9//! standard lexicographic suffix array. Callers who need a *generalized* suffix
10//! array (multiple strings, sentinel-terminated) should rewrite their text with
11//! distinct sentinels before invoking — the resulting standard SA over the
12//! transformed text is then the generalized SA they want.
13//!
14//! Phase 1 of the port provides the **in-memory** algorithm; the external-memory
15//! variant (disk-spilling buckets) is layered on top in a later phase.
16//!
17//! [CaPS-SA]: https://github.com/jamshed/CaPS-SA
18
19mod ext_bucket;
20mod ext_mem;
21mod lcp;
22mod limits;
23mod sample_sort;
24
25pub use ext_mem::{
26    ExtMemOpts, build_ext_mem, build_ext_mem_for_filter, build_ext_mem_for_filter_with,
27    build_ext_mem_for_positions, build_ext_mem_for_positions_with, build_ext_mem_with,
28    build_in_memory_sample_sort, build_in_memory_sample_sort_for_positions,
29    build_in_memory_sample_sort_for_positions_with, build_in_memory_sample_sort_with,
30};
31pub use lcp::{LcpDispatch, Symbol, lcp, lcp_scalar, lcp_u8, suffix_cmp};
32pub use limits::{LimitProvider, PlainText, SegmentedText};
33pub use sample_sort::{
34    Opts, build_in_memory, build_in_memory_for_positions, build_in_memory_for_positions_with,
35    build_in_memory_for_positions_with_opts, build_in_memory_with, build_in_memory_with_opts,
36};
37
38/// Trait implemented by integer types usable as suffix array indices.
39///
40/// Provided for `u32`, `u64`, and `usize`. Callers pick the narrowest type
41/// large enough to address their text.
42pub trait Index:
43    Copy
44    + Eq
45    + Ord
46    + Send
47    + Sync
48    + std::fmt::Debug
49    + std::ops::Add<Output = Self>
50    + std::ops::Sub<Output = Self>
51{
52    /// Convert from `usize`. Panics if the value does not fit.
53    fn from_usize(v: usize) -> Self;
54    /// Convert to `usize`. Lossless for `u32`/`u64`/`usize` on 64-bit targets.
55    fn to_usize(self) -> usize;
56    /// The zero value.
57    fn zero() -> Self;
58}
59
60macro_rules! impl_index {
61    ($t:ty) => {
62        impl Index for $t {
63            #[inline]
64            fn from_usize(v: usize) -> Self {
65                v as $t
66            }
67            #[inline]
68            fn to_usize(self) -> usize {
69                self as usize
70            }
71            #[inline]
72            fn zero() -> Self {
73                0
74            }
75        }
76    };
77}
78
79impl_index!(u32);
80impl_index!(u64);
81impl_index!(usize);