1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
use num_traits::ToPrimitive;
use rayon::prelude::*;
use sacabase::{LongestCommonSubstring, StringIndex, SuffixArray};
pub struct PartitionedSuffixArray<'a, Index>
where
Index: ToPrimitive + Send,
{
partition_size: usize,
text: &'a [u8],
sas: Vec<SuffixArray<'a, Index>>,
}
impl<'a, Index> PartitionedSuffixArray<'a, Index>
where
Index: ToPrimitive + Send,
{
pub fn new<F>(text: &'a [u8], num_partitions: usize, f: F) -> Self
where
F: Fn(&'a [u8]) -> SuffixArray<'a, Index> + Sync,
{
let partition_size = text.len() / num_partitions + 1;
let mut sas: Vec<_> = text
.par_chunks(text.len() / num_partitions + 1)
.enumerate()
.map(|(i, chunk)| (i, f(chunk)))
.collect();
sas.sort_by(|(i, _), (j, _)| i.cmp(j));
let sas = sas.into_iter().map(|(_, chunk)| chunk).collect();
Self {
partition_size,
text,
sas,
}
}
pub fn num_partitions(&self) -> usize {
self.sas.len()
}
}
impl<'a, Index> StringIndex<'a> for PartitionedSuffixArray<'a, Index>
where
Index: ToPrimitive + Send,
{
fn longest_substring_match(&self, needle: &[u8]) -> LongestCommonSubstring<'a> {
let mut best_lcs: Option<LongestCommonSubstring> = None;
for (i, sa) in self.sas.iter().enumerate() {
let mut lcs = sa.longest_substring_match(needle);
let offset = i * self.partition_size;
let may_extend = lcs.start + lcs.len == sa.text().len();
lcs.start += offset;
lcs.text = self.text;
if may_extend {
lcs.len = sacabase::common_prefix_len(&self.text[lcs.start..], needle);
}
let replace = match best_lcs {
None => true,
Some(ref prev_lcs) => lcs.len > prev_lcs.len,
};
if replace {
best_lcs.replace(lcs);
}
}
best_lcs.expect(
"partitioned suffix arrays should always find at least one longest common substring",
)
}
}
#[cfg(test)]
mod tests {
use super::*;
use sacabase::StringIndex;
#[test]
fn worse_test() {
let input = "totor";
let sa_full = divsufsort::sort(input.as_bytes());
let sa_part = PartitionedSuffixArray::new(input.as_bytes(), 2, divsufsort::sort);
let needle = "tor";
let full_match = sa_full.longest_substring_match(needle.as_bytes());
assert_eq!(needle.as_bytes(), full_match.as_bytes());
let part_match = sa_part.longest_substring_match(needle.as_bytes());
assert_eq!(needle[..2].as_bytes(), part_match.as_bytes());
let needle = "otor";
let full_match = sa_full.longest_substring_match(needle.as_bytes());
assert_eq!(needle.as_bytes(), full_match.as_bytes());
let part_match = sa_part.longest_substring_match(needle.as_bytes());
assert_eq!(needle.as_bytes(), part_match.as_bytes());
}
#[test]
fn equivalent_test() {
let input = "This is a rather long text. We can probably find matches that span two partitions. Oh yes.";
let sa_full = divsufsort::sort(input.as_bytes());
for &partitions in &[1, 2, 3] {
println!("{} partitions", partitions);
for needle in &[
"rather long",
"text. We can",
"We can probably find matches that span",
] {
println!("needle: {:?}", needle);
let sa_part =
PartitionedSuffixArray::new(input.as_bytes(), partitions, divsufsort::sort);
let full_match = sa_full.longest_substring_match(needle.as_bytes());
let part_match = sa_part.longest_substring_match(needle.as_bytes());
assert_eq!(
full_match.as_bytes(),
part_match.as_bytes(),
"should find same match bytes for {:?}",
needle
);
assert_eq!(
full_match.start, part_match.start,
"should find same match start for {:?}",
needle
);
assert_eq!(
full_match.len, part_match.len,
"should find same match len for {:?}",
needle
);
}
}
}
}