1use adler32::RollingAdler32;
2use bitvec::prelude::*;
3use std::collections::HashMap;
4
5#[derive(Copy, Clone, Debug, Ord, PartialOrd, Eq, PartialEq)]
6pub struct Match {
7 pub pattern_index: usize,
8 pub text_index: usize,
9 pub length: usize,
10}
11
12struct RkrGst<'a> {
13 pattern: &'a [u8],
14 text: &'a [u8],
15 pattern_mark: BitVec,
16 text_mark: BitVec,
17 matches: Vec<Match>,
18 result: Vec<Match>,
19}
20
21impl<'a> RkrGst<'a> {
22 fn scan_pattern(&mut self, search_length: usize) -> usize {
23 let mut map: HashMap<u32, Vec<usize>> = HashMap::new();
25 let mut i = 0;
26 while (i + search_length) <= self.text.len() {
27 for j in i..(i + search_length) {
29 if self.text_mark[j] {
30 i = j + 1;
31 break;
32 }
33 }
34 if i + search_length > self.text.len() {
35 break;
36 }
37
38 let mut hash = RollingAdler32::new();
40 for j in i..(i + search_length) {
41 hash.update(self.text[j]);
42 }
43
44 loop {
46 if self.text_mark[i + search_length - 1] {
47 break;
48 }
49 map.entry(hash.hash()).or_insert_with(Vec::new).push(i);
50 i += 1;
51 if i + search_length > self.text.len() {
52 break;
53 }
54 hash.remove(search_length, self.text[i - 1]);
55 hash.update(self.text[i + search_length - 1]);
56 }
57 }
58
59 self.matches.clear();
61 let mut max_match = 0;
62 i = 0;
63 while (i + search_length) <= self.pattern.len() {
64 for j in i..(i + search_length) {
66 if self.pattern_mark[j] {
67 i = j + 1;
68 break;
69 }
70 }
71 if i + search_length > self.pattern.len() {
72 break;
73 }
74
75 let mut hash = RollingAdler32::new();
77 for j in i..(i + search_length) {
78 hash.update(self.pattern[j]);
79 }
80
81 loop {
83 if self.pattern_mark[i + search_length - 1] {
84 break;
85 }
86 if map.contains_key(&hash.hash()) {
87 for text_index in &map[&hash.hash()] {
90 let pattern_index = i;
91 let mut k = 0;
92 while *text_index + k < self.text.len()
93 && pattern_index + k < self.pattern.len()
94 && self.text[text_index + k] == self.pattern[pattern_index + k]
95 && !self.text_mark[text_index + k]
96 && !self.pattern_mark[pattern_index + k]
97 {
98 k += 1;
99 }
100
101 if k > 2 * search_length {
102 return k;
103 }
104
105 if k >= search_length {
106 self.matches.push(Match {
107 pattern_index,
108 text_index: *text_index,
109 length: k,
110 });
111 max_match = std::cmp::max(max_match, k);
112 }
113 }
114 }
115
116 i += 1;
117 if i + search_length > self.pattern.len() {
118 break;
119 }
120 hash.remove(search_length, self.pattern[i - 1]);
121 hash.update(self.pattern[i + search_length - 1]);
122 }
123 }
124
125 max_match
126 }
127
128 fn mark_strings(&mut self) {
129 self.matches.sort_by(|a, b| b.length.cmp(&a.length));
131 for m in &self.matches {
132 let mut unmarked = true;
133 for i in 0..m.length {
134 if self.text_mark[m.text_index + i] || self.pattern_mark[m.pattern_index + i] {
135 unmarked = false;
136 break;
137 }
138 }
139
140 if unmarked {
141 self.result.push(*m);
142 for i in 0..m.length {
143 self.text_mark.set(m.text_index + i, true);
144 self.pattern_mark.set(m.pattern_index + i, true);
145 }
146 }
147 }
148 self.matches.clear();
149 }
150}
151
152pub fn run(
153 pattern: &[u8],
154 text: &[u8],
155 initial_search_length: usize,
156 minimum_match_length: usize,
157) -> Vec<Match> {
158 let mut s = initial_search_length;
159 let mut params = RkrGst {
160 pattern,
161 text,
162 pattern_mark: bitvec![0; pattern.len()],
163 text_mark: bitvec![0; text.len()],
164 matches: vec![],
165 result: vec![],
166 };
167 loop {
168 let lmax = params.scan_pattern(s);
170 if lmax > 2 * s {
172 s = lmax;
174 } else {
175 params.mark_strings();
177 if s > 2 * minimum_match_length {
179 s /= 2;
181 } else if s > minimum_match_length {
182 s = minimum_match_length;
185 } else {
186 break;
188 }
189 }
190 }
191
192 params.result
193}
194
195#[cfg(test)]
196mod tests {
197 use super::*;
198
199 #[test]
200 fn simple_match() {
201 assert_eq!(
202 run("lower".as_bytes(), "yellow".as_bytes(), 3, 2),
203 vec![Match {
204 pattern_index: 0,
205 text_index: 3,
206 length: 3
207 }]
208 );
209 }
210
211 #[test]
212 fn duplicate_match() {
213 assert_eq!(
214 run("lowerlow".as_bytes(), "yellow lowlow".as_bytes(), 3, 2),
215 vec![
216 Match {
217 pattern_index: 0,
218 text_index: 3,
219 length: 3
220 },
221 Match {
222 pattern_index: 5,
223 text_index: 7,
224 length: 3
225 }
226 ]
227 );
228 }
229}