rkr_gst/
lib.rs

1use adler32::RollingAdler32;
2use bitvec::prelude::*;
3use std::collections::HashMap;
4
5#[derive(Copy, Clone, Debug, Ord, PartialOrd, Eq, PartialEq)]
6pub struct Match {
7    pub pattern_index: usize,
8    pub text_index: usize,
9    pub length: usize,
10}
11
12struct RkrGst<'a> {
13    pattern: &'a [u8],
14    text: &'a [u8],
15    pattern_mark: BitVec,
16    text_mark: BitVec,
17    matches: Vec<Match>,
18    result: Vec<Match>,
19}
20
21impl<'a> RkrGst<'a> {
22    fn scan_pattern(&mut self, search_length: usize) -> usize {
23        // map text hashes => text index
24        let mut map: HashMap<u32, Vec<usize>> = HashMap::new();
25        let mut i = 0;
26        while (i + search_length) <= self.text.len() {
27            // jump to first unmarked token
28            for j in i..(i + search_length) {
29                if self.text_mark[j] {
30                    i = j + 1;
31                    break;
32                }
33            }
34            if i + search_length > self.text.len() {
35                break;
36            }
37
38            // text[i..i+search_length] is unmarked
39            let mut hash = RollingAdler32::new();
40            for j in i..(i + search_length) {
41                hash.update(self.text[j]);
42            }
43
44            // advance until next marked
45            loop {
46                if self.text_mark[i + search_length - 1] {
47                    break;
48                }
49                map.entry(hash.hash()).or_insert_with(Vec::new).push(i);
50                i += 1;
51                if i + search_length > self.text.len() {
52                    break;
53                }
54                hash.remove(search_length, self.text[i - 1]);
55                hash.update(self.text[i + search_length - 1]);
56            }
57        }
58
59        // search patterns
60        self.matches.clear();
61        let mut max_match = 0;
62        i = 0;
63        while (i + search_length) <= self.pattern.len() {
64            // jump to first unmarked token
65            for j in i..(i + search_length) {
66                if self.pattern_mark[j] {
67                    i = j + 1;
68                    break;
69                }
70            }
71            if i + search_length > self.pattern.len() {
72                break;
73            }
74
75            // pattern[i..i+search_length] is unmarked
76            let mut hash = RollingAdler32::new();
77            for j in i..(i + search_length) {
78                hash.update(self.pattern[j]);
79            }
80
81            // advance until next marked
82            loop {
83                if self.pattern_mark[i + search_length - 1] {
84                    break;
85                }
86                if map.contains_key(&hash.hash()) {
87                    // found a match, check that it really matches
88                    // and try to extend
89                    for text_index in &map[&hash.hash()] {
90                        let pattern_index = i;
91                        let mut k = 0;
92                        while *text_index + k < self.text.len()
93                            && pattern_index + k < self.pattern.len()
94                            && self.text[text_index + k] == self.pattern[pattern_index + k]
95                            && !self.text_mark[text_index + k]
96                            && !self.pattern_mark[pattern_index + k]
97                        {
98                            k += 1;
99                        }
100
101                        if k > 2 * search_length {
102                            return k;
103                        }
104
105                        if k >= search_length {
106                            self.matches.push(Match {
107                                pattern_index,
108                                text_index: *text_index,
109                                length: k,
110                            });
111                            max_match = std::cmp::max(max_match, k);
112                        }
113                    }
114                }
115
116                i += 1;
117                if i + search_length > self.pattern.len() {
118                    break;
119                }
120                hash.remove(search_length, self.pattern[i - 1]);
121                hash.update(self.pattern[i + search_length - 1]);
122            }
123        }
124
125        max_match
126    }
127
128    fn mark_strings(&mut self) {
129        // sort by length, desc
130        self.matches.sort_by(|a, b| b.length.cmp(&a.length));
131        for m in &self.matches {
132            let mut unmarked = true;
133            for i in 0..m.length {
134                if self.text_mark[m.text_index + i] || self.pattern_mark[m.pattern_index + i] {
135                    unmarked = false;
136                    break;
137                }
138            }
139
140            if unmarked {
141                self.result.push(*m);
142                for i in 0..m.length {
143                    self.text_mark.set(m.text_index + i, true);
144                    self.pattern_mark.set(m.pattern_index + i, true);
145                }
146            }
147        }
148        self.matches.clear();
149    }
150}
151
152pub fn run(
153    pattern: &[u8],
154    text: &[u8],
155    initial_search_length: usize,
156    minimum_match_length: usize,
157) -> Vec<Match> {
158    let mut s = initial_search_length;
159    let mut params = RkrGst {
160        pattern,
161        text,
162        pattern_mark: bitvec![0; pattern.len()],
163        text_mark: bitvec![0; text.len()],
164        matches: vec![],
165        result: vec![],
166    };
167    loop {
168        // Lmax := scanpatterns(s)
169        let lmax = params.scan_pattern(s);
170        // if Lmax > 2 x s
171        if lmax > 2 * s {
172            // then s := Lmax
173            s = lmax;
174        } else {
175            // markarrays(s)
176            params.mark_strings();
177            // if s > 2 x minimum_match_length
178            if s > 2 * minimum_match_length {
179                // s := s div 2
180                s /= 2;
181            } else if s > minimum_match_length {
182                // else if s > minimum_match_length
183                // s := minimum_match_length
184                s = minimum_match_length;
185            } else {
186                // stop := true
187                break;
188            }
189        }
190    }
191
192    params.result
193}
194
195#[cfg(test)]
196mod tests {
197    use super::*;
198
199    #[test]
200    fn simple_match() {
201        assert_eq!(
202            run("lower".as_bytes(), "yellow".as_bytes(), 3, 2),
203            vec![Match {
204                pattern_index: 0,
205                text_index: 3,
206                length: 3
207            }]
208        );
209    }
210
211    #[test]
212    fn duplicate_match() {
213        assert_eq!(
214            run("lowerlow".as_bytes(), "yellow lowlow".as_bytes(), 3, 2),
215            vec![
216                Match {
217                    pattern_index: 0,
218                    text_index: 3,
219                    length: 3
220                },
221                Match {
222                    pattern_index: 5,
223                    text_index: 7,
224                    length: 3
225                }
226            ]
227        );
228    }
229}