1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
//! MATCH FUNCTIONS
use std::collections::HashMap;
use crate::DiffMatchPatch;
impl DiffMatchPatch {
/**
Locate the best instance of 'pattern' in 'text' near 'loc'.
Args:
text: The text to search.
pattern: The pattern to search for.
loc: The location to search around.
Returns:
Best match index or -1.
*/
pub fn match_main(&mut self, text: &[char], pattern: &[char], mut loc: usize) -> Option<usize> {
loc = usize::min(loc, text.len());
if text == pattern {
// Shortcut (potentially not guaranteed by the algorithm)
return Some(0);
} else if text.is_empty() {
// Nothing to match.
return None;
} else if loc + pattern.len() <= text.len()
&& text[(loc)..(loc + pattern.len())].to_vec() == pattern
{
// Perfect match at the perfect spot! (Includes case of null pattern)
return Some(loc);
}
self.match_bitap(text, pattern, loc)
}
/**
Locate the best instance of 'pattern' in 'text' near 'loc' using the
Bitap algorithm.
Args:
text: The text to search.
pattern: The pattern to search for.
loc: The location to search around.
Returns:
Best match index or -1.
*/
pub fn match_bitap(&mut self, text: &[char], patern: &[char], loc: usize) -> Option<usize> {
// check for maxbits limit.
if !(self.match_maxbits == 0 || patern.len() <= self.match_maxbits) {
panic!("patern too long for this application");
}
// Initialise the alphabet.
let s: HashMap<char, i32> = self.match_alphabet(patern);
// Highest score beyond which we give up.
let mut score_threshold: f32 = self.match_threshold;
// Is there a nearby exact match? (speedup)
let best_loc = self.kmp(text, patern, loc);
if let Some(best_loc) = best_loc {
score_threshold = f32::min(
self.match_bitap_score(0, best_loc as i32, loc as i32, patern),
score_threshold,
);
// What about in the other direction? (speedup)
if let Some(best_loc) = self.rkmp(text, patern, loc + patern.len()) {
score_threshold = f32::min(
score_threshold,
self.match_bitap_score(0, best_loc as i32, loc as i32, patern),
);
}
}
// Initialise the bit arrays.
let matchmask = 1 << (patern.len() - 1); //>
let mut best_loc: i32 = -1;
let mut bin_min: i32;
let mut bin_mid: i32;
let mut bin_max: i32 = (patern.len() + text.len()) as i32;
// Empty initialization added to appease pychecker.
let mut last_rd: Vec<i32> = vec![];
for d in 0..patern.len() {
/*
Scan for the best match each iteration allows for one more error.
Run a binary search to determine how far from 'loc' we can stray at
this error level.
*/
let mut rd: Vec<i32> = vec![];
bin_min = 0;
bin_mid = bin_max;
// Use the result from this iteration as the maximum for the next.
while bin_min < bin_mid {
if self.match_bitap_score(d as i32, (loc as i32) + bin_mid, loc as i32, patern)
<= score_threshold
{
bin_min = bin_mid;
} else {
bin_max = bin_mid;
}
bin_mid = bin_min + (bin_max - bin_min) / 2;
}
bin_max = bin_mid;
let mut start = i32::max(1, loc as i32 - bin_mid + 1);
let finish = i32::min(loc as i32 + bin_mid, text.len() as i32) + patern.len() as i32;
rd.resize((finish + 2) as usize, 0);
rd[(finish + 1) as usize] = (1 << d) - 1; //>
let mut j = finish;
while j >= start {
let char_match: i32;
if text.len() < j as usize {
// Out of range.
char_match = 0;
} else {
// Subsequent passes: fuzzy match.
match s.get(&(text[j as usize - 1])) {
Some(num) => {
char_match = *num;
}
None => {
char_match = 0;
}
}
}
if d == 0 {
// First pass: exact match.
rd[j as usize] = ((rd[j as usize + 1] << 1) | 1) & char_match;
//>>
} else {
rd[j as usize] = (((rd[j as usize + 1] << 1) | 1) & char_match)
| (((last_rd[j as usize + 1] | last_rd[j as usize]) << 1) | 1)
| last_rd[j as usize + 1]; //>>>>
}
if (rd[j as usize] & matchmask) != 0 {
let score: f32 = self.match_bitap_score(d as i32, j - 1, loc as i32, patern);
// This match will almost certainly be better than any existing match.
// But check anyway.
if score <= score_threshold {
// Told you so.
score_threshold = score;
best_loc = j - 1;
if best_loc as usize > loc {
// When passing loc, don't exceed our current distance from loc.
start = i32::max(1, 2 * loc as i32 - best_loc);
} else {
// Already passed loc, downhill from here on in.
break;
}
}
}
j -= 1;
}
// No hope for a (better) match at greater error levels.
if self.match_bitap_score(d as i32 + 1, loc as i32, loc as i32, patern)
> score_threshold
{
break;
}
last_rd = rd;
}
if best_loc == -1 {
None
} else {
Some(best_loc as usize)
}
}
/**
Initialise the alphabet for the Bitap algorithm.
Args:
pattern: The text to encode.
Returns:
Hash of character locations.
*/
pub fn match_alphabet(&mut self, patern: &[char]) -> HashMap<char, i32> {
let mut s: HashMap<char, i32> = HashMap::new();
for patern_item in patern {
s.insert(*patern_item, 0);
}
for i in 0..patern.len() {
let ch: char = patern[i];
let mut temp: i32 = 0;
if let Some(num) = s.get(&ch) {
temp = num | (1 << (patern.len() - i - 1)); //>>
}
s.insert(ch, temp);
}
s
}
/**
Compute and return the score for a match with e errors and x location.
Accesses loc and pattern through being a closure.
Args:
e: Number of errors in match.
x: Location of match.
Returns:
Overall score for match (0.0 = good, 1.0 = bad).
*/
fn match_bitap_score(&mut self, e: i32, x: i32, loc: i32, patern: &[char]) -> f32 {
let accuracy: f32 = (e as f32) / (patern.len() as f32);
let proximity: i32 = (loc - x).abs();
if self.match_distance == 0 {
// Dodge divide by zero error.
if proximity == 0 {
return accuracy;
} else {
return 1.0;
}
}
accuracy + ((proximity as f32) / (self.match_distance as f32))
}
}