flx_rs/
search.rs

1/**
2 * $File: search.rs $
3 * $Date: 2021-10-27 20:23:18 $
4 * $Revision: $
5 * $Creator: Jen-Chieh Shen $
6 * $Notice: See LICENSE.txt for modification and distribution information
7 *                   Copyright © 2021 by Shen, Jen-Chieh $
8 */
9use std::cmp::min;
10use std::collections::{HashMap, VecDeque};
11
12/// List of characters that act as word separators in flx.
13pub const WORD_SEPARATORS: [u32; 7] = [
14    ' ' as u32,
15    '-' as u32,
16    '_' as u32,
17    ':' as u32,
18    '.' as u32,
19    '/' as u32,
20    '\\' as u32,
21];
22
23/// Magic number for default +/- score.
24const DEFAULT_SCORE: i32 = -35;
25
26/// Check if char is a word character.
27///
28///  # Arguments
29///
30/// * `char` - Character we use to check for word.
31fn word(char: Option<u32>) -> bool {
32    if char.is_none() {
33        return false;
34    }
35    let ch: u32 = char.unwrap();
36    return !WORD_SEPARATORS.contains(&ch);
37}
38
39/// The `flx` compatible uppercase checker.
40///
41/// # Arguments
42///
43/// * `char` - The target character to check.
44fn is_uppercase(char: &Option<char>) -> bool {
45    if char.is_none() {
46        return false;
47    }
48    let ch_uw = char.unwrap();
49    return ch_uw == ch_uw.to_uppercase().next().unwrap();
50}
51
52/// Check if CHAR is an uppercase character."
53///
54///  # Arguments
55///
56/// * `char` - Character we use to check for capitalization.
57fn capital(char: Option<u32>) -> bool {
58    if char.is_none() {
59        return false;
60    }
61    let ch: Option<char> = char::from_u32(char.unwrap());
62    return word(char) && is_uppercase(&ch);
63}
64
65/// Check if LAST-CHAR is the end of a word and CHAR the start of the next.
66///
67/// This function is camel-case aware.
68fn boundary(last_char: Option<u32>, char: Option<u32>) -> bool {
69    if last_char.is_none() {
70        return true;
71    }
72    if !capital(last_char) && capital(char) {
73        return true;
74    }
75    if !word(last_char) && word(char) {
76        return true;
77    }
78    return false;
79}
80
81/// Increment each element in VEC between BEG and END by INC.
82fn inc_vec(vec: &mut Vec<i32>, inc: Option<i32>, beg: Option<i32>, end: Option<i32>) {
83    let _inc = inc.unwrap_or(1);
84    let mut _beg = beg.unwrap_or(0);
85    let _end = end.unwrap_or(vec.len() as i32);
86    while _beg < _end {
87        vec[_beg as usize] += _inc;
88        _beg += 1;
89    }
90}
91
92/// Return hash-table for string where keys are characters.
93/// Value is a sorted list of indexes for character occurrences.
94fn get_hash_for_string(result: &mut HashMap<Option<u32>, VecDeque<Option<u32>>>, str: &str) {
95    result.clear();
96    let str_len: i32 = str.chars().count() as i32;
97    let mut index: i32 = str_len - 1;
98    let mut char: Option<u32>;
99    let mut down_char: Option<u32>;
100
101    while 0 <= index {
102        char = Some(str.chars().nth(index as usize).unwrap() as u32);
103
104        if capital(char) {
105            result
106                .entry(char)
107                .or_insert_with(VecDeque::new)
108                .push_front(Some(index as u32));
109
110            let valid: Option<char> = char::from_u32(char.unwrap());
111            down_char = Some(valid.unwrap().to_lowercase().next().unwrap() as u32);
112        } else {
113            down_char = char;
114        }
115
116        result
117            .entry(down_char)
118            .or_insert_with(VecDeque::new)
119            .push_front(Some(index as u32));
120
121        index -= 1;
122    }
123}
124
125/// Generate the heatmap vector of string.
126///
127/// See documentation for logic.
128pub fn get_heatmap_str(scores: &mut Vec<i32>, str: &str, group_separator: Option<char>) {
129    let str_len: usize = str.chars().count();
130    let str_last_index: usize = str_len - 1;
131    scores.clear();
132    for _n in 0..str_len {
133        scores.push(DEFAULT_SCORE);
134    }
135    let penalty_lead: u32 = '.' as u32;
136    let mut group_alist: Vec<Vec<i32>> = vec![vec![-1, 0]];
137
138    // final char bonus
139    scores[str_last_index] += 1;
140
141    // Establish baseline mapping
142    let mut last_char: Option<u32> = None;
143    let mut group_word_count: i32 = 0;
144    let mut index1: usize = 0;
145
146    for char in str.chars() {
147        // before we find any words, all separaters are
148        // considered words of length 1.  This is so "foo/__ab"
149        // gets penalized compared to "foo/ab".
150        let effective_last_char: Option<u32> = if group_word_count == 0 {
151            None
152        } else {
153            last_char
154        };
155
156        if boundary(effective_last_char, Some(char as u32)) {
157            group_alist[0].insert(2, index1 as i32);
158        }
159
160        if !word(last_char) && word(Some(char as u32)) {
161            group_word_count += 1;
162        }
163
164        // ++++ -45 penalize extension
165        if last_char != None && last_char.unwrap() == penalty_lead {
166            scores[index1] += -45;
167        }
168
169        if group_separator != None && group_separator.unwrap() == char {
170            group_alist[0][1] = group_word_count;
171            group_word_count = 0;
172            group_alist.insert(0, vec![index1 as i32, group_word_count]);
173        }
174
175        if index1 == str_last_index {
176            group_alist[0][1] = group_word_count;
177        } else {
178            last_char = Some(char as u32);
179        }
180
181        index1 += 1;
182    }
183
184    let group_count: i32 = group_alist.len() as i32;
185    let separator_count: i32 = group_count - 1;
186
187    // ++++ slash group-count penalty
188    if separator_count != 0 {
189        inc_vec(scores, Some(group_count * -2), None, None);
190    }
191
192    let mut index2: i32 = separator_count;
193    let mut last_group_limit: Option<i32> = None;
194    let mut basepath_found: bool = false;
195
196    // score each group further
197    for group in group_alist {
198        let group_start: i32 = group[0];
199        let word_count: i32 = group[1];
200        // this is the number of effective word groups
201        let words_length: usize = group.len() - 2;
202        let mut basepath_p: bool = false;
203
204        if words_length != 0 && !basepath_found {
205            basepath_found = true;
206            basepath_p = true;
207        }
208
209        let num: i32;
210        if basepath_p {
211            // ++++ basepath separator-count boosts
212            let mut boosts: i32 = 0;
213            if separator_count > 1 {
214                boosts = separator_count - 1;
215            }
216            // ++++ basepath word count penalty
217            let penalty: i32 = -word_count;
218            num = 35 + boosts + penalty;
219        }
220        // ++++ non-basepath penalties
221        else {
222            if index2 == 0 {
223                num = -3;
224            } else {
225                num = -5 + ((index2 as i32) - 1);
226            }
227        }
228
229        inc_vec(scores, Some(num), Some(group_start + 1), last_group_limit);
230
231        let mut cddr_group: Vec<i32> = group.clone();
232        cddr_group.remove(0);
233        cddr_group.remove(0);
234        let mut word_index: i32 = (words_length - 1) as i32;
235        let mut last_word: i32 = if last_group_limit != None {
236            last_group_limit.unwrap()
237        } else {
238            str_len as i32
239        };
240
241        for word in cddr_group {
242            // ++++  beg word bonus AND
243            scores[word as usize] += 85;
244
245            let mut index3: i32 = word;
246            let mut char_i: i32 = 0;
247            while index3 < last_word {
248                scores[index3 as usize] += (-3 * word_index) -  // ++++ word order penalty
249                    char_i; // ++++ char order penalty
250                char_i += 1;
251                index3 += 1;
252            }
253
254            last_word = word;
255            word_index -= 1;
256        }
257
258        last_group_limit = Some(group_start + 1);
259        index2 -= 1;
260    }
261}
262
263/// Return sublist bigger than VAL from sorted SORTED-LIST.
264///
265/// If VAL is nil, return entire list.
266fn bigger_sublist(
267    result: &mut VecDeque<Option<u32>>,
268    sorted_list: Option<&VecDeque<Option<u32>>>,
269    val: Option<u32>,
270) {
271    if sorted_list == None {
272        return;
273    }
274    let sl: &VecDeque<Option<u32>> = sorted_list.unwrap();
275    if val != None {
276        let v: u32 = val.unwrap();
277        for sub in sl {
278            if sub.unwrap() > v {
279                result.push_back(Some(sub.unwrap()));
280            }
281        }
282    } else {
283        for sub in sl {
284            result.push_back(Some(sub.unwrap()));
285        }
286    }
287}
288
289#[derive(Debug, Clone)]
290pub struct Result {
291    pub indices: Vec<i32>,
292    pub score: i32,
293    pub tail: i32,
294}
295
296impl Result {
297    pub fn new(indices: Vec<i32>, score: i32, tail: i32) -> Result {
298        Result {
299            indices,
300            score,
301            tail,
302        }
303    }
304}
305
306/// Recursively compute the best match for a string, passed as STR-INFO and
307/// HEATMAP, according to QUERY.
308pub fn find_best_match(
309    imatch: &mut Vec<Result>,
310    str_info: HashMap<Option<u32>, VecDeque<Option<u32>>>,
311    heatmap: Vec<i32>,
312    greater_than: Option<u32>,
313    query: &str,
314    query_length: i32,
315    q_index: i32,
316    match_cache: &mut HashMap<u32, Vec<Result>>,
317) {
318    let greater_num: u32 = if greater_than != None {
319        greater_than.unwrap()
320    } else {
321        0
322    };
323    let hash_key: u32 = q_index as u32 + (greater_num * query_length as u32);
324    let hash_value: Option<&Vec<Result>> = match_cache.get(&hash_key);
325
326    if !hash_value.is_none() {
327        // Process match_cache here
328        imatch.clear();
329        for val in hash_value.unwrap() {
330            imatch.push(val.clone());
331        }
332    } else {
333        let uchar: Option<u32> = Some(query.chars().nth(q_index as usize).unwrap() as u32);
334        let sorted_list: Option<&VecDeque<Option<u32>>> = str_info.get(&uchar);
335        let mut indexes: VecDeque<Option<u32>> = VecDeque::new();
336        bigger_sublist(&mut indexes, sorted_list, greater_than);
337        let mut temp_score: i32;
338        let mut best_score: i32 = std::f32::NEG_INFINITY as i32;
339
340        if q_index >= query_length - 1 {
341            // At the tail end of the recursion, simply generate all possible
342            // matches with their scores and return the list to parent.
343            for index in indexes {
344                let mut indices: Vec<i32> = Vec::new();
345                let idx: i32 = index.unwrap() as i32;
346                indices.push(idx);
347                imatch.push(Result::new(indices, heatmap[idx as usize], 0));
348            }
349        } else {
350            for index in indexes {
351                let idx: i32 = index.unwrap() as i32;
352                let mut elem_group: Vec<Result> = Vec::new();
353                find_best_match(
354                    &mut elem_group,
355                    str_info.clone(),
356                    heatmap.clone(),
357                    Some(idx as u32),
358                    query,
359                    query_length,
360                    q_index + 1,
361                    match_cache,
362                );
363
364                for elem in elem_group {
365                    let caar: i32 = elem.indices[0];
366                    let cadr: i32 = elem.score;
367                    let cddr: i32 = elem.tail;
368
369                    if (caar - 1) == idx {
370                        temp_score = cadr + heatmap[idx as usize] +
371                            (min(cddr, 3) * 15) +  // boost contiguous matches
372                            60;
373                    } else {
374                        temp_score = cadr + heatmap[idx as usize];
375                    }
376
377                    // We only care about the optimal match, so only forward the match
378                    // with the best score to parent
379                    if temp_score > best_score {
380                        best_score = temp_score;
381
382                        imatch.clear();
383                        let mut indices: Vec<i32> = elem.indices.clone();
384                        indices.insert(0, idx);
385                        let mut tail: i32 = 0;
386                        if (caar - 1) == idx {
387                            tail = cddr + 1;
388                        }
389                        imatch.push(Result::new(indices, temp_score, tail));
390                    }
391                }
392            }
393        }
394
395        // Calls are cached to avoid exponential time complexity
396        match_cache.insert(hash_key, imatch.clone());
397    }
398}
399
400/// Return best score matching QUERY against STR.
401pub fn score(str: &str, query: &str) -> Option<Result> {
402    if str.is_empty() || query.is_empty() {
403        return None;
404    }
405    let mut str_info: HashMap<Option<u32>, VecDeque<Option<u32>>> = HashMap::new();
406    get_hash_for_string(&mut str_info, str);
407
408    let mut heatmap: Vec<i32> = Vec::new();
409    get_heatmap_str(&mut heatmap, str, None);
410
411    let query_length: i32 = query.chars().count() as i32;
412    let full_match_boost: bool = (1 < query_length) && (query_length < 5);
413    let mut match_cache: HashMap<u32, Vec<Result>> = HashMap::new();
414    let mut optimal_match: Vec<Result> = Vec::new();
415    find_best_match(
416        &mut optimal_match,
417        str_info,
418        heatmap,
419        None,
420        query,
421        query_length,
422        0,
423        &mut match_cache,
424    );
425
426    if optimal_match.is_empty() {
427        return None;
428    }
429
430    let mut result_1: Result = optimal_match[0].clone();
431    let caar: usize = result_1.indices.len();
432
433    if full_match_boost && caar == str.chars().count() {
434        result_1.score += 10000;
435    }
436
437    return Some(result_1);
438}