minidex/search/
scoring.rs

1use crate::Kind;
2
3/// Scoring configuration
4#[derive(Debug)]
5pub struct ScoringConfig {
6    /// Weights available for tweaking scoring
7    pub weights: Option<ScoringWeights>,
8    /// Scoring function
9    pub scoring_fn: fn(&ScoringWeights, &ScoringInputs) -> f64,
10}
11
12impl Default for ScoringConfig {
13    fn default() -> Self {
14        Self {
15            weights: Default::default(),
16            scoring_fn: compute_score,
17        }
18    }
19}
20
/// Configurable weights for search result scoring.
///
/// The per-field descriptions below state how the built-in `compute_score`
/// function uses each weight. For every query token, at most one of
/// `exact_filename_match`, `exact_stem_match`, `filename_prefix_match`,
/// `filename_match`, `path_prefix_match` and `midword_penalty` is applied
/// (the first one in that order whose condition holds).
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct ScoringWeights {
    /// Multiplied by the coverage ratio: distinct match positions divided by the
    /// number of alphanumeric words in the path (word count capped at 8, ratio
    /// capped at 1).
    pub token_coverage: f64,
    /// Added when some match of the token is a whole word, i.e. delimited by
    /// non-alphanumeric characters (or the ends of the path) on both sides.
    pub exact_match: f64,
    /// Added when the token is the entire final path component (file or
    /// directory name).
    pub exact_filename_match: f64,
    /// Added when the token starts the final path component and is immediately
    /// followed by a `.`.
    pub exact_stem_match: f64,
    /// Added when the token begins on a word boundary inside the final path
    /// component, but not at its first character.
    pub filename_match: f64,
    /// Added when the final path component starts with the token and continues
    /// with something other than a `.`.
    pub filename_prefix_match: f64,
    /// Added when the token begins on a word boundary in a component that
    /// precedes the final one.
    pub path_prefix_match: f64,
    /// Subtracted when the token occurs in the path but never on a word boundary.
    pub midword_penalty: f64,
    /// Recency contribution for an entry whose age is zero. Older entries receive
    /// `recency_boost - recency_decay * ln(1 + age_in_days)`, which can become
    /// negative for sufficiently old entries.
    pub recency_boost: f64,
    /// Multiplier on `ln(1 + age_in_days)` that is subtracted from `recency_boost`.
    pub recency_decay: f64,
    /// Added for files. Symbolic links receive half of this value.
    pub kind_file_boost: f64,
    /// Added for directories.
    pub kind_dir_boost: f64,
    /// Multiplied by the match density for multi-token queries: total length of
    /// the matched tokens over the span from the first match start to the last
    /// match end (capped at 1).
    pub proximity_bonus: f64,
    /// Added for multi-token queries when the raw query tokens occur in the path
    /// in the same order as in the query.
    pub ordering_bonus: f64,
}

impl Default for ScoringWeights {
    /// Returns the stock weight set.
    fn default() -> Self {
        Self {
            token_coverage: 30.0,
            exact_match: 10.0,
            exact_filename_match: 100.0,
            exact_stem_match: 70.0,
            filename_match: 15.0,
            filename_prefix_match: 50.0,
            path_prefix_match: 20.0,
            midword_penalty: 30.0,
            recency_boost: 10.0,
            recency_decay: 2.0,
            kind_file_boost: 2.0,
            kind_dir_boost: 2.0,
            proximity_bonus: 20.0,
            ordering_bonus: 15.0,
        }
    }
}
74
/// Per-candidate data handed to a scoring function: the path being ranked, the
/// query it is ranked against, and the metadata used for recency/kind boosts.
#[derive(Debug)]
pub struct ScoringInputs<'a> {
    /// Candidate path being scored.
    pub path: &'a str,
    /// Query tokens, matched as substrings of the normalized (lowercased/folded)
    /// path by the built-in scorer.
    // NOTE(review): the built-in scorer normalizes the path but not these tokens,
    // so they presumably arrive already normalized — confirm against the caller.
    pub query_tokens: &'a [String],
    /// Query tokens used by the built-in scorer for the in-order check; they are
    /// searched for in the normalized path one after another.
    // NOTE(review): "raw" suggests un-normalized user input, yet the search is
    // case-sensitive against a lowercased path — confirm what callers pass.
    pub raw_query_tokens: &'a [&'a str],
    /// Last-modified timestamp, in the same unit as `now_micros` (the built-in
    /// scorer subtracts the two and converts microseconds to days).
    // NOTE(review): the epoch is not visible in this file — confirm.
    pub last_modified: u64,
    /// Last-accessed timestamp, same unit as `last_modified`. The built-in scorer
    /// uses the later of the two timestamps.
    pub last_accessed: u64,
    /// Kind of file-system entry (file, directory, symlink).
    pub kind: Kind,
    /// Reference "now" used to compute the entry's age, in microseconds.
    pub now_micros: f64,
}
85
86pub(crate) fn compute_score(weights: &ScoringWeights, inputs: &ScoringInputs) -> f64 {
87    let normalized = if inputs.path.is_ascii() {
88        inputs.path.to_lowercase()
89    } else {
90        crate::tokenizer::fold_path(inputs.path)
91    };
92
93    let trimmed_path = normalized.trim_end_matches(std::path::MAIN_SEPARATOR);
94
95    let file_name_start_idx = trimmed_path
96        .rfind(std::path::MAIN_SEPARATOR)
97        .map(|i| i + 1)
98        .unwrap_or(0);
99    let mut score = 0.0;
100
101    let mut unique_matched_indices = Vec::new();
102
103    // Mutually exclusive bonuses
104    for token in inputs.query_tokens {
105        let t_str = token.as_str();
106
107        let mut is_exact_filename = false;
108        let mut is_exact_stem = false;
109        let mut is_filename_start = false;
110        let mut is_in_filename = false;
111        let mut is_in_path = false;
112        let mut is_exact_word = false;
113        let mut has_any_match = false;
114
115        for (idx, _) in normalized.match_indices(t_str) {
116            has_any_match = true;
117            unique_matched_indices.push(idx); // Track for coverage
118
119            let start_boundary =
120                idx == 0 || !normalized[..idx].chars().last().unwrap().is_alphanumeric();
121            let end_boundary = idx + t_str.len() == normalized.len()
122                || !normalized[idx + t_str.len()..]
123                    .chars()
124                    .next()
125                    .unwrap()
126                    .is_alphanumeric();
127
128            if start_boundary {
129                if idx == file_name_start_idx {
130                    let end_idx = idx + t_str.len();
131                    if end_idx <= trimmed_path.len() {
132                        let remainder = &trimmed_path[end_idx..];
133                        if remainder.is_empty() {
134                            is_exact_filename = true;
135                        } else if remainder.starts_with('.') {
136                            is_exact_stem = true;
137                        } else {
138                            is_filename_start = true;
139                        }
140                    }
141                } else if idx >= file_name_start_idx {
142                    is_in_filename = true;
143                } else {
144                    is_in_path = true;
145                }
146
147                if end_boundary {
148                    is_exact_word = true;
149                }
150            }
151        }
152
153        if is_exact_filename {
154            score += weights.exact_filename_match;
155        } else if is_exact_stem {
156            score += weights.exact_stem_match;
157        } else if is_filename_start {
158            score += weights.filename_prefix_match;
159        } else if is_in_filename {
160            score += weights.filename_match;
161        } else if is_in_path {
162            score += weights.path_prefix_match;
163        } else if has_any_match {
164            score -= weights.midword_penalty;
165        }
166
167        if is_exact_word {
168            score += weights.exact_match;
169        }
170
171        if normalized.ends_with(&format!(".{}", t_str)) {
172            score -= 30.0;
173        }
174    }
175
176    // Token coverage
177    unique_matched_indices.sort_unstable();
178    unique_matched_indices.dedup();
179
180    let path_word_count = trimmed_path
181        .split(|c: char| !c.is_alphanumeric())
182        .filter(|s| !s.is_empty())
183        .count();
184
185    if path_word_count > 0 {
186        let effective_length = (path_word_count as f64).min(8.0);
187        let coverage_ratio = (unique_matched_indices.len() as f64 / effective_length).min(1.0);
188        score += weights.token_coverage * coverage_ratio;
189    }
190
191    // Recency and kind boosting
192    let recent_date = inputs.last_modified.max(inputs.last_accessed);
193    let age_days = (inputs.now_micros - recent_date as f64) / (1_000_000.0 * 86_400.0);
194    score += weights.recency_boost - weights.recency_decay * (1.0 + age_days.max(0.0)).ln();
195
196    score += match inputs.kind {
197        Kind::Directory => weights.kind_dir_boost,
198        Kind::File => weights.kind_file_boost,
199        Kind::Symlink => weights.kind_file_boost * 0.5,
200    };
201
202    // Proximity and ordering
203    if inputs.query_tokens.len() > 1 {
204        let mut min_pos = usize::MAX;
205        let mut max_pos = 0;
206        let mut total_token_len = 0;
207        let mut matched_count = 0;
208
209        for q in inputs.query_tokens {
210            if let Some(pos) = normalized.find(q.as_str()) {
211                min_pos = min_pos.min(pos);
212                max_pos = max_pos.max(pos + q.len());
213                total_token_len += q.len();
214                matched_count += 1;
215            }
216        }
217
218        if matched_count > 1 && max_pos > min_pos {
219            let span = max_pos - min_pos;
220            let density = (total_token_len as f64 / span as f64).min(1.0);
221            score += weights.proximity_bonus * density;
222        }
223    }
224
225    if inputs.raw_query_tokens.len() > 1 {
226        let mut last_pos = 0;
227        let mut is_ordered = true;
228
229        for raw_token in inputs.raw_query_tokens {
230            if let Some(pos) = normalized[last_pos..].find(raw_token) {
231                last_pos += pos + raw_token.len();
232            } else {
233                is_ordered = false;
234                break;
235            }
236        }
237
238        if is_ordered {
239            score += weights.ordering_bonus;
240        }
241    }
242
243    score
244}
245
#[cfg(test)]
mod tests {
    use super::*;

    /// Timestamp used for both "touched" and "now" when recency is irrelevant.
    const T0: u64 = 1_000_000;
    /// `T0` as the floating-point "now" value.
    const NOW: f64 = 1_000_000.0;

    /// Scores `path` as a regular file with the default weights.
    ///
    /// `tokens` serves as both the normalized and the raw query tokens, and
    /// `touched` is used for both the modified and accessed timestamps.
    fn score_file(path: &str, tokens: &[&str], touched: u64, now: f64) -> f64 {
        let weights = ScoringWeights::default();
        let query_tokens: Vec<String> = tokens.iter().map(|t| t.to_string()).collect();

        compute_score(
            &weights,
            &ScoringInputs {
                path,
                query_tokens: &query_tokens,
                raw_query_tokens: tokens,
                last_modified: touched,
                last_accessed: touched,
                kind: Kind::File,
                now_micros: now,
            },
        )
    }

    #[test]
    fn test_compute_score_basic() {
        let matching = score_file("abc.txt", &["abc"], T0, NOW);
        let unrelated = score_file("other.txt", &["abc"], T0, NOW);

        assert!(matching > unrelated);
    }

    #[test]
    fn test_compute_score_filename_boost() {
        let sep = std::path::MAIN_SEPARATOR_STR;

        // "abc" names a parent directory in the first path and the file itself
        // in the second.
        let in_directory = score_file(
            &format!("{}foo{}abc{}file.txt", sep, sep, sep),
            &["abc"],
            T0,
            NOW,
        );
        let in_file_name = score_file(
            &format!("{}foo{}bar{}abc.txt", sep, sep, sep),
            &["abc"],
            T0,
            NOW,
        );

        // Matching the file name "abc.txt" must outrank matching a directory.
        assert!(in_file_name > in_directory);
    }

    #[test]
    fn test_compute_score_depth_penalty() {
        let sep = std::path::MAIN_SEPARATOR;
        let shallow_path = format!("{}abc.txt", sep);
        let deep_path = format!("{}foo{}bar{}baz{}abc.txt", sep, sep, sep, sep);

        let shallow = score_file(&shallow_path, &["abc"], T0, NOW);
        let deep = score_file(&deep_path, &["abc"], T0, NOW);

        // The shallower result should rank higher.
        assert!(shallow > deep);
    }

    #[test]
    fn test_compute_score_recency() {
        // A large "now" so that the two timestamps are days apart.
        let now = 2_000_000_000_000.0;

        let score_recent = score_file("abc.txt", &["abc"], 1_900_000_000_000, now);
        let score_old = score_file("abc.txt", &["abc"], 1_000_000_000_000, now);

        assert!(score_recent > score_old);
    }

    #[test]
    fn test_compute_score_ordering() {
        let tokens = ["foo", "bar"];

        let score_ordered = score_file("foo_bar.txt", &tokens, T0, NOW);
        let score_unordered = score_file("bar_foo.txt", &tokens, T0, NOW);

        assert!(score_ordered > score_unordered);
    }

    #[test]
    #[cfg(windows)]
    fn test_compute_score_windows_paths() {
        let tokens = ["report"];

        // Drive-letter path.
        let drive = score_file("C:\\Users\\joao\\report.pdf", &tokens, T0, NOW);
        // Verbatim (`\\?\`) drive path.
        let verbatim = score_file("\\\\?\\D:\\Backup\\report.pdf", &tokens, T0, NOW);
        // UNC server-share path.
        let unc = score_file("\\\\server\\share\\finance\\report.pdf", &tokens, T0, NOW);

        // Every form should receive the boost for "report" being the file name.
        assert!(drive > 50.0);
        assert!(verbatim > 50.0);
        assert!(unc > 50.0);
    }
}
minidex/search/scoring.rs

minidex/search/
scoring.rs