use std::path::{Path, PathBuf};
use crate::search::tokenizer;
use super::{Bm25Index, Document};
impl Bm25Index {
    /// Reads `file`, tokenizes its contents and its root-relative path, and
    /// stores the resulting [`Document`] in the index.
    ///
    /// The per-term counts come from `tokenizer::tokenize_code`; every token
    /// produced by `tokenizer::tokenize_path` for the relative path is added on
    /// top, so path components are searchable as well.
    ///
    /// Indexing is best-effort: if the file cannot be read as a UTF-8 string
    /// the call returns without modifying the index (a previously indexed
    /// version of the file, if any, is kept as-is).
    ///
    /// Calling this for a path that is already indexed replaces the old entry
    /// and keeps `total_tokens` / `doc_freq` consistent.
    pub(super) fn index_file(&mut self, file: &Path) {
        let Ok(content) = std::fs::read_to_string(file) else {
            return;
        };

        // `documents.insert` below replaces an existing entry without touching
        // the corpus statistics. Retire the old entry first, otherwise a
        // re-index would count the file twice in `total_tokens` and `doc_freq`.
        // This is a no-op when the file was not indexed before.
        self.remove_file(file);

        // Fall back to the path as given when it does not live under `root`.
        let rel_path = file
            .strip_prefix(&self.root)
            .unwrap_or(file)
            .to_string_lossy()
            .into_owned();

        let mut tf = tokenizer::tokenize_code(&content);
        for token in tokenizer::tokenize_path(&rel_path) {
            *tf.entry(token).or_default() += 1;
        }

        // Document length is the sum of all term counts, path tokens included.
        let token_count: u32 = tf.values().sum();
        self.total_tokens += u64::from(token_count);

        // Each distinct term contributes exactly one to its document frequency.
        for term in tf.keys() {
            *self.doc_freq.entry(term.clone()).or_default() += 1;
        }

        self.documents.insert(
            file.to_path_buf(),
            Document {
                rel_path,
                abs_path: file.to_path_buf(),
                tf,
                token_count,
            },
        );
    }

    /// Removes `file` from the index and rolls back its contribution to the
    /// corpus statistics.
    ///
    /// Does nothing when `file` is not currently indexed. Terms whose document
    /// frequency drops to zero are deleted from `doc_freq` entirely.
    pub(super) fn remove_file(&mut self, file: &Path) {
        let Some(old_doc) = self.documents.remove(file) else {
            return;
        };

        // Saturating arithmetic keeps the counters from underflowing even if
        // the statistics were already out of sync.
        self.total_tokens = self
            .total_tokens
            .saturating_sub(u64::from(old_doc.token_count));

        for term in old_doc.tf.keys() {
            if let Some(count) = self.doc_freq.get_mut(term) {
                *count = count.saturating_sub(1);
                if *count == 0 {
                    self.doc_freq.remove(term);
                }
            }
        }
    }

    /// Scores every indexed document against `query` with BM25
    /// (`k1 = 1.2`, `b = 0.5`) and returns the best matches.
    ///
    /// Returns at most `max_results` tuples of
    /// `(relative path, absolute path, score)`, ordered by descending score.
    /// Only documents with a strictly positive score are included. Documents
    /// with equal scores are ordered by relative path, then absolute path, so
    /// the output does not depend on the iteration order of the document map.
    ///
    /// An empty index, a query that yields no tokens, or `max_results == 0`
    /// produces an empty vector.
    pub(super) fn rank(&self, query: &str, max_results: usize) -> Vec<(String, PathBuf, f64)> {
        use crate::search::scoring::{Bm25Params, bm25_score};
        const PARAMS: Bm25Params = Bm25Params { k1: 1.2, b: 0.5 };

        if max_results == 0 || self.documents.is_empty() {
            return Vec::new();
        }
        let query_tokens = tokenizer::tokenize_query(query);
        if query_tokens.is_empty() {
            return Vec::new();
        }

        let avg_dl = self.avg_doc_len();
        let n = self.documents.len();

        let mut scores: Vec<(&Document, f64)> = self
            .documents
            .values()
            .filter_map(|doc| {
                let score = bm25_score(
                    &doc.tf,
                    doc.token_count,
                    &query_tokens,
                    &self.doc_freq,
                    n,
                    avg_dl,
                    &PARAMS,
                );
                // `score > 0.0` is false for NaN, so NaN scores are dropped here.
                (score > 0.0).then_some((doc, score))
            })
            .collect();

        // Highest score first. The path tie-breakers give a total order, which
        // makes the result deterministic and an unstable sort safe to use.
        scores.sort_unstable_by(|a, b| {
            b.1.total_cmp(&a.1)
                .then_with(|| a.0.rel_path.cmp(&b.0.rel_path))
                .then_with(|| a.0.abs_path.cmp(&b.0.abs_path))
        });
        scores.truncate(max_results);

        scores
            .into_iter()
            .map(|(doc, score)| (doc.rel_path.clone(), doc.abs_path.clone(), score))
            .collect()
    }
}