/// Scores `content` against `query_terms`, matching case-insensitively.
///
/// The content is lowercased and split on whitespace; keywords and query terms
/// are lowercased as well, so that a query such as `"Rust"` matches content
/// containing `"rust"`. The normalized inputs are then handed to
/// [`bm25_score_pre`], which performs the actual scoring.
///
/// # Parameters
/// - `content`: raw document text.
/// - `keywords`: keywords attached to the document; a query term found in any
///   of them receives a bonus in [`bm25_score_pre`].
/// - `query_terms`: the terms to look for (any letter case).
/// - `avgdl`: presumably the corpus-average document length in
///   whitespace-separated words — confirm against callers.
/// - `k1`, `b`: tuning parameters forwarded unchanged.
///
/// # Returns
/// `0.0` when `query_terms` is empty or `avgdl` is not positive; otherwise the
/// score computed by [`bm25_score_pre`].
pub fn bm25_score(
    content: &str,
    keywords: &[String],
    query_terms: &[String],
    avgdl: f64,
    k1: f64,
    b: f64,
) -> f64 {
    // Bail out before doing any allocation when the result is known to be zero.
    if query_terms.is_empty() || avgdl <= 0.0 {
        return 0.0;
    }

    let lower_content = content.to_lowercase();
    let content_words: Vec<String> = lower_content
        .split_whitespace()
        .map(String::from)
        .collect();
    let lower_keywords: Vec<String> = keywords.iter().map(|k| k.to_lowercase()).collect();
    // Content and keywords are lowercased above; the query terms must be
    // normalized the same way or mixed-case terms would never match.
    let lower_terms: Vec<String> = query_terms.iter().map(|t| t.to_lowercase()).collect();

    bm25_score_pre(&content_words, &lower_keywords, &lower_terms, avgdl, k1, b)
}
/// Scores a pre-tokenized document against `query_terms` using a BM25-style
/// term-frequency saturation formula.
///
/// For every query term the term frequency is the number of content words that
/// contain the term as a substring, plus a flat bonus of `2.0` when any keyword
/// contains the term. Each matching term contributes
/// `tf * (k1 + 1) / (tf + k1 * (1 - b + b * dl / avgdl))`, where `dl` is the
/// number of content words. No inverse-document-frequency weighting is applied.
///
/// Matching is a plain substring test, so callers are expected to pass
/// `content_words`, `lower_keywords` and `query_terms` in the same letter case.
///
/// Empty query terms are ignored: the empty string is a substring of every
/// word, so scoring it would reward every document regardless of its content.
///
/// # Returns
/// `0.0` when `query_terms` is empty, `avgdl` is not positive, or no term
/// matches; otherwise the sum of the per-term contributions.
pub fn bm25_score_pre(
    content_words: &[String],
    lower_keywords: &[String],
    query_terms: &[String],
    avgdl: f64,
    k1: f64,
    b: f64,
) -> f64 {
    if query_terms.is_empty() || avgdl <= 0.0 {
        return 0.0;
    }

    // Length normalization is identical for every term, so compute it once.
    let dl = content_words.len() as f64;
    let norm = 1.0 - b + b * (dl / avgdl);
    let denom_factor = k1 * norm;

    let mut score = 0.0;
    for term in query_terms {
        let term_str = term.as_str();
        // `"word".contains("")` is always true; an empty term carries no
        // information and must not inflate the score.
        if term_str.is_empty() {
            continue;
        }

        let tf_content = content_words
            .iter()
            .filter(|w| w.contains(term_str))
            .count() as f64;
        let keyword_bonus = if lower_keywords.iter().any(|k| k.contains(term_str)) {
            2.0
        } else {
            0.0
        };

        let tf = tf_content + keyword_bonus;
        if tf <= 0.0 {
            continue;
        }
        score += (tf * (k1 + 1.0)) / (tf + denom_factor);
    }
    score
}
/// Default value for the `k1` argument of `bm25_score` / `bm25_score_pre`.
///
/// `k1` scales the length-normalized term in the denominator of the per-term
/// score and appears as `k1 + 1` in the numerator.
pub const DEFAULT_K1: f64 = 1.2;
/// Default value for the `b` argument of `bm25_score` / `bm25_score_pre`.
///
/// `b` weights the document-length ratio `dl / avgdl` in the normalization
/// factor `1 - b + b * (dl / avgdl)`.
pub const DEFAULT_B: f64 = 0.75;
#[cfg(test)]
mod tests {
    use super::*;

    /// Scores `content` with the default `k1`/`b` and an `avgdl` of 10.
    fn score(content: &str, keywords: &[String], terms: &[String]) -> f64 {
        bm25_score(content, keywords, terms, 10.0, DEFAULT_K1, DEFAULT_B)
    }

    /// Builds an owned term list from string literals.
    fn terms(words: &[&str]) -> Vec<String> {
        words.iter().copied().map(String::from).collect()
    }

    #[test]
    fn bm25_empty_query_returns_zero() {
        let score = score("some content", &[], &[]);
        assert!((score - 0.0).abs() < f64::EPSILON);
    }

    #[test]
    fn bm25_no_match_returns_zero() {
        let score = score("rust is fast", &[], &terms(&["python"]));
        assert!((score - 0.0).abs() < f64::EPSILON);
    }

    #[test]
    fn bm25_single_term_match() {
        let score = score("rust is fast", &[], &terms(&["rust"]));
        assert!(score > 0.0);
    }

    #[test]
    fn bm25_multiple_term_matches_score_higher() {
        let score_both = score("rust is fast", &[], &terms(&["rust", "fast"]));
        let score_one = score("rust is fast", &[], &terms(&["rust"]));
        assert!(
            score_both > score_one,
            "matching more terms should score higher"
        );
    }

    #[test]
    fn bm25_keyword_field_boosts_score() {
        let query = terms(&["performance"]);
        let keywords = terms(&["performance"]);

        let score_content = score("rust has good performance", &[], &query);
        let score_keywords = score("rust is great", &keywords, &query);
        assert!(score_content > 0.0);
        assert!(score_keywords > 0.0);

        // Same content, with and without the keyword: the keyword hit must
        // actually raise the score, not merely leave it positive.
        let score_boosted = score("rust has good performance", &keywords, &query);
        assert!(
            score_boosted > score_content,
            "a keyword match should boost the score of otherwise identical content"
        );
    }

    #[test]
    fn bm25_shorter_docs_score_higher() {
        let query = terms(&["rust"]);
        let score_short = score("rust is fast", &[], &query);
        let score_long = score(
            "rust is a programming language that is very very fast",
            &[],
            &query,
        );
        assert!(
            score_short > score_long,
            "shorter doc with same matches should score higher (length normalization)"
        );
    }

    #[test]
    fn bm25_zero_avgdl_returns_zero() {
        let query = terms(&["rust"]);
        let score = bm25_score("rust is fast", &[], &query, 0.0, DEFAULT_K1, DEFAULT_B);
        assert!((score - 0.0).abs() < f64::EPSILON);
    }

    #[test]
    fn bm25_repeated_terms_in_content_boost() {
        let query = terms(&["rust"]);
        let score_once = score("rust is great", &[], &query);
        let score_twice = score("rust rust is great", &[], &query);
        assert!(
            score_twice > score_once,
            "higher TF should yield higher score"
        );
    }
}