use crate::posting::Bm25Params;
/// Scores one term/document pair with the BM25 formula.
///
/// The result is `idf * tf_norm`, where
///
/// * `idf = ln(1 + (N - df + 0.5) / (df + 0.5))` with `N = total_docs`, and
/// * `tf_norm = tf * (k1 + 1) / (tf + k1 * (1 - b + b * doc_len / avg_doc_len))`.
///
/// # Parameters
///
/// * `tf` - occurrences of the term in the document.
/// * `df` - number of documents containing the term.
/// * `doc_len` - length of the scored document.
/// * `total_docs` - number of documents in the collection.
/// * `avg_doc_len` - average document length across the collection.
/// * `params` - supplies the `k1` and `b` constants.
///
/// All integer inputs are converted to `f32` before any arithmetic.
///
/// # Degenerate inputs
///
/// * `tf == 0` returns `0.0` directly. Without this, `k1 == 0` would evaluate `0 / 0`.
/// * When `avg_doc_len` is not strictly positive (zero, negative or NaN) the length
///   normalisation factor falls back to the neutral value `1.0`, so the score stays finite
///   instead of becoming NaN.
pub fn bm25_score(
    tf: u32,
    df: u32,
    doc_len: u32,
    total_docs: u32,
    avg_doc_len: f32,
    params: &Bm25Params,
) -> f32 {
    // A term that does not occur contributes nothing; returning early also avoids the
    // `0 / 0` that the saturation formula produces when `k1` is zero.
    if tf == 0 {
        return 0.0;
    }

    let tf_f = tf as f32;
    let df_f = df as f32;
    let n = total_docs as f32;
    let dl = doc_len as f32;

    let idf = ((n - df_f + 0.5) / (df_f + 0.5) + 1.0).ln();

    // `avg_doc_len > 0.0` is false for zero, negative values and NaN alike. Dividing by such
    // a value would yield inf/NaN, so length normalisation is skipped in that case.
    let length_norm = if avg_doc_len > 0.0 {
        1.0 - params.b + params.b * dl / avg_doc_len
    } else {
        1.0
    };

    let tf_norm = (tf_f * (params.k1 + 1.0)) / (tf_f + params.k1 * length_norm);
    idf * tf_norm
}
/// Computes the score used as an upper bound for a block of postings.
///
/// `block_min_fieldnorm` is decoded with `crate::codec::smallfloat::decode`, raised to at
/// least `1`, and used as the document length in a [`bm25_score`] call together with
/// `block_max_tf`. The remaining arguments are forwarded unchanged.
///
/// NOTE(review): this is only a true upper bound while the score grows with `tf` and shrinks
/// with document length, which presumably requires `k1 >= 0`, `0 <= b <= 1` and a
/// non-negative IDF — confirm against the callers.
pub fn bm25_block_upper_bound(
    block_max_tf: u32,
    block_min_fieldnorm: u8,
    df: u32,
    total_docs: u32,
    avg_doc_len: f32,
    params: &Bm25Params,
) -> f32 {
    let decoded_len = crate::codec::smallfloat::decode(block_min_fieldnorm);
    // Never score against a zero-length document.
    let shortest_len = decoded_len.max(1);

    bm25_score(
        block_max_tf,
        df,
        shortest_len,
        total_docs,
        avg_doc_len,
        params,
    )
}
/// Computes the BM25 inverse document frequency of a term.
///
/// Evaluates `ln(1 + (N - df + 0.5) / (df + 0.5))`, where `N` is `total_docs` and `df` is the
/// number of documents containing the term. Both counts are converted to `f32` first.
///
/// The logarithm's argument is algebraically `(N + 1) / (df + 0.5)`, so the result is
/// non-negative whenever `df <= total_docs` and can turn negative once `df` exceeds
/// `total_docs`.
pub fn idf(df: u32, total_docs: u32) -> f32 {
    let doc_freq = df as f32;
    let corpus_size = total_docs as f32;

    let rarity = (corpus_size - doc_freq + 0.5) / (doc_freq + 0.5);
    (rarity + 1.0).ln()
}
/// Computes the maximum-score estimate for a term from its term-wide statistics.
///
/// This is a direct delegation to [`bm25_block_upper_bound`]: `global_max_tf` and
/// `global_min_fieldnorm` are passed where that function takes its per-block maximum term
/// frequency and minimum field norm. All other arguments are forwarded as-is.
pub fn term_max_score(
    global_max_tf: u32,
    global_min_fieldnorm: u8,
    df: u32,
    total_docs: u32,
    avg_doc_len: f32,
    params: &Bm25Params,
) -> f32 {
    let (max_tf, min_fieldnorm) = (global_max_tf, global_min_fieldnorm);

    bm25_block_upper_bound(
        max_tf,
        min_fieldnorm,
        df,
        total_docs,
        avg_doc_len,
        params,
    )
}
#[cfg(test)]
mod tests {
    use super::*;

    /// A matching term in an ordinary document yields a strictly positive score.
    #[test]
    fn bm25_basic() {
        let params = Bm25Params::default();
        let score = bm25_score(2, 5, 100, 1000, 120.0, &params);
        assert!(score > 0.0, "BM25 score should be positive");
    }

    /// With everything else equal, a lower document frequency gives a higher score.
    #[test]
    fn bm25_rare_term_scores_higher() {
        let params = Bm25Params::default();
        let common = bm25_score(1, 500, 100, 1000, 100.0, &params);
        let rare = bm25_score(1, 5, 100, 1000, 100.0, &params);
        assert!(
            rare > common,
            "rare term should score higher than common term"
        );
    }

    /// With everything else equal, a higher term frequency gives a higher score.
    #[test]
    fn bm25_higher_tf_scores_higher() {
        let params = Bm25Params::default();
        let low_tf = bm25_score(1, 10, 100, 1000, 100.0, &params);
        let high_tf = bm25_score(5, 10, 100, 1000, 100.0, &params);
        assert!(high_tf > low_tf, "higher TF should score higher");
    }

    /// With everything else equal, a shorter document gives a higher score.
    #[test]
    fn bm25_shorter_doc_scores_higher() {
        let params = Bm25Params::default();
        let short = bm25_score(1, 10, 50, 1000, 100.0, &params);
        let long = bm25_score(1, 10, 200, 1000, 100.0, &params);
        assert!(short > long, "shorter doc should score higher for same TF");
    }

    /// The block bound (max tf 5, min length encoded from 50) must not be lower than the
    /// score of a document inside those limits (tf 3, length 100).
    #[test]
    fn block_upper_bound_is_upper() {
        let params = Bm25Params::default();
        let upper = bm25_block_upper_bound(
            5,
            crate::codec::smallfloat::encode(50),
            10,
            1000,
            100.0,
            &params,
        );
        let actual = bm25_score(3, 10, 100, 1000, 100.0, &params);
        assert!(upper >= actual, "upper bound {upper} < actual {actual}");
    }

    /// Raising tf from 10 to 100 still increases the score, but by less than a factor of two.
    #[test]
    fn bm25_tf_saturation() {
        let params = Bm25Params::default();
        let tf10 = bm25_score(10, 10, 100, 1000, 100.0, &params);
        let tf100 = bm25_score(100, 10, 100, 1000, 100.0, &params);
        assert!(tf100 > tf10);
        assert!(
            tf100 / tf10 < 2.0,
            "TF saturation should limit score growth"
        );
    }
}