1use crate::embed::{EmbedKind, Embedder};
9use crate::text::{fnv1a_32, tokenize};
10
/// Token-count embedder with a fixed output dimension.
///
/// Each token of the input text is hashed into one of `dim` buckets and the
/// resulting count vector is scaled to unit length (see the `one` method).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct BowEmbedder {
    /// Number of buckets, i.e. the length of every vector this embedder emits.
    dim: usize,
}
14
15impl BowEmbedder {
16 pub fn new() -> Self {
17 Self { dim: 256 }
18 }
19
20 fn one(&self, text: &str) -> Vec<f32> {
21 let mut v = vec![0f32; self.dim];
22 for tok in tokenize(text) {
23 let idx = (fnv1a_32(&tok) as usize) % self.dim;
24 v[idx] += 1.0;
25 }
26 let norm: f32 = v.iter().map(|x| x * x).sum::<f32>().sqrt();
27 if norm > 0.0 {
28 for x in &mut v {
29 *x /= norm;
30 }
31 }
32 v
33 }
34}
35
36impl Default for BowEmbedder {
37 fn default() -> Self {
38 Self::new()
39 }
40}
41
42impl Embedder for BowEmbedder {
43 fn id(&self) -> String {
44 format!("bow-{}-v1", self.dim)
45 }
46
47 fn embed(&self, texts: &[String], _kind: EmbedKind) -> anyhow::Result<Vec<Vec<f32>>> {
48 Ok(texts.iter().map(|t| self.one(t)).collect())
49 }
50}
51
#[cfg(test)]
mod tests {
    use super::*;

    /// Embeds one string through the `Embedder` API and returns its vector.
    fn embed_one(embedder: &BowEmbedder, text: &str, kind: EmbedKind) -> Vec<f32> {
        embedder
            .embed(&[text.to_string()], kind)
            .unwrap()
            .into_iter()
            .next()
            .unwrap()
    }

    /// Dot product of two vectors; equals cosine similarity for unit vectors.
    fn dot(a: &[f32], b: &[f32]) -> f32 {
        a.iter().zip(b).map(|(x, y)| x * y).sum()
    }

    #[test]
    fn deterministic_and_normalized() {
        let embedder = BowEmbedder::new();
        let as_query = embed_one(&embedder, "commit this diff", EmbedKind::Query);
        let as_document = embed_one(&embedder, "commit this diff", EmbedKind::Document);

        // The embedding must not depend on the requested kind.
        assert_eq!(as_query, as_document);

        let length = as_query.iter().map(|x| x * x).sum::<f32>().sqrt();
        assert!((length - 1.0).abs() < 1e-5);
    }

    #[test]
    fn overlap_scores_higher_than_disjoint() {
        let embedder = BowEmbedder::new();
        let query = embed_one(&embedder, "python project setup", EmbedKind::Query);
        let near = embed_one(&embedder, "set up a python project", EmbedKind::Document);
        let far = embed_one(&embedder, "lemonade server gpu", EmbedKind::Document);

        assert!(dot(&query, &near) > dot(&query, &far));
    }
}