Skip to main content

ski/embed/
bow.rs

1//! Deterministic hashed bag-of-words embedder.
2//!
3//! Not semantic (no synonyms) — surface-token overlap only. It exists so the
4//! whole pipeline builds, runs, and tests with zero network/model, and as a
5//! fallback on machines without the fastembed feature. The hybrid keyword boost
6//! in the ranker compensates for its lack of semantics on exact terms.
7
8use crate::embed::{EmbedKind, Embedder};
9use crate::text::{fnv1a_32, tokenize};
10
/// Bag-of-words embedder that hashes tokens into a fixed-width count vector.
///
/// The only state is the output dimensionality, so values are cheap to clone.
/// `Debug` is derived so the embedder can appear in logs and assertion
/// failure messages.
#[derive(Debug, Clone)]
pub struct BowEmbedder {
    /// Length of every vector this embedder produces.
    dim: usize,
}
14
15impl BowEmbedder {
16    pub fn new() -> Self {
17        Self { dim: 256 }
18    }
19
20    fn one(&self, text: &str) -> Vec<f32> {
21        let mut v = vec![0f32; self.dim];
22        for tok in tokenize(text) {
23            let idx = (fnv1a_32(&tok) as usize) % self.dim;
24            v[idx] += 1.0;
25        }
26        let norm: f32 = v.iter().map(|x| x * x).sum::<f32>().sqrt();
27        if norm > 0.0 {
28            for x in &mut v {
29                *x /= norm;
30            }
31        }
32        v
33    }
34}
35
36impl Default for BowEmbedder {
37    fn default() -> Self {
38        Self::new()
39    }
40}
41
42impl Embedder for BowEmbedder {
43    fn id(&self) -> String {
44        format!("bow-{}-v1", self.dim)
45    }
46
47    fn embed(&self, texts: &[String], _kind: EmbedKind) -> anyhow::Result<Vec<Vec<f32>>> {
48        Ok(texts.iter().map(|t| self.one(t)).collect())
49    }
50}
51
#[cfg(test)]
mod tests {
    use super::*;

    /// Embeds a single text and returns its (only) vector.
    fn embed_one(embedder: &BowEmbedder, text: &str, kind: EmbedKind) -> Vec<f32> {
        embedder
            .embed(&[text.to_owned()], kind)
            .unwrap()
            .swap_remove(0)
    }

    /// Plain dot product of two vectors.
    fn dot(a: &[f32], b: &[f32]) -> f32 {
        a.iter().zip(b).map(|(x, y)| x * y).sum::<f32>()
    }

    #[test]
    fn deterministic_and_normalized() {
        let embedder = BowEmbedder::new();
        let as_query = embed_one(&embedder, "commit this diff", EmbedKind::Query);
        let as_document = embed_one(&embedder, "commit this diff", EmbedKind::Document);

        // Same text must embed identically regardless of the requested kind.
        assert_eq!(as_query, as_document);

        // Output must be unit length (within float tolerance).
        let norm = dot(&as_query, &as_query).sqrt();
        assert!((norm - 1.0).abs() < 1e-5);
    }

    #[test]
    fn overlap_scores_higher_than_disjoint() {
        let embedder = BowEmbedder::new();
        let query = embed_one(&embedder, "python project setup", EmbedKind::Query);
        let near = embed_one(&embedder, "set up a python project", EmbedKind::Document);
        let far = embed_one(&embedder, "lemonade server gpu", EmbedKind::Document);

        // A document sharing tokens with the query must outscore one that shares none.
        let near_score = dot(&query, &near);
        let far_score = dot(&query, &far);
        assert!(near_score > far_score);
    }
}