//! Hash-based embedding provider (`manx_cli/rag/providers/hash.rs`).
use std::collections::hash_map::DefaultHasher;
use std::hash::{Hash, Hasher};

use anyhow::Result;
use async_trait::async_trait;

use super::{EmbeddingProvider as ProviderTrait, ProviderInfo};
use crate::rag::embeddings::preprocessing;
8
/// Hash-based embedding provider (fast, lightweight, no dependencies)
///
/// Produces deterministic embeddings by hashing words and word bigrams into a
/// fixed-size vector; no model download or network access is required.
pub struct HashProvider {
    // Length of every embedding vector this provider produces.
    dimension: usize,
}
13
14impl HashProvider {
15    pub fn new(dimension: usize) -> Self {
16        Self { dimension }
17    }
18}
19
20#[async_trait]
21impl ProviderTrait for HashProvider {
22    async fn embed_text(&self, text: &str) -> Result<Vec<f32>> {
23        if text.trim().is_empty() {
24            return Err(anyhow::anyhow!("Cannot embed empty text"));
25        }
26
27        let embedding = self.generate_hash_embedding(text);
28        Ok(embedding)
29    }
30
31    async fn get_dimension(&self) -> Result<usize> {
32        Ok(self.dimension)
33    }
34
35    async fn health_check(&self) -> Result<()> {
36        // Hash provider is always available
37        Ok(())
38    }
39
40    fn get_info(&self) -> ProviderInfo {
41        ProviderInfo {
42            name: "Hash-based Embeddings".to_string(),
43            provider_type: "hash".to_string(),
44            model_name: None,
45            description: "Fast hash-based embeddings for basic semantic similarity".to_string(),
46            max_input_length: Some(2048),
47        }
48    }
49
50    fn as_any(&self) -> &dyn std::any::Any {
51        self
52    }
53}
54
55impl HashProvider {
56    /// Generate hash-based embedding vector
57    fn generate_hash_embedding(&self, text: &str) -> Vec<f32> {
58        let cleaned_text = preprocessing::clean_text(text);
59        let words: Vec<&str> = cleaned_text.split_whitespace().collect();
60
61        let mut embedding = vec![0.0; self.dimension];
62
63        // Generate hash-based features for each word
64        for word in &words {
65            let mut hasher = DefaultHasher::new();
66            word.to_lowercase().hash(&mut hasher);
67            let hash = hasher.finish();
68
69            // Distribute hash across embedding dimensions
70            for i in 0..self.dimension {
71                let feature_hash = (hash.wrapping_mul(i as u64 + 1)) as usize % self.dimension;
72                embedding[feature_hash] += 1.0;
73            }
74        }
75
76        // Add n-gram features for better similarity
77        for window in words.windows(2) {
78            let bigram = format!("{} {}", window[0], window[1]);
79            let mut hasher = DefaultHasher::new();
80            bigram.to_lowercase().hash(&mut hasher);
81            let hash = hasher.finish();
82
83            for i in 0..self.dimension {
84                let feature_hash = (hash.wrapping_mul(i as u64 + 1)) as usize % self.dimension;
85                embedding[feature_hash] += 0.5; // Lower weight for n-grams
86            }
87        }
88
89        // Normalize the embedding vector
90        let norm = embedding.iter().map(|x| x * x).sum::<f32>().sqrt();
91        if norm > 0.0 {
92            for val in &mut embedding {
93                *val /= norm;
94            }
95        }
96
97        embedding
98    }
99}