manx_cli/rag/providers/
hash.rs

1use anyhow::Result;
2use async_trait::async_trait;
3use std::collections::hash_map::DefaultHasher;
4use std::hash::{Hash, Hasher};
5
6use super::{EmbeddingProvider as ProviderTrait, ProviderInfo};
7use crate::rag::embeddings::preprocessing;
8
9/// Hash-based embedding provider (fast, lightweight, no dependencies)
/// Hash-based embedding provider (fast, lightweight, no dependencies)
pub struct HashProvider {
    // Number of components in every embedding vector this provider emits.
    dimension: usize,
}

impl HashProvider {
    /// Build a provider whose vectors have exactly `dimension` components.
    pub fn new(dimension: usize) -> Self {
        HashProvider { dimension }
    }
}
19
20#[async_trait]
21impl ProviderTrait for HashProvider {
22    async fn embed_text(&self, text: &str) -> Result<Vec<f32>> {
23        if text.trim().is_empty() {
24            return Err(anyhow::anyhow!("Cannot embed empty text"));
25        }
26
27        let embedding = self.generate_hash_embedding(text);
28        Ok(embedding)
29    }
30
31    async fn get_dimension(&self) -> Result<usize> {
32        Ok(self.dimension)
33    }
34
35    async fn health_check(&self) -> Result<()> {
36        // Hash provider is always available
37        Ok(())
38    }
39
40    fn get_info(&self) -> ProviderInfo {
41        ProviderInfo {
42            name: "Hash-based Embeddings".to_string(),
43            provider_type: "hash".to_string(),
44            model_name: None,
45            description: "Fast hash-based embeddings for basic semantic similarity".to_string(),
46            max_input_length: Some(2048),
47        }
48    }
49}
50
51impl HashProvider {
52    /// Generate hash-based embedding vector
53    fn generate_hash_embedding(&self, text: &str) -> Vec<f32> {
54        let cleaned_text = preprocessing::clean_text(text);
55        let words: Vec<&str> = cleaned_text.split_whitespace().collect();
56
57        let mut embedding = vec![0.0; self.dimension];
58
59        // Generate hash-based features for each word
60        for word in &words {
61            let mut hasher = DefaultHasher::new();
62            word.to_lowercase().hash(&mut hasher);
63            let hash = hasher.finish();
64
65            // Distribute hash across embedding dimensions
66            for i in 0..self.dimension {
67                let feature_hash = (hash.wrapping_mul(i as u64 + 1)) as usize % self.dimension;
68                embedding[feature_hash] += 1.0;
69            }
70        }
71
72        // Add n-gram features for better similarity
73        for window in words.windows(2) {
74            let bigram = format!("{} {}", window[0], window[1]);
75            let mut hasher = DefaultHasher::new();
76            bigram.to_lowercase().hash(&mut hasher);
77            let hash = hasher.finish();
78
79            for i in 0..self.dimension {
80                let feature_hash = (hash.wrapping_mul(i as u64 + 1)) as usize % self.dimension;
81                embedding[feature_hash] += 0.5; // Lower weight for n-grams
82            }
83        }
84
85        // Normalize the embedding vector
86        let norm = embedding.iter().map(|x| x * x).sum::<f32>().sqrt();
87        if norm > 0.0 {
88            for val in &mut embedding {
89                *val /= norm;
90            }
91        }
92
93        embedding
94    }
95}