// manx_cli/rag/providers/hash.rs
use anyhow::Result;
use async_trait::async_trait;
use std::collections::hash_map::DefaultHasher;
use std::hash::{Hash, Hasher};

use super::{EmbeddingProvider as ProviderTrait, ProviderInfo};
use crate::rag::embeddings::preprocessing;
8
/// Embedding provider that derives vectors from token hashes rather than
/// from a learned model.
#[derive(Debug, Clone)]
pub struct HashProvider {
    /// Number of components in every embedding this provider produces.
    dimension: usize,
}

impl HashProvider {
    /// Creates a provider whose embeddings have `dimension` components.
    pub fn new(dimension: usize) -> Self {
        Self { dimension }
    }
}
19
20#[async_trait]
21impl ProviderTrait for HashProvider {
22 async fn embed_text(&self, text: &str) -> Result<Vec<f32>> {
23 if text.trim().is_empty() {
24 return Err(anyhow::anyhow!("Cannot embed empty text"));
25 }
26
27 let embedding = self.generate_hash_embedding(text);
28 Ok(embedding)
29 }
30
31 async fn get_dimension(&self) -> Result<usize> {
32 Ok(self.dimension)
33 }
34
35 async fn health_check(&self) -> Result<()> {
36 Ok(())
38 }
39
40 fn get_info(&self) -> ProviderInfo {
41 ProviderInfo {
42 name: "Hash-based Embeddings".to_string(),
43 provider_type: "hash".to_string(),
44 model_name: None,
45 description: "Fast hash-based embeddings for basic semantic similarity".to_string(),
46 max_input_length: Some(2048),
47 }
48 }
49}
50
51impl HashProvider {
52 fn generate_hash_embedding(&self, text: &str) -> Vec<f32> {
54 let cleaned_text = preprocessing::clean_text(text);
55 let words: Vec<&str> = cleaned_text.split_whitespace().collect();
56
57 let mut embedding = vec![0.0; self.dimension];
58
59 for word in &words {
61 let mut hasher = DefaultHasher::new();
62 word.to_lowercase().hash(&mut hasher);
63 let hash = hasher.finish();
64
65 for i in 0..self.dimension {
67 let feature_hash = (hash.wrapping_mul(i as u64 + 1)) as usize % self.dimension;
68 embedding[feature_hash] += 1.0;
69 }
70 }
71
72 for window in words.windows(2) {
74 let bigram = format!("{} {}", window[0], window[1]);
75 let mut hasher = DefaultHasher::new();
76 bigram.to_lowercase().hash(&mut hasher);
77 let hash = hasher.finish();
78
79 for i in 0..self.dimension {
80 let feature_hash = (hash.wrapping_mul(i as u64 + 1)) as usize % self.dimension;
81 embedding[feature_hash] += 0.5; }
83 }
84
85 let norm = embedding.iter().map(|x| x * x).sum::<f32>().sqrt();
87 if norm > 0.0 {
88 for val in &mut embedding {
89 *val /= norm;
90 }
91 }
92
93 embedding
94 }
95}