manx_cli/rag/providers/
hash.rs1use anyhow::Result;
2use async_trait::async_trait;
3use std::collections::hash_map::DefaultHasher;
4use std::hash::{Hash, Hasher};
5
6use super::{EmbeddingProvider as ProviderTrait, ProviderInfo};
7use crate::rag::embeddings::preprocessing;
8
/// Embedding provider that builds vectors from token hashes instead of a
/// learned model.
///
/// Every embedding produced by this provider has exactly `dimension`
/// components.
#[derive(Debug, Clone)]
pub struct HashProvider {
    /// Length of every embedding vector this provider returns.
    dimension: usize,
}
13
14impl HashProvider {
15 pub fn new(dimension: usize) -> Self {
16 Self { dimension }
17 }
18}
19
20#[async_trait]
21impl ProviderTrait for HashProvider {
22 async fn embed_text(&self, text: &str) -> Result<Vec<f32>> {
23 if text.trim().is_empty() {
24 return Err(anyhow::anyhow!("Cannot embed empty text"));
25 }
26
27 let embedding = self.generate_hash_embedding(text);
28 Ok(embedding)
29 }
30
31 async fn get_dimension(&self) -> Result<usize> {
32 Ok(self.dimension)
33 }
34
35 async fn health_check(&self) -> Result<()> {
36 Ok(())
38 }
39
40 fn get_info(&self) -> ProviderInfo {
41 ProviderInfo {
42 name: "Hash-based Embeddings".to_string(),
43 provider_type: "hash".to_string(),
44 model_name: None,
45 description: "Fast hash-based embeddings for basic semantic similarity".to_string(),
46 max_input_length: Some(2048),
47 }
48 }
49
50 fn as_any(&self) -> &dyn std::any::Any {
51 self
52 }
53}
54
55impl HashProvider {
56 fn generate_hash_embedding(&self, text: &str) -> Vec<f32> {
58 let cleaned_text = preprocessing::clean_text(text);
59 let words: Vec<&str> = cleaned_text.split_whitespace().collect();
60
61 let mut embedding = vec![0.0; self.dimension];
62
63 for word in &words {
65 let mut hasher = DefaultHasher::new();
66 word.to_lowercase().hash(&mut hasher);
67 let hash = hasher.finish();
68
69 for i in 0..self.dimension {
71 let feature_hash = (hash.wrapping_mul(i as u64 + 1)) as usize % self.dimension;
72 embedding[feature_hash] += 1.0;
73 }
74 }
75
76 for window in words.windows(2) {
78 let bigram = format!("{} {}", window[0], window[1]);
79 let mut hasher = DefaultHasher::new();
80 bigram.to_lowercase().hash(&mut hasher);
81 let hash = hasher.finish();
82
83 for i in 0..self.dimension {
84 let feature_hash = (hash.wrapping_mul(i as u64 + 1)) as usize % self.dimension;
85 embedding[feature_hash] += 0.5; }
87 }
88
89 let norm = embedding.iter().map(|x| x * x).sum::<f32>().sqrt();
91 if norm > 0.0 {
92 for val in &mut embedding {
93 *val /= norm;
94 }
95 }
96
97 embedding
98 }
99}