post_cortex_embeddings/embeddings/backends/static_hash.rs
1// Copyright (c) 2025 Julius ML
2//
3// Permission is hereby granted, free of charge, to any person obtaining a copy
4// of this software and associated documentation files (the "Software"), to deal
5// in the Software without restriction, including without limitation the rights
6// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7// copies of the Software, and to permit persons to whom the Software is
8// furnished to do so, subject to the following conditions:
9//
10// The above copyright notice and this permission notice shall be included in all
11// copies or substantial portions of the Software.
12
13//! Hash-based static embedding fallback. Used by \[`EmbeddingModelType::StaticSimilarityMRL`\].
14//!
15//! **Warning**: this is a synthetic embedding — every text is mapped to a
16//! deterministic, pseudo-random unit vector keyed by its hash. Semantically
17//! similar inputs receive completely unrelated vectors. Useful only for tests
18//! and for keeping the engine functional when BERT loading is unavailable.
19
20use anyhow::Result;
21use async_trait::async_trait;
22use std::collections::hash_map::DefaultHasher;
23use std::hash::{Hash, Hasher};
24use std::sync::Arc;
25
26use crate::embeddings::backend::EmbeddingBackend;
27use crate::embeddings::pool::MemoryPool;
28
29/// Fallback embedding backend — hashes the input bytes into a
30/// deterministic pseudo-embedding of the configured dimension.
31///
32/// Used when the BERT backend is unavailable (e.g. the `bert` feature
33/// is disabled or the HuggingFace Hub model cache cannot be reached).
34/// The output is **not** semantically meaningful — it just provides
35/// stable vectors for tests and offline development.
36pub struct StaticHashBackend {
37 dimension: usize,
38 memory_pool: Arc<MemoryPool>,
39}
40
41impl StaticHashBackend {
42 pub(in crate::embeddings) fn new(dimension: usize, memory_pool: Arc<MemoryPool>) -> Self {
43 Self {
44 dimension,
45 memory_pool,
46 }
47 }
48
49 fn text_hash(text: &str) -> u64 {
50 let mut hasher = DefaultHasher::new();
51 text.hash(&mut hasher);
52 hasher.finish()
53 }
54}
55
56#[async_trait]
57impl EmbeddingBackend for StaticHashBackend {
58 fn embedding_dimension(&self) -> usize {
59 self.dimension
60 }
61
62 fn is_bert_based(&self) -> bool {
63 false
64 }
65
66 async fn process_batch(&self, texts: Vec<String>) -> Result<Vec<Vec<f32>>> {
67 let mut results = Vec::with_capacity(texts.len());
68
69 for text in &texts {
70 let mut embedding = self.memory_pool.get_or_allocate();
71 let hash = Self::text_hash(text);
72 embedding.clear();
73 embedding.reserve(self.dimension);
74
75 for i in 0..self.dimension {
76 let value = ((hash.wrapping_add(i as u64)) as f32 / u64::MAX as f32) * 2.0 - 1.0;
77 embedding.push(value);
78 }
79
80 // Normalize to unit length
81 let norm = embedding.iter().map(|x| x * x).sum::<f32>().sqrt();
82 if norm > 0.0 {
83 for val in &mut embedding {
84 *val /= norm;
85 }
86 }
87
88 results.push(embedding);
89 }
90
91 Ok(results)
92 }
93}