Skip to main content

post_cortex_embeddings/embeddings/backends/
static_hash.rs

1// Copyright (c) 2025 Julius ML
2//
3// Permission is hereby granted, free of charge, to any person obtaining a copy
4// of this software and associated documentation files (the "Software"), to deal
5// in the Software without restriction, including without limitation the rights
6// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7// copies of the Software, and to permit persons to whom the Software is
8// furnished to do so, subject to the following conditions:
9//
10// The above copyright notice and this permission notice shall be included in all
11// copies or substantial portions of the Software.
12
//! Hash-based static embedding fallback. Used by [`EmbeddingModelType::StaticSimilarityMRL`].
14//!
15//! **Warning**: this is a synthetic embedding — every text is mapped to a
16//! deterministic, pseudo-random unit vector keyed by its hash. Semantically
17//! similar inputs receive completely unrelated vectors. Useful only for tests
18//! and for keeping the engine functional when BERT loading is unavailable.
19
20use anyhow::Result;
21use async_trait::async_trait;
22use std::collections::hash_map::DefaultHasher;
23use std::hash::{Hash, Hasher};
24use std::sync::Arc;
25
26use crate::embeddings::backend::EmbeddingBackend;
27use crate::embeddings::pool::MemoryPool;
28
/// Fallback embedding backend — hashes the input bytes into a
/// deterministic pseudo-embedding of the configured dimension.
///
/// Used when the BERT backend is unavailable (e.g. the `bert` feature
/// is disabled or the HuggingFace Hub model cache cannot be reached).
/// The output is **not** semantically meaningful — it just provides
/// stable vectors for tests and offline development.
pub struct StaticHashBackend {
    /// Number of components in every vector this backend produces; also the
    /// value reported by `embedding_dimension`.
    dimension: usize,
    /// Shared pool that supplies the output buffer for each embedded text
    /// (via `get_or_allocate`).
    memory_pool: Arc<MemoryPool>,
}
40
41impl StaticHashBackend {
42    pub(in crate::embeddings) fn new(dimension: usize, memory_pool: Arc<MemoryPool>) -> Self {
43        Self {
44            dimension,
45            memory_pool,
46        }
47    }
48
49    fn text_hash(text: &str) -> u64 {
50        let mut hasher = DefaultHasher::new();
51        text.hash(&mut hasher);
52        hasher.finish()
53    }
54}
55
56#[async_trait]
57impl EmbeddingBackend for StaticHashBackend {
58    fn embedding_dimension(&self) -> usize {
59        self.dimension
60    }
61
62    fn is_bert_based(&self) -> bool {
63        false
64    }
65
66    async fn process_batch(&self, texts: Vec<String>) -> Result<Vec<Vec<f32>>> {
67        let mut results = Vec::with_capacity(texts.len());
68
69        for text in &texts {
70            let mut embedding = self.memory_pool.get_or_allocate();
71            let hash = Self::text_hash(text);
72            embedding.clear();
73            embedding.reserve(self.dimension);
74
75            for i in 0..self.dimension {
76                let value = ((hash.wrapping_add(i as u64)) as f32 / u64::MAX as f32) * 2.0 - 1.0;
77                embedding.push(value);
78            }
79
80            // Normalize to unit length
81            let norm = embedding.iter().map(|x| x * x).sum::<f32>().sqrt();
82            if norm > 0.0 {
83                for val in &mut embedding {
84                    *val /= norm;
85                }
86            }
87
88            results.push(embedding);
89        }
90
91        Ok(results)
92    }
93}