use anyhow::Result;
use leann_core::embedding::EmbeddingProvider;
use ndarray::Array2;
use std::collections::HashMap;
use std::path::{Path, PathBuf};
/// Deterministic embedding provider for tests.
///
/// Vectors are derived purely from the UTF-8 bytes of the input text, so the
/// same text always produces the same embedding and no model is required.
pub struct FakeEmbeddingProvider {
    /// Number of components in every vector this provider produces.
    pub dims: usize,
}

impl FakeEmbeddingProvider {
    /// Creates a provider that emits `dims`-dimensional vectors.
    pub fn new(dims: usize) -> Self {
        Self { dims }
    }

    /// Maps `text` to a vector of length `self.dims`.
    ///
    /// The byte at index `i` adds `byte / 255` to component `i % dims`. The
    /// accumulated vector is then scaled to unit L2 norm, unless it is all
    /// zeros (for example, empty text), in which case it is returned as is.
    ///
    /// When `self.dims` is zero an empty vector is returned rather than
    /// panicking on a remainder-by-zero.
    fn text_to_vector(&self, text: &str) -> Vec<f32> {
        let mut vec = vec![0.0f32; self.dims];
        if self.dims == 0 {
            // There are no components to fold bytes into; `i % 0` would panic.
            return vec;
        }

        for (i, b) in text.bytes().enumerate() {
            vec[i % self.dims] += f32::from(b) / 255.0;
        }

        let norm: f32 = vec.iter().map(|x| x * x).sum::<f32>().sqrt();
        if norm > 0.0 {
            for v in &mut vec {
                *v /= norm;
            }
        }
        vec
    }
}
impl EmbeddingProvider for FakeEmbeddingProvider {
    /// Embeds every chunk with `text_to_vector` and packs the results into a
    /// `(chunks.len(), self.dims)` matrix, in input order.
    ///
    /// The progress callback is accepted for trait compatibility but unused.
    ///
    /// # Errors
    ///
    /// Propagates the shape error from `Array2::from_shape_vec` if the
    /// flattened data does not match the requested shape.
    fn compute_embeddings(
        &self,
        chunks: &[String],
        _progress: Option<&dyn leann_core::hnsw::IndexProgress>,
    ) -> Result<Array2<f32>> {
        let rows = chunks.len();
        let cols = self.dims;

        // Concatenate the per-chunk vectors into one flat buffer.
        let mut flat = Vec::with_capacity(rows * cols);
        flat.extend(chunks.iter().flat_map(|chunk| self.text_to_vector(chunk)));

        let matrix = Array2::from_shape_vec((rows, cols), flat)?;
        Ok(matrix)
    }

    /// Returns the length of each embedding vector.
    fn dimensions(&self) -> usize {
        self.dims
    }

    /// Returns the fixed identifier of this provider.
    fn name(&self) -> &str {
        "fake-test-provider"
    }
}
/// Generates `n` synthetic documents, each paired with its metadata map.
///
/// Document `i` has the text `"This is document {i} about topic_{i % 5}"` and
/// metadata keys `"id"` (the index as a string), `"doc_num"` (the index as a
/// number) and `"topic"` (the topic label).
pub fn sample_documents(n: usize) -> Vec<(String, HashMap<String, serde_json::Value>)> {
    let mut docs = Vec::with_capacity(n);
    for doc_num in 0..n {
        let topic = format!("topic_{}", doc_num % 5);
        let body = format!("This is document {} about {}", doc_num, topic);
        let metadata = HashMap::from([
            ("id".to_string(), serde_json::json!(doc_num.to_string())),
            ("doc_num".to_string(), serde_json::json!(doc_num)),
            ("topic".to_string(), serde_json::json!(topic)),
        ]);
        docs.push((body, metadata));
    }
    docs
}
/// Returns a fixed corpus of ten sentences on unrelated subjects.
///
/// Each document carries metadata with `"id"` (its position as a string) and
/// `"doc_num"` (its position as a number).
#[allow(dead_code)]
pub fn diverse_documents() -> Vec<(String, HashMap<String, serde_json::Value>)> {
    const TEXTS: [&str; 10] = [
        "The quick brown fox jumps over the lazy dog in the sunny meadow",
        "Python programming language is widely used for machine learning and data science",
        "The weather forecast predicts heavy rainfall and thunderstorms tomorrow",
        "Database indexing improves query performance significantly in large datasets",
        "Cooking Italian pasta requires fresh ingredients and proper timing",
        "The stock market experienced a significant downturn last quarter",
        "Neural networks and deep learning have revolutionized artificial intelligence",
        "The ancient Egyptian pyramids were built over four thousand years ago",
        "Cloud computing services offer scalable infrastructure for modern applications",
        "Marine biology studies the diverse ecosystems found in the world oceans",
    ];

    let mut docs = Vec::with_capacity(TEXTS.len());
    for (doc_num, &sentence) in TEXTS.iter().enumerate() {
        let metadata = HashMap::from([
            ("id".to_string(), serde_json::json!(doc_num.to_string())),
            ("doc_num".to_string(), serde_json::json!(doc_num)),
        ]);
        docs.push((String::from(sentence), metadata));
    }
    docs
}
/// Builds an index over `n_docs` sample documents and returns its path.
///
/// Documents come from `sample_documents` and are embedded with a
/// 64-dimensional `FakeEmbeddingProvider`. The builder is configured with
/// `m = 16`, `ef_construction = 40`, the L2 distance metric, and the given
/// `compact` / `recompute` flags. The index is written to `dir/test_index`.
///
/// # Errors
///
/// Propagates any error returned by `build_index`.
pub fn build_test_index(
    n_docs: usize,
    dir: &Path,
    compact: bool,
    recompute: bool,
) -> Result<PathBuf> {
    let index_path = dir.join("test_index");
    let provider = FakeEmbeddingProvider::new(64);

    let mut builder = leann_core::LeannBuilder::new("fake-test-model", Some(64), "test")
        .with_m(16)
        .with_ef_construction(40)
        .with_compact(compact)
        .with_recompute(recompute)
        .with_distance_metric(leann_core::index::DistanceMetric::L2);

    // The documents are not needed afterwards, so hand each metadata map over
    // by value.
    for (text, meta) in sample_documents(n_docs) {
        builder.add_text(&text, meta);
    }

    builder.build_index(&index_path, &provider)?;
    Ok(index_path)
}