Skip to main content

leann_core/
index.rs

1use anyhow::{Context, Result};
2use serde::{Deserialize, Serialize};
3use std::collections::HashMap;
4use std::path::{Path, PathBuf};
5
6/// Represents the metadata stored in `<name>.meta.json` for a LEANN index.
7#[derive(Debug, Clone, Serialize, Deserialize)]
8pub struct IndexMeta {
9    pub version: String,
10    pub backend_name: String,
11    pub embedding_model: String,
12    pub dimensions: usize,
13    #[serde(default)]
14    pub backend_kwargs: HashMap<String, serde_json::Value>,
15    #[serde(default = "default_embedding_mode")]
16    pub embedding_mode: String,
17    #[serde(default)]
18    pub passage_sources: Vec<PassageSource>,
19    #[serde(default)]
20    pub embedding_options: HashMap<String, serde_json::Value>,
21    /// Whether the HNSW index uses compact CSR storage.
22    #[serde(default)]
23    pub is_compact: Option<bool>,
24    /// Whether embeddings have been pruned (for recompute mode).
25    #[serde(default)]
26    pub is_pruned: Option<bool>,
27    /// Total passages in the index (updated on append).
28    #[serde(default)]
29    pub total_passages: Option<usize>,
30    /// Set if built from pre-computed embeddings.
31    #[serde(default)]
32    pub built_from_precomputed_embeddings: Option<bool>,
33    #[serde(default)]
34    pub embeddings_source: Option<String>,
35}
36
/// Serde fallback used for `IndexMeta::embedding_mode` when the key is missing.
fn default_embedding_mode() -> String {
    String::from("sentence-transformers")
}
40
41/// Describes a passage source (JSONL file + offset index).
42#[derive(Debug, Clone, Serialize, Deserialize)]
43pub struct PassageSource {
44    #[serde(rename = "type")]
45    pub source_type: String,
46    #[serde(default)]
47    pub path: String,
48    #[serde(default)]
49    pub index_path: String,
50    #[serde(default)]
51    pub path_relative: Option<String>,
52    #[serde(default)]
53    pub index_path_relative: Option<String>,
54}
55
56/// The distance metric used for vector similarity.
57#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
58#[serde(rename_all = "lowercase")]
59pub enum DistanceMetric {
60    #[default]
61    Mips,
62    L2,
63    Cosine,
64}
65
66impl DistanceMetric {
67    pub fn from_str_lossy(s: &str) -> Self {
68        match s.to_lowercase().as_str() {
69            "l2" => DistanceMetric::L2,
70            "cosine" => DistanceMetric::Cosine,
71            _ => DistanceMetric::Mips,
72        }
73    }
74}
75
76impl IndexMeta {
77    /// Load metadata from a `.meta.json` file.
78    pub fn load(path: &Path) -> Result<Self> {
79        let content =
80            std::fs::read_to_string(path).with_context(|| format!("reading {}", path.display()))?;
81        let meta: IndexMeta = serde_json::from_str(&content)
82            .with_context(|| format!("parsing meta.json at {}", path.display()))?;
83        Ok(meta)
84    }
85
86    /// Save metadata to a `.meta.json` file.
87    pub fn save(&self, path: &Path) -> Result<()> {
88        let content = serde_json::to_string_pretty(self)?;
89        std::fs::write(path, content)?;
90        Ok(())
91    }
92
93    /// Get the distance metric from backend kwargs.
94    pub fn distance_metric(&self) -> DistanceMetric {
95        self.backend_kwargs
96            .get("distance_metric")
97            .and_then(|v| v.as_str())
98            .map(DistanceMetric::from_str_lossy)
99            .unwrap_or_default()
100    }
101
102    /// Whether this index requires embedding recomputation at search time.
103    pub fn requires_recompute(&self) -> bool {
104        if let Some(pruned) = self.is_pruned {
105            return pruned;
106        }
107        self.backend_kwargs
108            .get("is_recompute")
109            .and_then(|v| v.as_bool())
110            .unwrap_or(true)
111    }
112}
113
/// Resolve file paths associated with an index.
///
/// All derived paths are siblings of the index path inside `base_dir`, named
/// after `index_name` (or after `index_name` with a trailing `.leann` removed
/// for the index and id-map files).
#[derive(Debug, Clone)]
pub struct IndexPaths {
    /// Directory that contains the index files.
    pub base_dir: PathBuf,
    /// Final component of the index path, e.g. `my_index.leann`.
    pub index_name: String,
}

impl IndexPaths {
    /// Split `index_path` into its directory and file-name parts.
    ///
    /// `base_dir` falls back to `.` when the path has no directory component.
    /// `index_name` is the lossily UTF-8 converted file name, or an empty
    /// string when the path has none.
    pub fn new(index_path: &Path) -> Self {
        // `Path::parent` yields `Some("")` for a bare file name, so an empty
        // parent needs the same `.` fallback as `None`; an empty path is not
        // a usable directory.
        let base_dir = index_path
            .parent()
            .filter(|dir| !dir.as_os_str().is_empty())
            .unwrap_or_else(|| Path::new("."))
            .to_path_buf();
        let index_name = index_path
            .file_name()
            .unwrap_or_default()
            .to_string_lossy()
            .into_owned();
        Self {
            base_dir,
            index_name,
        }
    }

    /// Path of `<index_name>.meta.json`.
    pub fn meta_path(&self) -> PathBuf {
        self.base_dir.join(format!("{}.meta.json", self.index_name))
    }

    /// Path of `<index_name>.passages.jsonl`.
    pub fn passages_path(&self) -> PathBuf {
        self.base_dir
            .join(format!("{}.passages.jsonl", self.index_name))
    }

    /// Path of `<index_name>.passages.idx`.
    pub fn offset_path(&self) -> PathBuf {
        self.base_dir
            .join(format!("{}.passages.idx", self.index_name))
    }

    /// Path of `<stem>.index`, where the stem drops a trailing `.leann`.
    pub fn index_file_path(&self) -> PathBuf {
        self.base_dir.join(format!("{}.index", self.stem()))
    }

    /// Path of `<stem>.ids.txt`, where the stem drops a trailing `.leann`.
    pub fn id_map_path(&self) -> PathBuf {
        self.base_dir.join(format!("{}.ids.txt", self.stem()))
    }

    /// `index_name` without a trailing `.leann`, or unchanged if it has none.
    fn stem(&self) -> &str {
        self.index_name
            .strip_suffix(".leann")
            .unwrap_or(&self.index_name)
    }
}
165
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;
    use tempfile::NamedTempFile;

    /// A fully populated `IndexMeta` survives a JSON serialize/parse cycle.
    #[test]
    fn test_index_meta_roundtrip() {
        let original = IndexMeta {
            version: String::from("1.0"),
            backend_name: String::from("hnsw"),
            embedding_model: String::from("facebook/contriever"),
            dimensions: 768,
            backend_kwargs: HashMap::new(),
            embedding_mode: String::from("sentence-transformers"),
            passage_sources: Vec::new(),
            embedding_options: HashMap::new(),
            is_compact: Some(true),
            is_pruned: Some(true),
            total_passages: None,
            built_from_precomputed_embeddings: None,
            embeddings_source: None,
        };

        let encoded = serde_json::to_string_pretty(&original).unwrap();
        let decoded: IndexMeta = serde_json::from_str(&encoded).unwrap();

        assert_eq!(decoded.backend_name, "hnsw");
        assert_eq!(decoded.dimensions, 768);
    }

    /// Loading a minimal document fills in the serde defaults.
    #[test]
    fn test_index_meta_load() {
        let mut tmp = NamedTempFile::new().unwrap();
        write!(
            tmp,
            r#"{{
            "version": "1.0",
            "backend_name": "hnsw",
            "embedding_model": "test-model",
            "dimensions": 384,
            "passage_sources": []
        }}"#
        )
        .unwrap();

        let loaded = IndexMeta::load(tmp.path()).unwrap();

        assert_eq!(loaded.embedding_model, "test-model");
        assert_eq!(loaded.dimensions, 384);
        assert_eq!(loaded.embedding_mode, "sentence-transformers");
    }

    /// Known names map to their variant; anything else falls back to `Mips`.
    #[test]
    fn test_distance_metric_parsing() {
        let cases = [
            ("mips", DistanceMetric::Mips),
            ("l2", DistanceMetric::L2),
            ("cosine", DistanceMetric::Cosine),
            ("unknown", DistanceMetric::Mips),
        ];
        for (input, expected) in cases {
            assert_eq!(DistanceMetric::from_str_lossy(input), expected);
        }
    }

    /// Derived paths are siblings of the index path with the expected suffixes.
    #[test]
    fn test_index_paths() {
        let resolved = IndexPaths::new(Path::new("/data/my_index.leann"));

        assert_eq!(resolved.base_dir, Path::new("/data"));
        assert_eq!(resolved.index_name, "my_index.leann");

        let expectations = [
            (resolved.meta_path(), "/data/my_index.leann.meta.json"),
            (
                resolved.passages_path(),
                "/data/my_index.leann.passages.jsonl",
            ),
            (resolved.index_file_path(), "/data/my_index.index"),
            (resolved.id_map_path(), "/data/my_index.ids.txt"),
        ];
        for (actual, expected) in expectations {
            assert_eq!(actual, Path::new(expected));
        }
    }
}
247}