genegraph_storage/traits.rs
1use log::debug;
2use smartcore::linalg::basic::matrix::DenseMatrix;
3use sprs::CsMat;
4use std::path::{Path, PathBuf};
5
6use crate::StorageResult;
7use crate::metadata::GeneMetadata;
8
9/// Async storage backend for Lance-based graph and embedding data.
10///
11/// This trait defines the minimal async API required to persist and reload
12/// all artifacts used by Javelin:
13///
14/// - Dense matrices (embeddings, eigenmaps, energy maps)
15/// - Sparse matrices in CSR form (e.g. Laplacians, adjacency)
16/// - Scalar vectors (eigenvalues, norms, generic f64 sequences)
17/// - Index-like vectors (usize mappings and cluster assignments)
18/// - Clustering metadata (centroid maps, subcentroids, lambdas)
19/// - Global metadata describing the dataset layout and dimensions
20///
21/// ## Initialization
22///
23/// Storage must be initialized before saving any data:
24///
25/// 1. Call `save_metadata()` once to write an initial `*_metadata.json`.
26/// 2. Subsequent `save_*` calls validate that metadata exists and is consistent.
27/// 3. `exists()` can be used to detect and reuse an existing initialized store.
28///
29/// Filenames are conventionally:
30///
31/// ```ignore
32/// <base dir>/<instance name or name id>_<key>.lance
33/// ```
34///
35/// ## Async usage
36///
37/// All I/O functions are async and intended to be called from a Tokio runtime.
38/// Implementations (e.g. `LanceStorage`) must not create their own runtimes or
39/// block on I/O internally.
40///
41/// ## High-level flow
42///
43/// - Dense data:
44/// - `save_dense("raw_input", &matrix, md_path)`
45/// - `load_dense("raw_input")`
46///
47/// - Sparse data:
48/// - `save_sparse("laplacian", &csr, md_path)`
49/// - `load_sparse("laplacian")`
50///
51/// - Scalars and indices:
52/// - `save_lambdas`, `load_lambdas`
53/// - `save_vector`, `load_vector`
54/// - `save_index`, `load_index`
55/// - `save_centroid_map`, `load_centroid_map`
56/// - `save_item_norms`, `load_item_norms`
57/// - `save_cluster_assignments`, `load_cluster_assignments`
58///
59/// - Clustering structure:
60/// - `save_subcentroids`, `load_subcentroids`
61/// - `save_subcentroid_lambdas`, `load_subcentroid_lambdas`
62///
63/// Implementations are free to choose the on-disk layout as long as they honor
64/// these logical keys and round-trip semantics.
65pub trait StorageBackend: Send + Sync {
66 /// Base directory of the instance
67 fn get_base(&self) -> String;
68 /// Name of the instance
69 fn get_name(&self) -> String;
70
71 fn path_to_uri(path: &Path) -> String;
72
73 ///
74 /// Returns `true` and the path to the metadata file if metadata file exists and is valid,
75 /// `false` otherwise.
76 /// This is used to avoid overwriting existing indexes.
77 fn exists(path: &str) -> (bool, Option<PathBuf>) {
78 let base_path = std::path::PathBuf::from(path);
79 if !base_path.exists() {
80 debug!("StorageBackend: path {:?} does not exist", base_path);
81 return (false, None);
82 }
83
84 // Check for any _metadata.json file in the directory
85 if let Ok(entries) = std::fs::read_dir(&base_path) {
86 for entry in entries.flatten() {
87 let path = entry.path();
88 if let Some(name) = path.file_name().and_then(|n| n.to_str())
89 && name.ends_with("_metadata.json")
90 {
91 debug!("StorageBackend::exists: found metadata file at {:?}", path);
92 return (true, Some(path));
93 }
94 }
95 }
96 (false, None)
97 }
98
99 /// Returns the base directory path.
100 fn base_path(&self) -> PathBuf;
101 /// Returns the metadata path.
102 fn metadata_path(&self) -> PathBuf;
103 /// return the base path as file:// string
104 fn basepath_to_uri(&self) -> String;
105
106 /// Load initial data using columnar format from a file path.
107 /// Implementations may use this as a helper for async `load_dense`.
108 async fn load_dense_from_file(&self, path: &Path) -> StorageResult<DenseMatrix<f64>>;
109
110 /// Compute the full Lance/parquet file path for a logical filetype.
111 fn file_path(&self, key: &str) -> PathBuf;
112
113 // =========
114 // ASYNC API
115 // =========
116
117 /// Saves a dense matrix. Requires metadata to exist.
118 async fn save_dense(
119 &self,
120 key: &str,
121 matrix: &DenseMatrix<f64>,
122 md_path: &Path,
123 ) -> StorageResult<()>;
124
125 /// Loads a dense matrix from storage.
126 async fn load_dense(&self, key: &str) -> StorageResult<DenseMatrix<f64>>;
127
128 /// Saves a sparse matrix. Requires metadata to exist.
129 async fn save_sparse(
130 &self,
131 key: &str,
132 matrix: &CsMat<f64>,
133 md_path: &Path,
134 ) -> StorageResult<()>;
135
136 /// Loads a sparse matrix from storage.
137 async fn load_sparse(&self, key: &str) -> StorageResult<CsMat<f64>>;
138
139 /// Saves lambda eigenvalues. Requires metadata to exist.
140 async fn save_lambdas(&self, lambdas: &[f64], md_path: &Path) -> StorageResult<()>;
141
142 /// Loads lambda eigenvalues from storage.
143 async fn load_lambdas(&self) -> StorageResult<Vec<f64>>;
144
145 /// Initializes storage by saving metadata. Must be called first.
146 async fn save_metadata(&self, metadata: &GeneMetadata) -> StorageResult<PathBuf>;
147
148 /// Loads metadata from storage.
149 async fn load_metadata(&self) -> StorageResult<GeneMetadata>;
150
151 /// Save vectors that are not lambdas but indices.
152 #[allow(dead_code)]
153 async fn save_index(&self, key: &str, vector: &[usize], md_path: &Path) -> StorageResult<()>;
154
155 /// save a generic f64 sequence
156 async fn save_vector(&self, key: &str, vector: &[f64], md_path: &Path) -> StorageResult<()>;
157
158 /// Save centroid_map (vector of usize mapping items to centroids)
159 async fn save_centroid_map(&self, map: &[usize], md_path: &Path) -> StorageResult<()>;
160
161 /// Load centroid_map
162 async fn load_centroid_map(&self) -> StorageResult<Vec<usize>>;
163 /// Save subcentroid_lambdas (tau values for subcentroids)
164 async fn save_subcentroid_lambdas(&self, lambdas: &[f64], md_path: &Path) -> StorageResult<()>;
165 /// Load subcentroid_lambdas
166 async fn load_subcentroid_lambdas(&self) -> StorageResult<Vec<f64>>;
167 /// Save subcentroids (dense matrix)
168 async fn save_subcentroids(
169 &self,
170 subcentroids: &DenseMatrix<f64>,
171 md_path: &Path,
172 ) -> StorageResult<()>;
173 /// Load subcentroids
174 async fn load_subcentroids(&self) -> StorageResult<Vec<Vec<f64>>>;
175
176 /// Save item norms (precomputed L2 norms for fast distance computation)
177 async fn save_item_norms(&self, item_norms: &[f64], md_path: &Path) -> StorageResult<()>;
178
179 /// Load item norms
180 async fn load_item_norms(&self) -> StorageResult<Vec<f64>>;
181
182 /// Save cluster assignments (Vec<Option<usize>>)
183 async fn save_cluster_assignments(
184 &self,
185 assignments: &[Option<usize>],
186 md_path: &Path,
187 ) -> StorageResult<()>;
188
189 /// Load cluster assignments
190 async fn load_cluster_assignments(&self) -> StorageResult<Vec<Option<usize>>>;
191
192 /// Load index or generic usize vector from storage.
193 #[allow(dead_code)]
194 async fn load_index(&self, key: &str) -> StorageResult<Vec<usize>>;
195
196 async fn load_vector(&self, key: &str) -> StorageResult<Vec<f64>>;
197
198 async fn save_dense_to_file(data: &DenseMatrix<f64>, path: &PathBuf) -> StorageResult<()>;
199}