genegraph_storage/
traits.rs

1use log::debug;
2use smartcore::linalg::basic::matrix::DenseMatrix;
3use sprs::CsMat;
4use std::path::{Path, PathBuf};
5
6use crate::StorageResult;
7use crate::metadata::GeneMetadata;
8
9/// Async storage backend for Lance-based graph and embedding data.
10///
11/// This trait defines the minimal async API required to persist and reload
12/// all artifacts used by Javelin:
13///
14/// - Dense matrices (embeddings, eigenmaps, energy maps)
15/// - Sparse matrices in CSR form (e.g. Laplacians, adjacency)
16/// - Scalar vectors (eigenvalues, norms, generic f64 sequences)
17/// - Index-like vectors (usize mappings and cluster assignments)
18/// - Clustering metadata (centroid maps, subcentroids, lambdas)
19/// - Global metadata describing the dataset layout and dimensions
20///
21/// ## Initialization
22///
23/// Storage must be initialized before saving any data:
24///
25/// 1. Call `save_metadata()` once to write an initial `*_metadata.json`.
26/// 2. Subsequent `save_*` calls validate that metadata exists and is consistent.
27/// 3. `exists()` can be used to detect and reuse an existing initialized store.
28///
29/// Filenames are conventionally:
30///
31/// ```ignore
32/// <base dir>/<instance name or name id>_<key>.lance
33/// ```
34///
35/// ## Async usage
36///
37/// All I/O functions are async and intended to be called from a Tokio runtime.
38/// Implementations (e.g. `LanceStorage`) must not create their own runtimes or
39/// block on I/O internally.
40///
41/// ## High-level flow
42///
43/// - Dense data:
44///   - `save_dense("raw_input", &matrix, md_path)`
45///   - `load_dense("raw_input")`
46///
47/// - Sparse data:
48///   - `save_sparse("laplacian", &csr, md_path)`
49///   - `load_sparse("laplacian")`
50///
51/// - Scalars and indices:
52///   - `save_lambdas`, `load_lambdas`
53///   - `save_vector`, `load_vector`
54///   - `save_index`, `load_index`
55///   - `save_centroid_map`, `load_centroid_map`
56///   - `save_item_norms`, `load_item_norms`
57///   - `save_cluster_assignments`, `load_cluster_assignments`
58///
59/// - Clustering structure:
60///   - `save_subcentroids`, `load_subcentroids`
61///   - `save_subcentroid_lambdas`, `load_subcentroid_lambdas`
62///
63/// Implementations are free to choose the on-disk layout as long as they honor
64/// these logical keys and round-trip semantics.
65pub trait StorageBackend: Send + Sync {
66    /// Base directory of the instance
67    fn get_base(&self) -> String;
68    /// Name of the instance
69    fn get_name(&self) -> String;
70
71    fn path_to_uri(path: &Path) -> String;
72
73    ///
74    /// Returns `true` and the path to the metadata file if metadata file exists and is valid,
75    /// `false` otherwise.
76    /// This is used to avoid overwriting existing indexes.
77    fn exists(path: &str) -> (bool, Option<PathBuf>) {
78        let base_path = std::path::PathBuf::from(path);
79        if !base_path.exists() {
80            debug!("StorageBackend: path {:?} does not exist", base_path);
81            return (false, None);
82        }
83
84        // Check for any _metadata.json file in the directory
85        if let Ok(entries) = std::fs::read_dir(&base_path) {
86            for entry in entries.flatten() {
87                let path = entry.path();
88                if let Some(name) = path.file_name().and_then(|n| n.to_str())
89                    && name.ends_with("_metadata.json")
90                {
91                    debug!("StorageBackend::exists: found metadata file at {:?}", path);
92                    return (true, Some(path));
93                }
94            }
95        }
96        (false, None)
97    }
98
99    /// Returns the base directory path.
100    fn base_path(&self) -> PathBuf;
101    /// Returns the metadata path.
102    fn metadata_path(&self) -> PathBuf;
103    /// return the base path as file:// string
104    fn basepath_to_uri(&self) -> String;
105
106    /// Load initial data using columnar format from a file path.
107    /// Implementations may use this as a helper for async `load_dense`.
108    async fn load_dense_from_file(&self, path: &Path) -> StorageResult<DenseMatrix<f64>>;
109
110    /// Compute the full Lance/parquet file path for a logical filetype.
111    fn file_path(&self, key: &str) -> PathBuf;
112
113    // =========
114    // ASYNC API
115    // =========
116
117    /// Saves a dense matrix. Requires metadata to exist.
118    async fn save_dense(
119        &self,
120        key: &str,
121        matrix: &DenseMatrix<f64>,
122        md_path: &Path,
123    ) -> StorageResult<()>;
124
125    /// Loads a dense matrix from storage.
126    async fn load_dense(&self, key: &str) -> StorageResult<DenseMatrix<f64>>;
127
128    /// Saves a sparse matrix. Requires metadata to exist.
129    async fn save_sparse(
130        &self,
131        key: &str,
132        matrix: &CsMat<f64>,
133        md_path: &Path,
134    ) -> StorageResult<()>;
135
136    /// Loads a sparse matrix from storage.
137    async fn load_sparse(&self, key: &str) -> StorageResult<CsMat<f64>>;
138
139    /// Saves lambda eigenvalues. Requires metadata to exist.
140    async fn save_lambdas(&self, lambdas: &[f64], md_path: &Path) -> StorageResult<()>;
141
142    /// Loads lambda eigenvalues from storage.
143    async fn load_lambdas(&self) -> StorageResult<Vec<f64>>;
144
145    /// Initializes storage by saving metadata. Must be called first.
146    async fn save_metadata(&self, metadata: &GeneMetadata) -> StorageResult<PathBuf>;
147
148    /// Loads metadata from storage.
149    async fn load_metadata(&self) -> StorageResult<GeneMetadata>;
150
151    /// Save vectors that are not lambdas but indices.
152    #[allow(dead_code)]
153    async fn save_index(&self, key: &str, vector: &[usize], md_path: &Path) -> StorageResult<()>;
154
155    /// save a generic f64 sequence
156    async fn save_vector(&self, key: &str, vector: &[f64], md_path: &Path) -> StorageResult<()>;
157
158    /// Save centroid_map (vector of usize mapping items to centroids)
159    async fn save_centroid_map(&self, map: &[usize], md_path: &Path) -> StorageResult<()>;
160
161    /// Load centroid_map
162    async fn load_centroid_map(&self) -> StorageResult<Vec<usize>>;
163    /// Save subcentroid_lambdas (tau values for subcentroids)
164    async fn save_subcentroid_lambdas(&self, lambdas: &[f64], md_path: &Path) -> StorageResult<()>;
165    /// Load subcentroid_lambdas
166    async fn load_subcentroid_lambdas(&self) -> StorageResult<Vec<f64>>;
167    /// Save subcentroids (dense matrix)
168    async fn save_subcentroids(
169        &self,
170        subcentroids: &DenseMatrix<f64>,
171        md_path: &Path,
172    ) -> StorageResult<()>;
173    /// Load subcentroids
174    async fn load_subcentroids(&self) -> StorageResult<Vec<Vec<f64>>>;
175
176    /// Save item norms (precomputed L2 norms for fast distance computation)
177    async fn save_item_norms(&self, item_norms: &[f64], md_path: &Path) -> StorageResult<()>;
178
179    /// Load item norms
180    async fn load_item_norms(&self) -> StorageResult<Vec<f64>>;
181
182    /// Save cluster assignments (Vec<Option<usize>>)
183    async fn save_cluster_assignments(
184        &self,
185        assignments: &[Option<usize>],
186        md_path: &Path,
187    ) -> StorageResult<()>;
188
189    /// Load cluster assignments
190    async fn load_cluster_assignments(&self) -> StorageResult<Vec<Option<usize>>>;
191
192    /// Load index or generic usize vector from storage.
193    #[allow(dead_code)]
194    async fn load_index(&self, key: &str) -> StorageResult<Vec<usize>>;
195
196    async fn load_vector(&self, key: &str) -> StorageResult<Vec<f64>>;
197
198    async fn save_dense_to_file(data: &DenseMatrix<f64>, path: &PathBuf) -> StorageResult<()>;
199}