genegraph_storage/traits.rs
1use log::debug;
2use smartcore::linalg::basic::matrix::DenseMatrix;
3use sprs::CsMat;
4use std::path::{Path, PathBuf};
5
6use crate::StorageResult;
7use crate::metadata::GeneMetadata;
8
9/// Storage backend trait for persisting ArrowSpace graph embeddings.
10///
11/// ## Initialization Protocol
12///
13/// Storage must be initialized before data can be saved:
14///
15/// 1. Call `save_metadata()` or `save_eigenmaps_all()`/`save_energymaps_all()` first
16/// 2. Only then can individual `save_dense()`, `save_sparse()`, `save_lambdas()` be called
17/// 3. All save operations validate that metadata exists
18///
19/// Filename format is : <base dir> / <instance name>_<stem>.lance
20///
21/// ## Async usage
22///
23/// This trait is now async-first for all I/O methods. Implementations (e.g. `LanceStorage`)
24/// must integrate with Tokio by providing non-blocking async methods; no `block_on` or
25/// nested runtimes are used inside the backend.
26///
27/// ## Example
28///
29/// ```ignore
30/// let storage = LanceStorage::new(base, name);
31/// let builder = ArrowSpaceBuilder::new();
32/// let (mut aspace, gl) = builder.build_for_persistence(data, "Eigen", None);
33///
34/// // This initializes the storage directory with metadata and writes all artifacts
35/// storage.save_eigenmaps_all(&builder, &mut aspace, &gl).await?;
36///
37/// // Now individual loads will work
38/// let raw = storage.load_dense("rawinput").await?;
39/// ```
40pub trait StorageBackend: Send + Sync {
41 /// Base directory of the instance
42 fn get_base(&self) -> String;
43 /// Name of the instance
44 fn get_name(&self) -> String;
45
46 fn path_to_uri(path: &Path) -> String;
47
48 ///
49 /// Returns `true` and the path to the metadata file if metadata file exists and is valid,
50 /// `false` otherwise.
51 /// This is used to avoid overwriting existing indexes.
52 fn exists(path: &str) -> (bool, Option<PathBuf>) {
53 let base_path = std::path::PathBuf::from(path);
54 if !base_path.exists() {
55 debug!("StorageBackend: path {:?} does not exist", base_path);
56 return (false, None);
57 }
58
59 // Check for any _metadata.json file in the directory
60 if let Ok(entries) = std::fs::read_dir(&base_path) {
61 for entry in entries.flatten() {
62 let path = entry.path();
63 if let Some(name) = path.file_name().and_then(|n| n.to_str()) {
64 if name.ends_with("_metadata.json") {
65 debug!("StorageBackend::exists: found metadata file at {:?}", path);
66 return (true, Some(path));
67 }
68 }
69 }
70 }
71 (false, None)
72 }
73
74 /// Returns the base directory path.
75 fn base_path(&self) -> PathBuf;
76 /// Returns the metadata path.
77 fn metadata_path(&self) -> PathBuf;
78
79 /// Load initial data using columnar format from a file path.
80 /// Implementations may use this as a helper for async `load_dense`.
81 async fn load_dense_from_file(&self, path: &Path) -> StorageResult<DenseMatrix<f64>>;
82
83 /// Compute the full Lance/parquet file path for a logical filetype.
84 fn file_path(&self, key: &str) -> PathBuf;
85
86 // =========
87 // ASYNC API
88 // =========
89
90 /// Saves a dense matrix. Requires metadata to exist.
91 async fn save_dense(
92 &self,
93 key: &str,
94 matrix: &DenseMatrix<f64>,
95 md_path: &Path,
96 ) -> StorageResult<()>;
97
98 /// Loads a dense matrix from storage.
99 async fn load_dense(&self, key: &str) -> StorageResult<DenseMatrix<f64>>;
100
101 /// Saves a sparse matrix. Requires metadata to exist.
102 async fn save_sparse(
103 &self,
104 key: &str,
105 matrix: &CsMat<f64>,
106 md_path: &Path,
107 ) -> StorageResult<()>;
108
109 /// Loads a sparse matrix from storage.
110 async fn load_sparse(&self, key: &str) -> StorageResult<CsMat<f64>>;
111
112 /// Saves lambda eigenvalues. Requires metadata to exist.
113 async fn save_lambdas(&self, lambdas: &[f64], md_path: &Path) -> StorageResult<()>;
114
115 /// Loads lambda eigenvalues from storage.
116 async fn load_lambdas(&self) -> StorageResult<Vec<f64>>;
117
118 /// Initializes storage by saving metadata. Must be called first.
119 async fn save_metadata(&self, metadata: &GeneMetadata) -> StorageResult<PathBuf>;
120
121 /// Loads metadata from storage.
122 async fn load_metadata(&self) -> StorageResult<GeneMetadata>;
123
124 /// Save vectors that are not lambdas but indices.
125 #[allow(dead_code)]
126 async fn save_index(&self, key: &str, vector: &[usize], md_path: &Path) -> StorageResult<()>;
127
128 /// save a generic f64 sequence
129 async fn save_vector(&self, key: &str, vector: &[f64], md_path: &Path) -> StorageResult<()>;
130
131 /// Save centroid_map (vector of usize mapping items to centroids)
132 async fn save_centroid_map(&self, map: &[usize], md_path: &Path) -> StorageResult<()>;
133
134 /// Load centroid_map
135 async fn load_centroid_map(&self) -> StorageResult<Vec<usize>>;
136 /// Save subcentroid_lambdas (tau values for subcentroids)
137 async fn save_subcentroid_lambdas(&self, lambdas: &[f64], md_path: &Path) -> StorageResult<()>;
138 /// Load subcentroid_lambdas
139 async fn load_subcentroid_lambdas(&self) -> StorageResult<Vec<f64>>;
140 /// Save subcentroids (dense matrix)
141 async fn save_subcentroids(
142 &self,
143 subcentroids: &DenseMatrix<f64>,
144 md_path: &Path,
145 ) -> StorageResult<()>;
146 /// Load subcentroids
147 async fn load_subcentroids(&self) -> StorageResult<Vec<Vec<f64>>>;
148
149 /// Save item norms (precomputed L2 norms for fast distance computation)
150 async fn save_item_norms(&self, item_norms: &[f64], md_path: &Path) -> StorageResult<()>;
151
152 /// Load item norms
153 async fn load_item_norms(&self) -> StorageResult<Vec<f64>>;
154
155 /// Save cluster assignments (Vec<Option<usize>>)
156 async fn save_cluster_assignments(
157 &self,
158 assignments: &[Option<usize>],
159 md_path: &Path,
160 ) -> StorageResult<()>;
161
162 /// Load cluster assignments
163 async fn load_cluster_assignments(&self) -> StorageResult<Vec<Option<usize>>>;
164
165 /// Load index or generic usize vector from storage.
166 #[allow(dead_code)]
167 async fn load_index(&self, key: &str) -> StorageResult<Vec<usize>>;
168
169 async fn load_vector(&self, key: &str) -> StorageResult<Vec<f64>>;
170
171 #[cfg(test)]
172 async fn save_dense_to_file(data: &DenseMatrix<f64>, path: &PathBuf) -> StorageResult<()>;
173}