genegraph_storage/
traits.rs

1use log::debug;
2use smartcore::linalg::basic::matrix::DenseMatrix;
3use sprs::CsMat;
4use std::path::{Path, PathBuf};
5
6use crate::StorageResult;
7use crate::metadata::GeneMetadata;
8
9/// Storage backend trait for persisting ArrowSpace graph embeddings.
10///
11/// ## Initialization Protocol
12///
13/// Storage must be initialized before data can be saved:
14///
15/// 1. Call `save_metadata()` or `save_eigenmaps_all()`/`save_energymaps_all()` first
16/// 2. Only then can individual `save_dense()`, `save_sparse()`, `save_lambdas()` be called
17/// 3. All save operations validate that metadata exists
18///
19/// Filename format is : <base dir> / <instance name>_<stem>.lance
20///
21/// ## Async usage
22///
23/// This trait is now async-first for all I/O methods. Implementations (e.g. `LanceStorage`)
24/// must integrate with Tokio by providing non-blocking async methods; no `block_on` or
25/// nested runtimes are used inside the backend.
26///
27/// ## Example
28///
29/// ```ignore
30/// let storage = LanceStorage::new(base, name);
31/// let builder = ArrowSpaceBuilder::new();
32/// let (mut aspace, gl) = builder.build_for_persistence(data, "Eigen", None);
33///
34/// // This initializes the storage directory with metadata and writes all artifacts
35/// storage.save_eigenmaps_all(&builder, &mut aspace, &gl).await?;
36///
37/// // Now individual loads will work
38/// let raw = storage.load_dense("rawinput").await?;
39/// ```
40pub trait StorageBackend: Send + Sync {
41    /// Base directory of the instance
42    fn get_base(&self) -> String;
43    /// Name of the instance
44    fn get_name(&self) -> String;
45
46    fn path_to_uri(path: &Path) -> String;
47
48    ///
49    /// Returns `true` and the path to the metadata file if metadata file exists and is valid,
50    /// `false` otherwise.
51    /// This is used to avoid overwriting existing indexes.
52    fn exists(path: &str) -> (bool, Option<PathBuf>) {
53        let base_path = std::path::PathBuf::from(path);
54        if !base_path.exists() {
55            debug!("StorageBackend: path {:?} does not exist", base_path);
56            return (false, None);
57        }
58
59        // Check for any _metadata.json file in the directory
60        if let Ok(entries) = std::fs::read_dir(&base_path) {
61            for entry in entries.flatten() {
62                let path = entry.path();
63                if let Some(name) = path.file_name().and_then(|n| n.to_str()) {
64                    if name.ends_with("_metadata.json") {
65                        debug!("StorageBackend::exists: found metadata file at {:?}", path);
66                        return (true, Some(path));
67                    }
68                }
69            }
70        }
71        (false, None)
72    }
73
74    /// Returns the base directory path.
75    fn base_path(&self) -> PathBuf;
76    /// Returns the metadata path.
77    fn metadata_path(&self) -> PathBuf;
78
79    /// Load initial data using columnar format from a file path.
80    /// Implementations may use this as a helper for async `load_dense`.
81    async fn load_dense_from_file(&self, path: &Path) -> StorageResult<DenseMatrix<f64>>;
82
83    /// Compute the full Lance/parquet file path for a logical filetype.
84    fn file_path(&self, key: &str) -> PathBuf;
85
86    // =========
87    // ASYNC API
88    // =========
89
90    /// Saves a dense matrix. Requires metadata to exist.
91    async fn save_dense(
92        &self,
93        key: &str,
94        matrix: &DenseMatrix<f64>,
95        md_path: &Path,
96    ) -> StorageResult<()>;
97
98    /// Loads a dense matrix from storage.
99    async fn load_dense(&self, key: &str) -> StorageResult<DenseMatrix<f64>>;
100
101    /// Saves a sparse matrix. Requires metadata to exist.
102    async fn save_sparse(
103        &self,
104        key: &str,
105        matrix: &CsMat<f64>,
106        md_path: &Path,
107    ) -> StorageResult<()>;
108
109    /// Loads a sparse matrix from storage.
110    async fn load_sparse(&self, key: &str) -> StorageResult<CsMat<f64>>;
111
112    /// Saves lambda eigenvalues. Requires metadata to exist.
113    async fn save_lambdas(&self, lambdas: &[f64], md_path: &Path) -> StorageResult<()>;
114
115    /// Loads lambda eigenvalues from storage.
116    async fn load_lambdas(&self) -> StorageResult<Vec<f64>>;
117
118    /// Initializes storage by saving metadata. Must be called first.
119    async fn save_metadata(&self, metadata: &GeneMetadata) -> StorageResult<PathBuf>;
120
121    /// Loads metadata from storage.
122    async fn load_metadata(&self) -> StorageResult<GeneMetadata>;
123
124    /// Save vectors that are not lambdas but indices.
125    #[allow(dead_code)]
126    async fn save_index(&self, key: &str, vector: &[usize], md_path: &Path) -> StorageResult<()>;
127
128    /// save a generic f64 sequence
129    async fn save_vector(&self, key: &str, vector: &[f64], md_path: &Path) -> StorageResult<()>;
130
131    /// Save centroid_map (vector of usize mapping items to centroids)
132    async fn save_centroid_map(&self, map: &[usize], md_path: &Path) -> StorageResult<()>;
133
134    /// Load centroid_map
135    async fn load_centroid_map(&self) -> StorageResult<Vec<usize>>;
136    /// Save subcentroid_lambdas (tau values for subcentroids)
137    async fn save_subcentroid_lambdas(&self, lambdas: &[f64], md_path: &Path) -> StorageResult<()>;
138    /// Load subcentroid_lambdas
139    async fn load_subcentroid_lambdas(&self) -> StorageResult<Vec<f64>>;
140    /// Save subcentroids (dense matrix)
141    async fn save_subcentroids(
142        &self,
143        subcentroids: &DenseMatrix<f64>,
144        md_path: &Path,
145    ) -> StorageResult<()>;
146    /// Load subcentroids
147    async fn load_subcentroids(&self) -> StorageResult<Vec<Vec<f64>>>;
148
149    /// Save item norms (precomputed L2 norms for fast distance computation)
150    async fn save_item_norms(&self, item_norms: &[f64], md_path: &Path) -> StorageResult<()>;
151
152    /// Load item norms
153    async fn load_item_norms(&self) -> StorageResult<Vec<f64>>;
154
155    /// Save cluster assignments (Vec<Option<usize>>)
156    async fn save_cluster_assignments(
157        &self,
158        assignments: &[Option<usize>],
159        md_path: &Path,
160    ) -> StorageResult<()>;
161
162    /// Load cluster assignments
163    async fn load_cluster_assignments(&self) -> StorageResult<Vec<Option<usize>>>;
164
165    /// Load index or generic usize vector from storage.
166    #[allow(dead_code)]
167    async fn load_index(&self, key: &str) -> StorageResult<Vec<usize>>;
168
169    async fn load_vector(&self, key: &str) -> StorageResult<Vec<f64>>;
170
171    #[cfg(test)]
172    async fn save_dense_to_file(data: &DenseMatrix<f64>, path: &PathBuf) -> StorageResult<()>;
173}