genegraph_storage/
metadata.rs

1#![allow(dead_code)]
2
3use log::{debug, info};
4use serde::{Deserialize, Serialize};
5use std::collections::HashMap;
6use std::fs;
7use std::path::PathBuf;
8
9use crate::StorageError;
10use crate::traits::StorageBackend;
11
12/// Represent a single file spec in the persistence directory
13#[derive(Debug, Clone, Serialize, Deserialize)]
14pub struct FileInfo {
15    /// name of the file, can be equal to filetype if there is only one per type
16    pub filename: String,
17    /// see `Self::which_filetype(..)`: "rawinput" | "sub_centroids" | "lambdas" | "..."
18    pub filetype: String,
19    /// see `Self::which_format(..)`
20    pub storage_format: String,
21    pub rows: usize,
22    pub cols: usize,
23    pub nnz: Option<usize>,
24    pub size_bytes: Option<u64>,
25}
26
27impl FileInfo {
28    /// Create a file spec to add to the persistence directory
29    pub(crate) fn new(
30        filename: String,
31        filetype: &str,
32        data_shape: (usize, usize),
33        nnz: Option<usize>,
34        size_bytes: Option<u64>,
35    ) -> Self {
36        debug!(
37            "FileInfo::new: filename={}, filetype={}, shape={}x{}, nnz={:?}",
38            filename, filetype, data_shape.0, data_shape.1, nnz
39        );
40        Self {
41            filename,
42            filetype: filetype.into(),
43            storage_format: Self::which_format(filetype),
44            rows: data_shape.0,
45            cols: data_shape.1,
46            nnz,
47            size_bytes,
48        }
49    }
50
51    /// Assign the right format to the file type
52    pub fn which_format(filetype: &str) -> String {
53        match filetype {
54            "dense" => String::from("lance fixed-row"),
55            "sparse" => String::from("lance row-major"),
56            "vector" => String::from("lance row-major"),
57            _ => panic!("filetype not recognised {}", filetype),
58        }
59    }
60
61    /// Assign the right filetype to the keyname of the file
62    pub fn which_filetype(filetype: &str) -> String {
63        match filetype {
64            "rawinput" | "sub_centroids" => String::from("dense"),
65            "adjacency" | "laplacian" | "signals" => String::from("sparse"),
66            "lambdas" | "item_norms" | "norms" => String::from("vector"),
67            _ => panic!("key not recognised {}", filetype),
68        }
69    }
70}
71
72/// Metadata for an ArrowSpace index persisted to Lance storage.
73///
74/// Tracks dataset dimensions, builder configuration, file locations, and pipeline context.
75#[derive(Debug, Clone, Serialize, Deserialize)]
76pub struct GeneMetadata {
77    pub name_id: String,
78    pub nrows: usize,
79    pub ncols: usize,
80    pub base: String,
81    pub files: HashMap<String, FileInfo>,
82    pub created_at: String,
83}
84
85impl GeneMetadata {
86    /// Empty metadata object
87    /// do not use in test, use seed_metadata_eigen instead
88    fn new(name_id: &str) -> Self {
89        info!("SpaceMetadata::new: creating metadata for '{}'", name_id);
90        Self {
91            name_id: name_id.to_string(),
92            nrows: 0,
93            ncols: 0,
94            base: String::from(""),
95            files: HashMap::new(),
96            created_at: chrono::Utc::now().to_rfc3339(),
97        }
98    }
99
100    pub fn new_fileinfo(
101        &self,
102        key: &str,
103        filetype: &str,
104        data_shape: (usize, usize),
105        nnz: Option<usize>,
106        size_bytes: Option<u64>,
107    ) -> FileInfo {
108        FileInfo::new(
109            format!("{}_{}.lance", self.name_id, key),
110            filetype,
111            (data_shape.0, data_shape.1),
112            nnz,
113            size_bytes,
114        )
115    }
116
117    /// Read metadata file from JSON
118    pub async fn read(path: PathBuf) -> Result<Self, StorageError> {
119        info!("Reading metadata from {:?}", path);
120        let s = fs::read_to_string(path).map_err(|e| StorageError::Io(e.to_string()))?;
121        let md: GeneMetadata = serde_json::from_str(&s).map_err(StorageError::Serde)?;
122        info!("Metadata read successfully");
123        Ok(md)
124    }
125
126    /// Standard pipeline object
127    pub async fn seed_metadata<B: StorageBackend>(
128        name_id: &str,
129        nitems: usize,
130        nfeatures: usize,
131        storage: &B,
132    ) -> Result<GeneMetadata, StorageError> {
133        info!(
134            "SpaceMetadata::seed_metadata: seeding metadata for '{}' with nitems={}, nfeatures={}",
135            name_id, nitems, nfeatures
136        );
137
138        let mut md = Self::new(name_id)
139            .with_base(storage.base_path())
140            .with_dimensions(nitems, nfeatures);
141
142        debug!("SpaceMetadata::seed_metadata: registering files");
143
144        for (key, filetype, rows, cols, nnz) in [("rawinput", "dense", nitems, nfeatures, None)] {
145            debug!(
146                "SpaceMetadata::seed_metadata_eigen: adding file {} ({}x{}, nnz={:?})",
147                filetype, rows, cols, nnz
148            );
149            md = md.add_file(
150                key,
151                FileInfo::new(
152                    format!("{}_{}.lance", name_id, key),
153                    filetype,
154                    (rows, cols),
155                    nnz,
156                    None,
157                ),
158            );
159        }
160
161        debug!("SpaceMetadata::seed_metadata: saving metadata to storage");
162        storage.save_metadata(&md).await?;
163
164        info!(
165            "SpaceMetadata::seed_metadata: metadata seeded successfully for '{}'",
166            name_id
167        );
168        Ok(md)
169    }
170
171    pub fn with_base(mut self, base_path: PathBuf) -> Self {
172        self.base = base_path.to_string_lossy().to_string();
173        self
174    }
175
176    pub fn with_dimensions(mut self, rows: usize, cols: usize) -> Self {
177        debug!(
178            "SpaceMetadata::with_dimensions: setting dimensions to {}x{}",
179            rows, cols
180        );
181        self.nrows = rows;
182        self.ncols = cols;
183        self
184    }
185
186    pub fn add_file(mut self, key: &str, info: FileInfo) -> Self {
187        debug!(
188            "SpaceMetadata::add_file: adding file '{}' ({})",
189            key, info.filename
190        );
191        self.files.insert(key.to_string(), info);
192        self
193    }
194}