genegraph_storage/
metadata.rs

1#![allow(dead_code)]
2
3use log::{debug, info};
4use serde::{Deserialize, Serialize};
5use std::collections::HashMap;
6use std::fs;
7use std::path::PathBuf;
8
9use crate::StorageError;
10use crate::traits::StorageBackend;
11
12/// Represent a single file spec in the persistence directory
13#[derive(Debug, Clone, Serialize, Deserialize)]
14pub struct FileInfo {
15    /// name of the file, can be equal to filetype if there is only one per type
16    pub filename: String,
17    /// see `Self::which_filetype(..)`: "rawinput" | "sub_centroids" | "lambdas" | "..."
18    pub filetype: String,
19    /// see `Self::which_format(..)`
20    pub storage_format: String,
21    pub rows: usize,
22    pub cols: usize,
23    pub nnz: Option<usize>,
24    pub size_bytes: Option<u64>,
25}
26
27impl FileInfo {
28    /// Create a file spec to add to the persistence directory
29    pub(crate) fn new(
30        filename: String,
31        filetype: &str,
32        data_shape: (usize, usize),
33        nnz: Option<usize>,
34        size_bytes: Option<u64>,
35    ) -> Self {
36        debug!(
37            "FileInfo::new: filename={}, filetype={}, shape={}x{}, nnz={:?}",
38            filename, filetype, data_shape.0, data_shape.1, nnz
39        );
40        Self {
41            filename,
42            filetype: filetype.into(),
43            storage_format: Self::which_format(filetype),
44            rows: data_shape.0,
45            cols: data_shape.1,
46            nnz,
47            size_bytes,
48        }
49    }
50
51    /// Assign the right format to the file type
52    pub fn which_format(filetype: &str) -> String {
53        match filetype {
54            "dense" => String::from("lance fixed-row"),
55            "sparse" => String::from("lance row-major"),
56            "vector" => String::from("lance row-major"),
57            _ => panic!("filetype not recognised {}", filetype),
58        }
59    }
60
61    /// Assign the right filetype to the keyname of the file
62    pub fn which_filetype(filetype: &str) -> String {
63        match filetype {
64            "rawinput" | "sub_centroids" => String::from("dense"),
65            "adjacency" | "laplacian" | "signals" => String::from("sparse"),
66            "lambdas" | "item_norms" | "norms" => String::from("vector"),
67            _ => panic!("key not recognised {}", filetype),
68        }
69    }
70}
71
72/// Metadata for an ArrowSpace index persisted to Lance storage.
73///
74/// Tracks dataset dimensions, builder configuration, file locations, and pipeline context.
75#[derive(Debug, Clone, Serialize, Deserialize)]
76pub struct GeneMetadata {
77    pub name_id: String,
78    pub nrows: usize,
79    pub ncols: usize,
80    pub base: String,
81    pub files: HashMap<String, FileInfo>,
82    pub created_at: String,
83}
84
85impl GeneMetadata {
86    /// Empty metadata object
87    /// do not use in test, use seed_metadata_eigen instead
88    pub fn new(name_id: &str) -> Self {
89        info!("GeneMetadata::new: creating metadata for '{}'", name_id);
90        Self {
91            name_id: name_id.to_string(),
92            nrows: 0,
93            ncols: 0,
94            base: String::from(""),
95            files: HashMap::new(),
96            created_at: chrono::Utc::now().to_rfc3339(),
97        }
98    }
99
100    pub fn new_fileinfo(
101        &self,
102        key: &str,
103        filetype: &str,
104        data_shape: (usize, usize),
105        nnz: Option<usize>,
106        size_bytes: Option<u64>,
107    ) -> FileInfo {
108        FileInfo::new(
109            format!("{}_{}.lance", self.name_id, key),
110            filetype,
111            (data_shape.0, data_shape.1),
112            nnz,
113            size_bytes,
114        )
115    }
116
117    /// Read metadata file from JSON
118    pub async fn read(path: PathBuf) -> Result<Self, StorageError> {
119        info!("Reading metadata from {:?}", path);
120        let s = fs::read_to_string(path).map_err(|e| StorageError::Io(e.to_string()))?;
121        let md: GeneMetadata = serde_json::from_str(&s).map_err(StorageError::Serde)?;
122        info!("Metadata read successfully");
123        Ok(md)
124    }
125
126    /// Standard pipeline object
127    pub async fn seed_metadata<B: StorageBackend>(
128        name_id: &str,
129        nitems: usize,
130        nfeatures: usize,
131        storage: &B,
132    ) -> Result<GeneMetadata, StorageError> {
133        info!(
134            "GeneMetadata::seed_metadata: seeding metadata for '{}' with nitems={}, nfeatures={}",
135            name_id, nitems, nfeatures
136        );
137
138        let mut md = Self::new(name_id)
139            .with_base(storage.base_path())
140            .with_dimensions(nitems, nfeatures);
141
142        debug!("GeneMetadata::seed_metadata: registering files");
143
144        let (key, filetype, rows, cols, nnz) = ("rawinput", "dense", nitems, nfeatures, None);
145        debug!(
146            "SpaceMetadata::seed_metadata_eigen: adding file {} ({}x{}, nnz={:?})",
147            filetype, rows, cols, nnz
148        );
149        md = md.add_file(
150            key,
151            FileInfo::new(
152                format!("{}_{}.lance", name_id, key),
153                filetype,
154                (rows, cols),
155                nnz,
156                None,
157            ),
158        );
159
160        debug!("GeneMetadata::seed_metadata: saving metadata to storage");
161        storage.save_metadata(&md).await?;
162
163        info!(
164            "GeneMetadata::seed_metadata: metadata seeded successfully for '{}'",
165            name_id
166        );
167        Ok(md)
168    }
169
170    pub fn with_base(mut self, base_path: PathBuf) -> Self {
171        self.base = base_path.to_string_lossy().to_string();
172        self
173    }
174
175    pub fn with_dimensions(mut self, rows: usize, cols: usize) -> Self {
176        debug!(
177            "GeneMetadata::with_dimensions: setting dimensions to {}x{}",
178            rows, cols
179        );
180        self.nrows = rows;
181        self.ncols = cols;
182        self
183    }
184
185    pub fn add_file(mut self, key: &str, info: FileInfo) -> Self {
186        debug!(
187            "GeneMetadata::add_file: adding file '{}' ({})",
188            key, info.filename
189        );
190        self.files.insert(key.to_string(), info);
191        self
192    }
193}