// genegraph_storage/metadata.rs

1#![allow(dead_code)]
2
3use log::{debug, info};
4use serde::{Deserialize, Serialize};
5use std::collections::HashMap;
6use std::fs;
7use std::path::PathBuf;
8
9use crate::StorageError;
10use crate::traits::backend::StorageBackend;
11use crate::traits::metadata::Metadata;
12
13/// Represent a single file spec in the persistence directory
14#[derive(Debug, Clone, Serialize, Deserialize)]
15pub struct FileInfo {
16    /// name of the file, can be equal to filetype if there is only one per type
17    pub filename: String,
18    /// see `Self::which_filetype(..)`: "rawinput" | "sub_centroids" | "lambdas" | "..."
19    pub filetype: String,
20    /// see `Self::which_format(..)`
21    pub storage_format: String,
22    pub rows: usize,
23    pub cols: usize,
24    pub nnz: Option<usize>,
25    pub size_bytes: Option<u64>,
26}
27
28impl FileInfo {
29    /// Create a file spec to add to the persistence directory
30    pub fn new(
31        filename: String,
32        filetype: &str,
33        data_shape: (usize, usize),
34        nnz: Option<usize>,
35        size_bytes: Option<u64>,
36    ) -> Self {
37        debug!(
38            "FileInfo::new: filename={}, filetype={}, shape={}x{}, nnz={:?}",
39            filename, filetype, data_shape.0, data_shape.1, nnz
40        );
41        Self {
42            filename,
43            filetype: filetype.into(),
44            storage_format: Self::which_format(filetype),
45            rows: data_shape.0,
46            cols: data_shape.1,
47            nnz,
48            size_bytes,
49        }
50    }
51
52    /// Assign the right format to the file type
53    pub fn which_format(filetype: &str) -> String {
54        match filetype {
55            "dense" => String::from("lance fixed-row"),
56            "sparse" => String::from("lance row-major"),
57            "vector" => String::from("lance row-major"),
58            _ => panic!("filetype not recognised {}", filetype),
59        }
60    }
61
62    /// Assign the right filetype to the keyname of the file
63    pub fn which_filetype(filetype: &str) -> String {
64        match filetype {
65            "rawinput" | "sub_centroids" | "dense" => String::from("dense"),
66            "adjacency" | "laplacian" | "signals" | "sparse" => String::from("sparse"),
67            "lambdas" | "item_norms" | "norms" | "vector" => String::from("vector"),
68            _ => panic!(
69                "Wrong filetype: use specific types or generic ('dense', 'sparse', 'vector')"
70            ),
71        }
72    }
73}
74
75/// Metadata for an ArrowSpace index persisted to Lance storage.
76///
77/// Tracks dataset dimensions, builder configuration, file locations, and pipeline context.
78#[derive(Debug, Clone, Serialize, Deserialize)]
79pub struct GeneMetadata {
80    pub name_id: String,
81    pub nrows: usize,
82    pub ncols: usize,
83    pub base: String,
84    pub files: HashMap<String, FileInfo>,
85    pub created_at: String,
86}
87
88impl GeneMetadata {
89    /// Read metadata file from JSON
90    pub async fn read(path: PathBuf) -> Result<Self, StorageError> {
91        info!("Reading metadata from {:?}", path);
92        let s = fs::read_to_string(path).map_err(|e| StorageError::Io(e.to_string()))?;
93        let md: GeneMetadata = serde_json::from_str(&s).map_err(StorageError::Serde)?;
94        info!("Metadata read successfully");
95        Ok(md)
96    }
97}
98
99impl Metadata for GeneMetadata {
100    /// Empty metadata object
101    /// do not use in test, use seed_metadata_eigen instead
102    fn new(name_id: &str) -> Self {
103        info!("GeneMetadata::new: creating metadata for '{}'", name_id);
104        Self {
105            name_id: name_id.to_string(),
106            nrows: 0,
107            ncols: 0,
108            base: String::from(""),
109            files: HashMap::new(),
110            created_at: chrono::Utc::now().to_rfc3339(),
111        }
112    }
113
114    fn new_fileinfo(
115        &self,
116        key: &str,
117        filetype: &str,
118        data_shape: (usize, usize),
119        nnz: Option<usize>,
120        size_bytes: Option<u64>,
121    ) -> FileInfo {
122        FileInfo::new(
123            format!("{}_{}.lance", self.name_id, key),
124            filetype,
125            (data_shape.0, data_shape.1),
126            nnz,
127            size_bytes,
128        )
129    }
130
131    /// Standard pipeline object
132    async fn seed_metadata<B: StorageBackend>(
133        name_id: &str,
134        nitems: usize,
135        nfeatures: usize,
136        storage: &B,
137    ) -> Result<GeneMetadata, StorageError> {
138        info!(
139            "GeneMetadata::seed_metadata: seeding metadata for '{}' with nitems={}, nfeatures={}",
140            name_id, nitems, nfeatures
141        );
142
143        let md = Self::new(name_id)
144            .with_base(storage.base_path())
145            .with_dimensions(nitems, nfeatures);
146
147        debug!("GeneMetadata::seed_metadata: saving metadata to storage");
148        storage.save_metadata(&md).await?;
149
150        info!(
151            "GeneMetadata::seed_metadata: metadata seeded successfully for '{}'",
152            name_id
153        );
154        Ok(md)
155    }
156
157    fn with_base(mut self, base_path: PathBuf) -> Self {
158        self.base = base_path.to_string_lossy().to_string();
159        self
160    }
161
162    fn with_dimensions(mut self, rows: usize, cols: usize) -> Self {
163        debug!(
164            "GeneMetadata::with_dimensions: setting dimensions to {}x{}",
165            rows, cols
166        );
167        self.nrows = rows;
168        self.ncols = cols;
169        self
170    }
171
172    fn add_file(mut self, key: &str, info: FileInfo) -> Self {
173        debug!(
174            "GeneMetadata::add_file: adding file '{}' ({})",
175            key, info.filename
176        );
177        self.files.insert(key.to_string(), info);
178        self
179    }
180}