genegraph_storage/
metadata.rs

1#![allow(dead_code)]
2
3use log::{debug, info};
4use serde::{Deserialize, Serialize};
5use std::collections::HashMap;
6use std::fs;
7use std::path::PathBuf;
8
9use crate::StorageError;
10use crate::traits::backend::StorageBackend;
11use crate::traits::metadata::Metadata;
12
/// Represent a single file spec in the persistence directory.
///
/// Normally built through `FileInfo::new`, which derives `storage_format`
/// from `filetype` via `Self::which_format(..)`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FileInfo {
    /// name of the file, can be equal to filetype if there is only one per type
    pub filename: String,
    /// storage category; when built with `FileInfo::new` this must be one of the
    /// values accepted by `Self::which_format(..)`: "dense" | "sparse" | "vector".
    /// Key names such as "rawinput" | "sub_centroids" | "lambdas" are mapped to
    /// one of these categories by `Self::which_filetype(..)`.
    pub filetype: String,
    /// see `Self::which_format(..)`
    pub storage_format: String,
    /// first element of the `data_shape` given to `FileInfo::new`
    pub rows: usize,
    /// second element of the `data_shape` given to `FileInfo::new`
    pub cols: usize,
    /// presumably the number of non-zero entries for sparse data, `None` when
    /// not tracked -- TODO confirm against callers
    pub nnz: Option<usize>,
    /// presumably the size of the persisted file in bytes, `None` when unknown
    /// -- TODO confirm against callers
    pub size_bytes: Option<u64>,
}
27
28impl FileInfo {
29    /// Create a file spec to add to the persistence directory
30    pub fn new(
31        filename: String,
32        filetype: &str,
33        data_shape: (usize, usize),
34        nnz: Option<usize>,
35        size_bytes: Option<u64>,
36    ) -> Self {
37        debug!(
38            "FileInfo::new: filename={}, filetype={}, shape={}x{}, nnz={:?}",
39            filename, filetype, data_shape.0, data_shape.1, nnz
40        );
41        Self {
42            filename,
43            filetype: filetype.into(),
44            storage_format: Self::which_format(filetype),
45            rows: data_shape.0,
46            cols: data_shape.1,
47            nnz,
48            size_bytes,
49        }
50    }
51
52    /// Assign the right format to the file type
53    pub fn which_format(filetype: &str) -> String {
54        match filetype {
55            "dense" => String::from("lance fixed-row"),
56            "sparse" => String::from("lance row-major"),
57            "vector" => String::from("lance row-major"),
58            _ => panic!("filetype not recognised {}", filetype),
59        }
60    }
61
62    /// Assign the right filetype to the keyname of the file
63    pub fn which_filetype(filetype: &str) -> String {
64        match filetype {
65            "rawinput" | "sub_centroids" => String::from("dense"),
66            "adjacency" | "laplacian" | "signals" => String::from("sparse"),
67            "lambdas" | "item_norms" | "norms" => String::from("vector"),
68            _ => panic!("key not recognised {}", filetype),
69        }
70    }
71}
72
/// Metadata describing a dataset and the files persisted for it.
///
/// Tracks the dataset identifier, its dimensions, the storage base path, the
/// registered files and the creation timestamp. Serialized with serde;
/// `GeneMetadata::read` loads it back from a JSON file.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GeneMetadata {
    /// dataset identifier; also used as the prefix of generated file names
    /// (`<name_id>_<key>.lance`)
    pub name_id: String,
    /// number of rows, set through `with_dimensions` (0 for a fresh object)
    pub nrows: usize,
    /// number of columns, set through `with_dimensions` (0 for a fresh object)
    pub ncols: usize,
    /// storage base path as a (lossily converted) string, set through `with_base`
    pub base: String,
    /// registered files, indexed by the key given to `add_file`
    pub files: HashMap<String, FileInfo>,
    /// creation time; `new` fills it with the current UTC time in RFC 3339 form
    pub created_at: String,
}
85
86impl GeneMetadata {
87    /// Read metadata file from JSON
88    pub async fn read(path: PathBuf) -> Result<Self, StorageError> {
89        info!("Reading metadata from {:?}", path);
90        let s = fs::read_to_string(path).map_err(|e| StorageError::Io(e.to_string()))?;
91        let md: GeneMetadata = serde_json::from_str(&s).map_err(StorageError::Serde)?;
92        info!("Metadata read successfully");
93        Ok(md)
94    }
95}
96
97impl Metadata for GeneMetadata {
98    /// Empty metadata object
99    /// do not use in test, use seed_metadata_eigen instead
100    fn new(name_id: &str) -> Self {
101        info!("GeneMetadata::new: creating metadata for '{}'", name_id);
102        Self {
103            name_id: name_id.to_string(),
104            nrows: 0,
105            ncols: 0,
106            base: String::from(""),
107            files: HashMap::new(),
108            created_at: chrono::Utc::now().to_rfc3339(),
109        }
110    }
111
112    fn new_fileinfo(
113        &self,
114        key: &str,
115        filetype: &str,
116        data_shape: (usize, usize),
117        nnz: Option<usize>,
118        size_bytes: Option<u64>,
119    ) -> FileInfo {
120        FileInfo::new(
121            format!("{}_{}.lance", self.name_id, key),
122            filetype,
123            (data_shape.0, data_shape.1),
124            nnz,
125            size_bytes,
126        )
127    }
128
129    /// Standard pipeline object
130    async fn seed_metadata<B: StorageBackend>(
131        name_id: &str,
132        nitems: usize,
133        nfeatures: usize,
134        storage: &B,
135    ) -> Result<GeneMetadata, StorageError> {
136        info!(
137            "GeneMetadata::seed_metadata: seeding metadata for '{}' with nitems={}, nfeatures={}",
138            name_id, nitems, nfeatures
139        );
140
141        let mut md = Self::new(name_id)
142            .with_base(storage.base_path())
143            .with_dimensions(nitems, nfeatures);
144
145        debug!("GeneMetadata::seed_metadata: registering files");
146
147        let (key, filetype, rows, cols, nnz) = ("rawinput", "dense", nitems, nfeatures, None);
148        debug!(
149            "SpaceMetadata::seed_metadata_eigen: adding file {} ({}x{}, nnz={:?})",
150            filetype, rows, cols, nnz
151        );
152        md = md.add_file(
153            key,
154            FileInfo::new(
155                format!("{}_{}.lance", name_id, key),
156                filetype,
157                (rows, cols),
158                nnz,
159                None,
160            ),
161        );
162
163        debug!("GeneMetadata::seed_metadata: saving metadata to storage");
164        storage.save_metadata(&md).await?;
165
166        info!(
167            "GeneMetadata::seed_metadata: metadata seeded successfully for '{}'",
168            name_id
169        );
170        Ok(md)
171    }
172
173    fn with_base(mut self, base_path: PathBuf) -> Self {
174        self.base = base_path.to_string_lossy().to_string();
175        self
176    }
177
178    fn with_dimensions(mut self, rows: usize, cols: usize) -> Self {
179        debug!(
180            "GeneMetadata::with_dimensions: setting dimensions to {}x{}",
181            rows, cols
182        );
183        self.nrows = rows;
184        self.ncols = cols;
185        self
186    }
187
188    fn add_file(mut self, key: &str, info: FileInfo) -> Self {
189        debug!(
190            "GeneMetadata::add_file: adding file '{}' ({})",
191            key, info.filename
192        );
193        self.files.insert(key.to_string(), info);
194        self
195    }
196}