Skip to main content

lance_table/format/
index.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright The Lance Authors
3
4//! Metadata for index
5
6use std::collections::HashMap;
7use std::sync::Arc;
8
9use chrono::{DateTime, Utc};
10use deepsize::DeepSizeOf;
11use futures::StreamExt;
12use lance_io::object_store::ObjectStore;
13use object_store::path::Path;
14use roaring::RoaringBitmap;
15use uuid::Uuid;
16
17use super::pb;
18use lance_core::{Error, Result};
19
/// Metadata about a single file within an index segment.
///
/// Pairs a file's path (relative to the index directory) with its size in
/// bytes so both can be stored alongside the index metadata.
#[derive(Debug, Clone, PartialEq, DeepSizeOf)]
pub struct IndexFile {
    /// Path relative to the index directory (e.g., "index.idx", "auxiliary.idx")
    pub path: String,
    /// Size of the file in bytes
    pub size_bytes: u64,
}
28
/// Index metadata
///
/// In-memory description of a single index. It is converted to and from
/// `pb::IndexMetadata` for storage.
#[derive(Debug, Clone, PartialEq)]
pub struct IndexMetadata {
    /// Unique ID across all dataset versions.
    pub uuid: Uuid,

    /// Fields to build the index.
    pub fields: Vec<i32>,

    /// Human readable index name
    pub name: String,

    /// The version of the dataset this index was last updated on
    ///
    /// This is set when the index is created (based on the version used to train the index)
    /// This is updated when the index is updated or remapped
    pub dataset_version: u64,

    /// The fragment ids this index covers.
    ///
    /// This may contain fragment ids that no longer exist in the dataset.
    ///
    /// If this is None, then this is unknown. An empty serialized bitmap in the
    /// protobuf message is also read back as None.
    pub fragment_bitmap: Option<RoaringBitmap>,

    /// Metadata specific to the index type
    ///
    /// This is an Option because older versions of Lance may not have this defined.  However, it should always
    /// be present in newer versions.
    pub index_details: Option<Arc<prost_types::Any>>,

    /// The index version.
    ///
    /// Read back as 0 when the protobuf message does not carry a version.
    pub index_version: i32,

    /// Timestamp when the index was created
    ///
    /// This field is optional for backward compatibility. For existing indices created before
    /// this field was added, this will be None.
    ///
    /// Stored in the protobuf message with millisecond precision.
    pub created_at: Option<DateTime<Utc>>,

    /// The base path index of the index files. Used when the index is imported or referred from another dataset.
    /// Lance uses it as key of the base_paths field in Manifest to determine the actual base path of the index files.
    pub base_id: Option<u32>,

    /// List of files and their sizes for this index segment.
    /// This enables skipping HEAD calls when opening indices and provides
    /// visibility into index storage size via describe_indices().
    /// This is None if the file sizes are unknown. This happens for indices created
    /// before this field was added. An empty file list in the protobuf message is
    /// also read back as None.
    pub files: Option<Vec<IndexFile>>,
}
80
81impl IndexMetadata {
82    pub fn effective_fragment_bitmap(
83        &self,
84        existing_fragments: &RoaringBitmap,
85    ) -> Option<RoaringBitmap> {
86        let fragment_bitmap = self.fragment_bitmap.as_ref()?;
87        Some(fragment_bitmap & existing_fragments)
88    }
89
90    /// Returns a map of relative file paths to their sizes.
91    /// Returns an empty map if file information is not available.
92    pub fn file_size_map(&self) -> HashMap<String, u64> {
93        self.files
94            .as_ref()
95            .map(|files| {
96                files
97                    .iter()
98                    .map(|f| (f.path.clone(), f.size_bytes))
99                    .collect()
100            })
101            .unwrap_or_default()
102    }
103
104    /// Returns the total size of all files in this index segment in bytes.
105    /// Returns None if file information is not available.
106    pub fn total_size_bytes(&self) -> Option<u64> {
107        self.files
108            .as_ref()
109            .map(|files| files.iter().map(|f| f.size_bytes).sum())
110    }
111
112    /// Returns the set of fragments which are part of the fragment bitmap
113    /// but no longer in the dataset.
114    pub fn deleted_fragment_bitmap(
115        &self,
116        existing_fragments: &RoaringBitmap,
117    ) -> Option<RoaringBitmap> {
118        let fragment_bitmap = self.fragment_bitmap.as_ref()?;
119        Some(fragment_bitmap - existing_fragments)
120    }
121}
122
123impl DeepSizeOf for IndexMetadata {
124    fn deep_size_of_children(&self, context: &mut deepsize::Context) -> usize {
125        self.uuid.as_bytes().deep_size_of_children(context)
126            + self.fields.deep_size_of_children(context)
127            + self.name.deep_size_of_children(context)
128            + self.dataset_version.deep_size_of_children(context)
129            + self
130                .fragment_bitmap
131                .as_ref()
132                .map(|fragment_bitmap| fragment_bitmap.serialized_size())
133                .unwrap_or(0)
134            + self.files.deep_size_of_children(context)
135    }
136}
137
138impl TryFrom<pb::IndexMetadata> for IndexMetadata {
139    type Error = Error;
140
141    fn try_from(proto: pb::IndexMetadata) -> Result<Self> {
142        let fragment_bitmap = if proto.fragment_bitmap.is_empty() {
143            None
144        } else {
145            Some(RoaringBitmap::deserialize_from(
146                &mut proto.fragment_bitmap.as_slice(),
147            )?)
148        };
149
150        let files = if proto.files.is_empty() {
151            None
152        } else {
153            Some(
154                proto
155                    .files
156                    .into_iter()
157                    .map(|f| IndexFile {
158                        path: f.path,
159                        size_bytes: f.size_bytes,
160                    })
161                    .collect(),
162            )
163        };
164
165        Ok(Self {
166            uuid: proto.uuid.as_ref().map(Uuid::try_from).ok_or_else(|| {
167                Error::invalid_input("uuid field does not exist in Index metadata".to_string())
168            })??,
169            name: proto.name,
170            fields: proto.fields,
171            dataset_version: proto.dataset_version,
172            fragment_bitmap,
173            index_details: proto.index_details.map(Arc::new),
174            index_version: proto.index_version.unwrap_or_default(),
175            created_at: proto.created_at.map(|ts| {
176                DateTime::from_timestamp_millis(ts as i64)
177                    .expect("Invalid timestamp in index metadata")
178            }),
179            base_id: proto.base_id,
180            files,
181        })
182    }
183}
184
185impl From<&IndexMetadata> for pb::IndexMetadata {
186    fn from(idx: &IndexMetadata) -> Self {
187        let mut fragment_bitmap = Vec::new();
188        if let Some(bitmap) = &idx.fragment_bitmap
189            && let Err(e) = bitmap.serialize_into(&mut fragment_bitmap)
190        {
191            // In theory, this should never error. But if we do, just
192            // recover gracefully.
193            log::error!("Failed to serialize fragment bitmap: {}", e);
194            fragment_bitmap.clear();
195        }
196
197        let files = idx
198            .files
199            .as_ref()
200            .map(|files| {
201                files
202                    .iter()
203                    .map(|f| pb::IndexFile {
204                        path: f.path.clone(),
205                        size_bytes: f.size_bytes,
206                    })
207                    .collect()
208            })
209            .unwrap_or_default();
210
211        Self {
212            uuid: Some((&idx.uuid).into()),
213            name: idx.name.clone(),
214            fields: idx.fields.clone(),
215            dataset_version: idx.dataset_version,
216            fragment_bitmap,
217            index_details: idx
218                .index_details
219                .as_ref()
220                .map(|details| details.as_ref().clone()),
221            index_version: Some(idx.index_version),
222            created_at: idx.created_at.map(|dt| dt.timestamp_millis() as u64),
223            base_id: idx.base_id,
224            files,
225        }
226    }
227}
228
229/// List all files in an index directory with their sizes.
230///
231/// Returns a list of `IndexFile` structs containing relative paths and sizes.
232/// This is used to capture file metadata after index creation/modification.
233pub async fn list_index_files_with_sizes(
234    object_store: &ObjectStore,
235    index_dir: &Path,
236) -> Result<Vec<IndexFile>> {
237    let mut files = Vec::new();
238    let mut stream = object_store.read_dir_all(index_dir, None);
239    while let Some(meta) = stream.next().await {
240        let meta = meta?;
241        // Get relative path by stripping the index_dir prefix
242        let relative_path = meta
243            .location
244            .as_ref()
245            .strip_prefix(index_dir.as_ref())
246            .map(|s| s.trim_start_matches('/').to_string())
247            .unwrap_or_else(|| meta.location.filename().unwrap_or("").to_string());
248        files.push(IndexFile {
249            path: relative_path,
250            size_bytes: meta.size,
251        });
252    }
253    Ok(files)
254}