vortex_file/
file.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4//! This module defines the [`VortexFile`] struct, which represents a Vortex file on disk or in memory.
5//!
6//! The `VortexFile` provides methods for accessing file metadata, creating segment sources for reading
7//! data from the file, and initiating scans to read the file's contents into memory as Vortex arrays.
8
9use std::sync::Arc;
10
11use vortex_array::ArrayRef;
12use vortex_array::stats::StatsSet;
13use vortex_dtype::{DType, Field, FieldPath, FieldPathSet};
14use vortex_error::VortexResult;
15use vortex_expr::pruning::checked_pruning_expr;
16use vortex_expr::{ExprRef, Scope};
17use vortex_layout::LayoutReader;
18use vortex_layout::segments::SegmentSource;
19use vortex_metrics::VortexMetrics;
20use vortex_scan::ScanBuilder;
21use vortex_utils::aliases::hash_map::HashMap;
22
23use crate::footer::Footer;
24use crate::pruning::extract_relevant_file_stats_as_struct_row;
25
26/// Represents a Vortex file, providing access to its metadata and content.
27///
28/// A `VortexFile` is created by opening a Vortex file using [`VortexOpenOptions`](crate::VortexOpenOptions).
29/// It provides methods for accessing file metadata (such as row count, data type, and statistics)
30/// and for initiating scans to read the file's contents.
31#[derive(Clone)]
32pub struct VortexFile {
33    /// The footer of the Vortex file, containing metadata and layout information.
34    pub(crate) footer: Footer,
35    /// The segment source used to read segments from this file.
36    pub(crate) segment_source: Arc<dyn SegmentSource>,
37    /// Metrics tied to the file.
38    pub(crate) metrics: VortexMetrics,
39}
40
41impl VortexFile {
42    /// Returns a reference to the file's footer, which contains metadata and layout information.
43    pub fn footer(&self) -> &Footer {
44        &self.footer
45    }
46
47    /// Returns the number of rows in the file.
48    pub fn row_count(&self) -> u64 {
49        self.footer.row_count()
50    }
51
52    /// Returns the data type of the file's contents.
53    pub fn dtype(&self) -> &DType {
54        self.footer.dtype()
55    }
56
57    /// Returns the file's statistics, if available.
58    ///
59    /// Statistics can be used for query optimization and data exploration.
60    pub fn file_stats(&self) -> Option<&Arc<[StatsSet]>> {
61        self.footer.statistics()
62    }
63
64    /// Returns a reference to the file's metrics.
65    pub fn metrics(&self) -> &VortexMetrics {
66        &self.metrics
67    }
68
69    /// Create a new segment source for reading from the file.
70    ///
71    /// This may spawn a background I/O driver that will exit when the returned segment source
72    /// is dropped.
73    pub fn segment_source(&self) -> Arc<dyn SegmentSource> {
74        self.segment_source.clone()
75    }
76
77    /// Create a new layout reader for the file.
78    pub fn layout_reader(&self) -> VortexResult<Arc<dyn LayoutReader>> {
79        let segment_source = self.segment_source();
80        self.footer
81            .layout()
82            // TODO(ngates): we may want to allow the user pass in a name here?
83            .new_reader("".into(), segment_source)
84    }
85
86    /// Initiate a scan of the file, returning a builder for configuring the scan.
87    pub fn scan(&self) -> VortexResult<ScanBuilder<ArrayRef>> {
88        Ok(ScanBuilder::new(self.layout_reader()?).with_metrics(self.metrics.clone()))
89    }
90
91    /// Returns true if the expression will never match any rows in the file.
92    pub fn can_prune(&self, filter: &ExprRef) -> VortexResult<bool> {
93        let Some((stats, fields)) = self
94            .footer
95            .statistics()
96            .zip(self.footer.dtype().as_struct_opt())
97        else {
98            return Ok(false);
99        };
100
101        let set = FieldPathSet::from_iter(fields.names().iter().zip(stats.iter()).flat_map(
102            |(name, stats)| {
103                stats.iter().map(|(stat, _)| {
104                    FieldPath::from_iter([
105                        Field::Name(name.clone()),
106                        Field::Name(stat.name().into()),
107                    ])
108                })
109            },
110        ));
111
112        let Some((predicate, required_stats)) = checked_pruning_expr(filter, &set) else {
113            return Ok(false);
114        };
115
116        let required_file_stats = HashMap::from_iter(
117            required_stats
118                .map()
119                .iter()
120                .map(|(path, stats)| (path.clone(), stats.clone())),
121        );
122
123        let Some(file_stats) =
124            extract_relevant_file_stats_as_struct_row(&required_file_stats, stats, fields)?
125        else {
126            return Ok(false);
127        };
128
129        let scope = Scope::new(file_stats);
130
131        Ok(predicate
132            .evaluate(&scope)?
133            .as_constant()
134            .is_some_and(|result| result.as_bool().value() == Some(true)))
135    }
136}