vortex_file/
file.rs

// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
//! This module defines the [`VortexFile`] struct, which represents a Vortex file on disk or in memory.
//!
//! The `VortexFile` provides methods for accessing file metadata, creating segment sources for reading
//! data from the file, and initiating scans to read the file's contents into memory as Vortex arrays.
8
9use std::ops::Range;
10use std::sync::Arc;
11
12use itertools::Itertools;
13use vortex_array::ArrayRef;
14use vortex_array::stats::StatsSet;
15use vortex_dtype::{DType, Field, FieldMask, FieldPath, FieldPathSet};
16use vortex_error::VortexResult;
17use vortex_expr::pruning::checked_pruning_expr;
18use vortex_expr::{ExprRef, Scope};
19use vortex_layout::LayoutReader;
20use vortex_layout::segments::SegmentSource;
21use vortex_metrics::VortexMetrics;
22use vortex_scan::{ScanBuilder, SplitBy};
23use vortex_utils::aliases::hash_map::HashMap;
24
25use crate::footer::Footer;
26use crate::pruning::extract_relevant_file_stats_as_struct_row;
27
28/// Represents a Vortex file, providing access to its metadata and content.
29///
30/// A `VortexFile` is created by opening a Vortex file using [`VortexOpenOptions`](crate::VortexOpenOptions).
31/// It provides methods for accessing file metadata (such as row count, data type, and statistics)
32/// and for initiating scans to read the file's contents.
33#[derive(Clone)]
34pub struct VortexFile {
35    /// The footer of the Vortex file, containing metadata and layout information.
36    pub(crate) footer: Footer,
37    /// The segment source used to read segments from this file.
38    pub(crate) segment_source: Arc<dyn SegmentSource>,
39    /// Metrics tied to the file.
40    pub(crate) metrics: VortexMetrics,
41}
42
43impl VortexFile {
44    /// Returns a reference to the file's footer, which contains metadata and layout information.
45    pub fn footer(&self) -> &Footer {
46        &self.footer
47    }
48
49    /// Returns the number of rows in the file.
50    pub fn row_count(&self) -> u64 {
51        self.footer.row_count()
52    }
53
54    /// Returns the data type of the file's contents.
55    pub fn dtype(&self) -> &DType {
56        self.footer.dtype()
57    }
58
59    /// Returns the file's statistics, if available.
60    ///
61    /// Statistics can be used for query optimization and data exploration.
62    pub fn file_stats(&self) -> Option<&Arc<[StatsSet]>> {
63        self.footer.statistics()
64    }
65
66    /// Returns a reference to the file's metrics.
67    pub fn metrics(&self) -> &VortexMetrics {
68        &self.metrics
69    }
70
71    /// Create a new segment source for reading from the file.
72    ///
73    /// This may spawn a background I/O driver that will exit when the returned segment source
74    /// is dropped.
75    pub fn segment_source(&self) -> Arc<dyn SegmentSource> {
76        self.segment_source.clone()
77    }
78
79    /// Create a new layout reader for the file.
80    pub fn layout_reader(&self) -> VortexResult<Arc<dyn LayoutReader>> {
81        let segment_source = self.segment_source();
82        self.footer
83            .layout()
84            // TODO(ngates): we may want to allow the user pass in a name here?
85            .new_reader("".into(), segment_source)
86    }
87
88    /// Initiate a scan of the file, returning a builder for configuring the scan.
89    pub fn scan(&self) -> VortexResult<ScanBuilder<ArrayRef>> {
90        Ok(ScanBuilder::new(self.layout_reader()?).with_metrics(self.metrics.clone()))
91    }
92
93    /// Returns true if the expression will never match any rows in the file.
94    pub fn can_prune(&self, filter: &ExprRef) -> VortexResult<bool> {
95        let Some((stats, fields)) = self
96            .footer
97            .statistics()
98            .zip(self.footer.dtype().as_struct_fields_opt())
99        else {
100            return Ok(false);
101        };
102
103        let set = FieldPathSet::from_iter(fields.names().iter().zip(stats.iter()).flat_map(
104            |(name, stats)| {
105                stats.iter().map(|(stat, _)| {
106                    FieldPath::from_iter([
107                        Field::Name(name.clone()),
108                        Field::Name(stat.name().into()),
109                    ])
110                })
111            },
112        ));
113
114        let Some((predicate, required_stats)) = checked_pruning_expr(filter, &set) else {
115            return Ok(false);
116        };
117
118        let required_file_stats = HashMap::from_iter(
119            required_stats
120                .map()
121                .iter()
122                .map(|(path, stats)| (path.clone(), stats.clone())),
123        );
124
125        let Some(file_stats) =
126            extract_relevant_file_stats_as_struct_row(&required_file_stats, stats, fields)?
127        else {
128            return Ok(false);
129        };
130
131        let scope = Scope::new(file_stats);
132
133        Ok(predicate
134            .evaluate(&scope)?
135            .as_constant()
136            .is_some_and(|result| result.as_bool().value() == Some(true)))
137    }
138
139    pub fn splits(&self) -> VortexResult<Vec<Range<u64>>> {
140        Ok(SplitBy::Layout
141            .splits(self.layout_reader()?.as_ref(), &[FieldMask::All])?
142            .into_iter()
143            .tuple_windows()
144            .map(|(start, end)| start..end)
145            .collect())
146    }
147}