vortex_file/
file.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
//! This module defines the [`VortexFile`] struct, which represents a Vortex file on disk or in memory.
//!
//! The `VortexFile` provides methods for accessing file metadata, creating segment sources for reading
//! data from the file, and initiating scans to read the file's contents into memory as Vortex arrays.
8
9use std::ops::Range;
10use std::sync::Arc;
11
12use itertools::Itertools;
13use vortex_array::ArrayRef;
14use vortex_array::expr::Expression;
15use vortex_array::expr::pruning::checked_pruning_expr;
16use vortex_array::stats::StatsSet;
17use vortex_dtype::{DType, Field, FieldMask, FieldPath, FieldPathSet};
18use vortex_error::VortexResult;
19use vortex_layout::LayoutReader;
20use vortex_layout::segments::SegmentSource;
21use vortex_metrics::VortexMetrics;
22use vortex_scan::{ScanBuilder, SplitBy};
23use vortex_session::VortexSession;
24use vortex_utils::aliases::hash_map::HashMap;
25
26use crate::footer::Footer;
27use crate::pruning::extract_relevant_file_stats_as_struct_row;
28
29/// Represents a Vortex file, providing access to its metadata and content.
30///
31/// A `VortexFile` is created by opening a Vortex file using [`VortexOpenOptions`](crate::VortexOpenOptions).
32/// It provides methods for accessing file metadata (such as row count, data type, and statistics)
33/// and for initiating scans to read the file's contents.
34#[derive(Clone)]
35pub struct VortexFile {
36    /// The footer of the Vortex file, containing metadata and layout information.
37    pub(crate) footer: Footer,
38    /// The segment source used to read segments from this file.
39    pub(crate) segment_source: Arc<dyn SegmentSource>,
40    /// Metrics tied to the file.
41    pub(crate) metrics: VortexMetrics,
42    /// The Vortex session used to open this file
43    pub(crate) session: VortexSession,
44}
45
46impl VortexFile {
47    /// Returns a reference to the file's footer, which contains metadata and layout information.
48    pub fn footer(&self) -> &Footer {
49        &self.footer
50    }
51
52    /// Returns the number of rows in the file.
53    pub fn row_count(&self) -> u64 {
54        self.footer.row_count()
55    }
56
57    /// Returns the data type of the file's contents.
58    pub fn dtype(&self) -> &DType {
59        self.footer.dtype()
60    }
61
62    /// Returns the file's statistics, if available.
63    ///
64    /// Statistics can be used for query optimization and data exploration.
65    pub fn file_stats(&self) -> Option<&Arc<[StatsSet]>> {
66        self.footer.statistics()
67    }
68
69    /// Returns a reference to the file's metrics.
70    pub fn metrics(&self) -> &VortexMetrics {
71        &self.metrics
72    }
73
74    /// Create a new segment source for reading from the file.
75    ///
76    /// This may spawn a background I/O driver that will exit when the returned segment source
77    /// is dropped.
78    pub fn segment_source(&self) -> Arc<dyn SegmentSource> {
79        self.segment_source.clone()
80    }
81
82    /// Create a new layout reader for the file.
83    pub fn layout_reader(&self) -> VortexResult<Arc<dyn LayoutReader>> {
84        let segment_source = self.segment_source();
85        self.footer
86            .layout()
87            // TODO(ngates): we may want to allow the user pass in a name here?
88            .new_reader("".into(), segment_source)
89    }
90
91    /// Initiate a scan of the file, returning a builder for configuring the scan.
92    pub fn scan(&self) -> VortexResult<ScanBuilder<ArrayRef>> {
93        Ok(
94            ScanBuilder::new(self.session.clone(), self.layout_reader()?)
95                .with_metrics(self.metrics.clone()),
96        )
97    }
98
99    #[cfg(gpu_unstable)]
100    pub fn gpu_scan(
101        &self,
102        ctx: Arc<cudarc::driver::CudaContext>,
103    ) -> VortexResult<vortex_scan::gpu::GpuScanBuilder<vortex_gpu::GpuVector>> {
104        let segment_source = self.segment_source();
105        let gpu_reader = self
106            .footer
107            .layout()
108            .new_gpu_reader("".into(), segment_source, ctx)?;
109
110        Ok(vortex_scan::gpu::GpuScanBuilder::new(
111            self.session.clone(),
112            gpu_reader,
113        ))
114    }
115
116    /// Returns true if the expression will never match any rows in the file.
117    pub fn can_prune(&self, filter: &Expression) -> VortexResult<bool> {
118        let Some((stats, fields)) = self
119            .footer
120            .statistics()
121            .zip(self.footer.dtype().as_struct_fields_opt())
122        else {
123            return Ok(false);
124        };
125
126        let set = FieldPathSet::from_iter(fields.names().iter().zip(stats.iter()).flat_map(
127            |(name, stats)| {
128                stats.iter().map(|(stat, _)| {
129                    FieldPath::from_iter([
130                        Field::Name(name.clone()),
131                        Field::Name(stat.name().into()),
132                    ])
133                })
134            },
135        ));
136
137        let Some((predicate, required_stats)) = checked_pruning_expr(filter, &set) else {
138            return Ok(false);
139        };
140
141        let required_file_stats = HashMap::from_iter(
142            required_stats
143                .map()
144                .iter()
145                .map(|(path, stats)| (path.clone(), stats.clone())),
146        );
147
148        let Some(file_stats) =
149            extract_relevant_file_stats_as_struct_row(&required_file_stats, stats, fields)?
150        else {
151            return Ok(false);
152        };
153
154        Ok(predicate
155            .evaluate(&file_stats)?
156            .as_constant()
157            .is_some_and(|result| result.as_bool().value() == Some(true)))
158    }
159
160    pub fn splits(&self) -> VortexResult<Vec<Range<u64>>> {
161        let reader = self.layout_reader()?;
162        Ok(SplitBy::Layout
163            .splits(reader.as_ref(), &(0..reader.row_count()), &[FieldMask::All])?
164            .into_iter()
165            .tuple_windows()
166            .map(|(start, end)| start..end)
167            .collect())
168    }
169}