vortex_file/
open.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4use std::sync::Arc;
5
6use flatbuffers::root;
7use vortex_array::ArrayRegistry;
8use vortex_dtype::DType;
9use vortex_error::{VortexResult, vortex_bail, vortex_err};
10use vortex_flatbuffers::{FlatBuffer, ReadFlatBuffer, dtype as fbd};
11use vortex_layout::{LayoutRegistry, LayoutRegistryExt};
12use vortex_metrics::VortexMetrics;
13
14use crate::footer::{FileStatistics, Footer, Postscript, PostscriptSegment};
15use crate::{DEFAULT_REGISTRY, EOF_SIZE, MAGIC_BYTES, VERSION};
16
/// Describes a kind of Vortex file source.
///
/// Each implementor picks, through [`FileType::Options`], the source-specific options type
/// that [`VortexOpenOptions`] stores alongside the settings shared by every source.
pub trait FileType
where
    Self: Sized,
{
    /// The source-specific options held by [`VortexOpenOptions`].
    type Options;
}
20
21/// Open options for a Vortex file reader.
22pub struct VortexOpenOptions<F: FileType> {
23    /// File-specific options
24    pub(crate) options: F::Options,
25    /// The registry of array encodings.
26    pub(crate) registry: Arc<ArrayRegistry>,
27    /// The registry of layouts.
28    pub(crate) layout_registry: Arc<LayoutRegistry>,
29    /// An optional, externally provided, file size.
30    pub(crate) file_size: Option<u64>,
31    /// An optional, externally provided, DType.
32    pub(crate) dtype: Option<DType>,
33    /// An optional, externally provided, file layout.
34    // TODO(ngates): add an optional DType so we only read the layout segment.
35    pub(crate) footer: Option<Footer>,
36    /// A metrics registry for the file.
37    pub(crate) metrics: VortexMetrics,
38}
39
40impl<F: FileType> VortexOpenOptions<F> {
41    /// Create a new [`VortexOpenOptions`] with the expected options for the file source.
42    ///
43    /// This should not be used directly, instead public API clients are expected to
44    /// access either `VortexOpenOptions::file()` or `VortexOpenOptions::memory()`
45    pub(crate) fn new(options: F::Options) -> Self {
46        Self {
47            options,
48            registry: DEFAULT_REGISTRY.clone(),
49            layout_registry: Arc::new(LayoutRegistry::default()),
50            file_size: None,
51            dtype: None,
52            footer: None,
53            metrics: VortexMetrics::default(),
54        }
55    }
56
57    /// Configure a Vortex array registry.
58    pub fn with_array_registry(mut self, registry: Arc<ArrayRegistry>) -> Self {
59        self.registry = registry;
60        self
61    }
62
63    /// Configure a Vortex array registry.
64    pub fn with_layout_registry(mut self, registry: Arc<LayoutRegistry>) -> Self {
65        self.layout_registry = registry;
66        self
67    }
68
69    /// Configure a known file size.
70    ///
71    /// This helps to prevent an I/O request to discover the size of the file.
72    /// Of course, all bets are off if you pass an incorrect value.
73    pub fn with_file_size(mut self, file_size: u64) -> Self {
74        self.file_size = Some(file_size);
75        self
76    }
77
78    /// Configure a known DType.
79    ///
80    /// If this is provided, then the Vortex file may be opened with fewer I/O requests.
81    ///
82    /// For Vortex files that do not contain a `DType`, this is required.
83    pub fn with_dtype(mut self, dtype: DType) -> Self {
84        self.dtype = Some(dtype);
85        self
86    }
87
88    /// Configure a known file layout.
89    ///
90    /// If this is provided, then the Vortex file can be opened without performing any I/O.
91    /// Once open, the [`Footer`] can be accessed via [`crate::VortexFile::footer`].
92    pub fn with_footer(mut self, footer: Footer) -> Self {
93        self.dtype = Some(footer.layout().dtype().clone());
94        self.footer = Some(footer);
95        self
96    }
97
98    /// Configure a custom [`VortexMetrics`].
99    pub fn with_metrics(mut self, metrics: VortexMetrics) -> Self {
100        self.metrics = metrics;
101        self
102    }
103}
104
105impl<F: FileType> VortexOpenOptions<F> {
106    /// Parse the postscript from the initial read.
107    pub(crate) fn parse_postscript(&self, initial_read: &[u8]) -> VortexResult<Postscript> {
108        if initial_read.len() < EOF_SIZE {
109            vortex_bail!(
110                "Initial read must be at least EOF_SIZE ({}) bytes",
111                EOF_SIZE
112            );
113        }
114        let eof_loc = initial_read.len() - EOF_SIZE;
115        let magic_bytes_loc = eof_loc + (EOF_SIZE - MAGIC_BYTES.len());
116
117        let magic_number = &initial_read[magic_bytes_loc..];
118        if magic_number != MAGIC_BYTES {
119            vortex_bail!("Malformed file, invalid magic bytes, got {magic_number:?}")
120        }
121
122        let version = u16::from_le_bytes(
123            initial_read[eof_loc..eof_loc + 2]
124                .try_into()
125                .map_err(|e| vortex_err!("Version was not a u16 {e}"))?,
126        );
127        if version != VERSION {
128            vortex_bail!("Malformed file, unsupported version {version}")
129        }
130
131        let ps_size = u16::from_le_bytes(
132            initial_read[eof_loc + 2..eof_loc + 4]
133                .try_into()
134                .map_err(|e| vortex_err!("Postscript size was not a u16 {e}"))?,
135        ) as usize;
136
137        if initial_read.len() < ps_size + EOF_SIZE {
138            vortex_bail!(
139                "Initial read must be at least {} bytes to include the Postscript",
140                ps_size + EOF_SIZE
141            );
142        }
143
144        Postscript::read_flatbuffer_bytes(&initial_read[eof_loc - ps_size..eof_loc])
145    }
146
147    /// Parse the DType from the initial read.
148    pub(crate) fn parse_dtype(
149        &self,
150        initial_offset: u64,
151        initial_read: &[u8],
152        segment: &PostscriptSegment,
153    ) -> VortexResult<DType> {
154        let offset = usize::try_from(segment.offset - initial_offset)?;
155        let sliced_buffer =
156            FlatBuffer::copy_from(&initial_read[offset..offset + (segment.length as usize)]);
157        let fbd_dtype = root::<fbd::DType>(&sliced_buffer)?;
158
159        DType::try_from_view(fbd_dtype, sliced_buffer.clone())
160    }
161
162    /// Parse the [`FileStatistics`] from the initial read buffer.
163    pub(crate) fn parse_file_statistics(
164        &self,
165        initial_offset: u64,
166        initial_read: &[u8],
167        segment: &PostscriptSegment,
168    ) -> VortexResult<FileStatistics> {
169        let offset = usize::try_from(segment.offset - initial_offset)?;
170        let sliced_buffer =
171            FlatBuffer::copy_from(&initial_read[offset..offset + (segment.length as usize)]);
172        FileStatistics::read_flatbuffer_bytes(&sliced_buffer)
173    }
174
175    /// Parse the rest of the footer from the initial read.
176    pub(crate) fn parse_footer(
177        &self,
178        initial_offset: u64,
179        initial_read: &[u8],
180        footer_segment: &PostscriptSegment,
181        layout_segment: &PostscriptSegment,
182        dtype: DType,
183        file_stats: Option<FileStatistics>,
184    ) -> VortexResult<Footer> {
185        let footer_offset = usize::try_from(footer_segment.offset - initial_offset)?;
186        let footer_bytes = FlatBuffer::copy_from(
187            &initial_read[footer_offset..footer_offset + (footer_segment.length as usize)],
188        );
189
190        let layout_offset = usize::try_from(layout_segment.offset - initial_offset)?;
191        let layout_bytes = FlatBuffer::copy_from(
192            &initial_read[layout_offset..layout_offset + (layout_segment.length as usize)],
193        );
194
195        Footer::from_flatbuffer(
196            footer_bytes,
197            layout_bytes,
198            dtype,
199            file_stats,
200            &self.registry,
201            &self.layout_registry,
202        )
203    }
204}