vortex_file/
open.rs

1use std::sync::Arc;
2
3use flatbuffers::root;
4use vortex_array::ArrayRegistry;
5use vortex_dtype::DType;
6use vortex_error::{VortexResult, vortex_bail, vortex_err};
7use vortex_flatbuffers::{FlatBuffer, ReadFlatBuffer, dtype as fbd};
8use vortex_layout::{LayoutRegistry, LayoutRegistryExt};
9use vortex_metrics::VortexMetrics;
10
11use crate::footer::{FileStatistics, Footer, Postscript, PostscriptSegment};
12use crate::{DEFAULT_REGISTRY, EOF_SIZE, MAGIC_BYTES, VERSION};
13
/// A kind of Vortex file that can be opened through [`VortexOpenOptions`].
///
/// Implementors choose the type of the file-specific options that
/// [`VortexOpenOptions`] carries alongside its common settings.
pub trait FileType: Sized {
    /// The file-specific options stored in [`VortexOpenOptions`] for this file type.
    type Options;
}
17
/// Open options for a Vortex file reader.
///
/// Generic over the [`FileType`] `F`, which determines the type of the file-specific
/// `options` field. The remaining fields are common to every file type; each `Option`
/// field holds externally supplied information about the file, if any was provided.
pub struct VortexOpenOptions<F: FileType> {
    /// File-specific options
    pub(crate) options: F::Options,
    /// The registry of array encodings.
    pub(crate) registry: Arc<ArrayRegistry>,
    /// The registry of layouts.
    pub(crate) layout_registry: Arc<LayoutRegistry>,
    /// An optional, externally provided, file size.
    pub(crate) file_size: Option<u64>,
    /// An optional, externally provided, DType.
    pub(crate) dtype: Option<DType>,
    /// An optional, externally provided, file layout.
    // TODO(ngates): add an optional DType so we only read the layout segment.
    // NOTE(review): the `dtype` field above looks like it already provides this optional
    // DType — confirm whether the TODO is stale and can be removed.
    pub(crate) footer: Option<Footer>,
    /// A metrics registry for the file.
    pub(crate) metrics: VortexMetrics,
}
36
37impl<F: FileType> VortexOpenOptions<F> {
38    pub(crate) fn new(options: F::Options) -> Self {
39        Self {
40            options,
41            registry: DEFAULT_REGISTRY.clone(),
42            layout_registry: Arc::new(LayoutRegistry::default()),
43            file_size: None,
44            dtype: None,
45            footer: None,
46            metrics: VortexMetrics::default(),
47        }
48    }
49
50    /// Configure a Vortex array registry.
51    pub fn with_array_registry(mut self, registry: Arc<ArrayRegistry>) -> Self {
52        self.registry = registry;
53        self
54    }
55
56    /// Configure a Vortex array registry.
57    pub fn with_layout_registry(mut self, registry: Arc<LayoutRegistry>) -> Self {
58        self.layout_registry = registry;
59        self
60    }
61
62    /// Configure a known file size.
63    ///
64    /// This helps to prevent an I/O request to discover the size of the file.
65    /// Of course, all bets are off if you pass an incorrect value.
66    pub fn with_file_size(mut self, file_size: u64) -> Self {
67        self.file_size = Some(file_size);
68        self
69    }
70
71    /// Configure a known DType.
72    ///
73    /// If this is provided, then the Vortex file may be opened with fewer I/O requests.
74    ///
75    /// For Vortex files that do not contain a `DType`, this is required.
76    pub fn with_dtype(mut self, dtype: DType) -> Self {
77        self.dtype = Some(dtype);
78        self
79    }
80
81    /// Configure a known file layout.
82    ///
83    /// If this is provided, then the Vortex file can be opened without performing any I/O.
84    /// Once open, the [`Footer`] can be accessed via [`crate::VortexFile::footer`].
85    pub fn with_footer(mut self, footer: Footer) -> Self {
86        self.dtype = Some(footer.layout().dtype().clone());
87        self.footer = Some(footer);
88        self
89    }
90
91    /// Configure a custom [`VortexMetrics`].
92    pub fn with_metrics(mut self, metrics: VortexMetrics) -> Self {
93        self.metrics = metrics;
94        self
95    }
96}
97
98impl<F: FileType> VortexOpenOptions<F> {
99    /// Parse the postscript from the initial read.
100    pub(crate) fn parse_postscript(&self, initial_read: &[u8]) -> VortexResult<Postscript> {
101        if initial_read.len() < EOF_SIZE {
102            vortex_bail!(
103                "Initial read must be at least EOF_SIZE ({}) bytes",
104                EOF_SIZE
105            );
106        }
107        let eof_loc = initial_read.len() - EOF_SIZE;
108        let magic_bytes_loc = eof_loc + (EOF_SIZE - MAGIC_BYTES.len());
109
110        let magic_number = &initial_read[magic_bytes_loc..];
111        if magic_number != MAGIC_BYTES {
112            vortex_bail!("Malformed file, invalid magic bytes, got {magic_number:?}")
113        }
114
115        let version = u16::from_le_bytes(
116            initial_read[eof_loc..eof_loc + 2]
117                .try_into()
118                .map_err(|e| vortex_err!("Version was not a u16 {e}"))?,
119        );
120        if version != VERSION {
121            vortex_bail!("Malformed file, unsupported version {version}")
122        }
123
124        let ps_size = u16::from_le_bytes(
125            initial_read[eof_loc + 2..eof_loc + 4]
126                .try_into()
127                .map_err(|e| vortex_err!("Postscript size was not a u16 {e}"))?,
128        ) as usize;
129
130        if initial_read.len() < ps_size + EOF_SIZE {
131            vortex_bail!(
132                "Initial read must be at least {} bytes to include the Postscript",
133                ps_size + EOF_SIZE
134            );
135        }
136
137        Postscript::read_flatbuffer_bytes(&initial_read[eof_loc - ps_size..eof_loc])
138    }
139
140    /// Parse the DType from the initial read.
141    pub(crate) fn parse_dtype(
142        &self,
143        initial_offset: u64,
144        initial_read: &[u8],
145        segment: &PostscriptSegment,
146    ) -> VortexResult<DType> {
147        let offset = usize::try_from(segment.offset - initial_offset)?;
148        let sliced_buffer =
149            FlatBuffer::copy_from(&initial_read[offset..offset + (segment.length as usize)]);
150        let fbd_dtype = root::<fbd::DType>(&sliced_buffer)?;
151
152        DType::try_from_view(fbd_dtype, sliced_buffer.clone())
153    }
154
155    /// Parse the [`FileStatistics`] from the initial read buffer.
156    pub(crate) fn parse_file_statistics(
157        &self,
158        initial_offset: u64,
159        initial_read: &[u8],
160        segment: &PostscriptSegment,
161    ) -> VortexResult<FileStatistics> {
162        let offset = usize::try_from(segment.offset - initial_offset)?;
163        let sliced_buffer =
164            FlatBuffer::copy_from(&initial_read[offset..offset + (segment.length as usize)]);
165        FileStatistics::read_flatbuffer_bytes(&sliced_buffer)
166    }
167
168    /// Parse the rest of the footer from the initial read.
169    pub(crate) fn parse_footer(
170        &self,
171        initial_offset: u64,
172        initial_read: &[u8],
173        footer_segment: &PostscriptSegment,
174        layout_segment: &PostscriptSegment,
175        dtype: DType,
176        file_stats: Option<FileStatistics>,
177    ) -> VortexResult<Footer> {
178        let footer_offset = usize::try_from(footer_segment.offset - initial_offset)?;
179        let footer_bytes = FlatBuffer::copy_from(
180            &initial_read[footer_offset..footer_offset + (footer_segment.length as usize)],
181        );
182
183        let layout_offset = usize::try_from(layout_segment.offset - initial_offset)?;
184        let layout_bytes = FlatBuffer::copy_from(
185            &initial_read[layout_offset..layout_offset + (layout_segment.length as usize)],
186        );
187
188        Footer::from_flatbuffer(
189            footer_bytes,
190            layout_bytes,
191            dtype,
192            file_stats,
193            &self.registry,
194            &self.layout_registry,
195        )
196    }
197}