datafusion_datasource_orc/
options.rs

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.
17
//! ORC-specific configuration types.
//!
//! This module mirrors the configuration style used by DataFusion's Parquet
//! datasource. It currently focuses on read-path options; write options are
//! reserved for future work.
23
24use std::collections::HashMap;
25
26use datafusion_common::{DataFusionError, Result};
27
/// Options that control how ORC files are read.
///
/// Values are usually built from [`Default`] and refined with the `with_*`
/// builder methods. The type implements `PartialEq`/`Eq` so that complete
/// option sets can be compared directly (for example with `assert_eq!`)
/// instead of field by field.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct OrcReadOptions {
    /// Optional batch size override used when a scan does not specify one.
    /// `None` means no override has been configured.
    pub batch_size: Option<usize>,
    /// Enable converting predicates into ORC stripe-level filters.
    pub pushdown_predicate: bool,
    /// Optional metadata size hint for ORC footer reads.
    /// `None` means no hint has been configured.
    pub metadata_size_hint: Option<usize>,
}
38
39impl Default for OrcReadOptions {
40    fn default() -> Self {
41        Self {
42            batch_size: None,
43            pushdown_predicate: true,
44            metadata_size_hint: None,
45        }
46    }
47}
48
49impl OrcReadOptions {
50    /// Set a default batch size for ORC scans.
51    pub fn with_batch_size(mut self, batch_size: usize) -> Self {
52        self.batch_size = Some(batch_size);
53        self
54    }
55
56    /// Enable or disable predicate pushdown into the ORC reader.
57    pub fn with_pushdown_predicate(mut self, pushdown_predicate: bool) -> Self {
58        self.pushdown_predicate = pushdown_predicate;
59        self
60    }
61
62    /// Provide a hint for how many bytes to read when fetching ORC metadata.
63    pub fn with_metadata_size_hint(mut self, metadata_size_hint: usize) -> Self {
64        self.metadata_size_hint = Some(metadata_size_hint);
65        self
66    }
67}
68
69/// Top-level ORC format options.
70#[derive(Clone, Debug, Default)]
71pub struct OrcFormatOptions {
72    /// Read-path configuration.
73    pub read: OrcReadOptions,
74    // TODO: Add write options when ORC writer support lands.
75}
76
77impl OrcFormatOptions {
78    /// Apply a set of key-value options from a DataFusion format options map.
79    pub fn apply_format_options(&mut self, format_options: &HashMap<String, String>) -> Result<()> {
80        for (key, value) in format_options {
81            match key.as_str() {
82                "orc.batch_size" => {
83                    self.read.batch_size = Some(parse_usize_option(key, value)?);
84                }
85                "orc.pushdown_predicate" => {
86                    self.read.pushdown_predicate = parse_bool_option(key, value)?;
87                }
88                "orc.metadata_size_hint" => {
89                    self.read.metadata_size_hint = Some(parse_usize_option(key, value)?);
90                }
91                _ => {
92                    // TODO: Validate unknown keys once ORC options are formalized in DataFusion.
93                }
94            }
95        }
96        Ok(())
97    }
98}
99
100fn parse_bool_option(key: &str, value: &str) -> Result<bool> {
101    value.parse::<bool>().map_err(|_| {
102        DataFusionError::Configuration(format!(
103            "Invalid value for {key}: {value}. Expected true or false."
104        ))
105    })
106}
107
108fn parse_usize_option(key: &str, value: &str) -> Result<usize> {
109    value.parse::<usize>().map_err(|_| {
110        DataFusionError::Configuration(format!(
111            "Invalid value for {key}: {value}. Expected a positive integer."
112        ))
113    })
114}
115
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds an owned option map from borrowed key/value pairs.
    fn option_map(pairs: &[(&str, &str)]) -> HashMap<String, String> {
        pairs
            .iter()
            .map(|&(key, value)| (key.to_owned(), value.to_owned()))
            .collect()
    }

    /// Applies `pairs` to freshly defaulted format options and returns both
    /// the resulting options and the outcome of the call.
    fn apply(pairs: &[(&str, &str)]) -> (OrcFormatOptions, Result<()>) {
        let mut options = OrcFormatOptions::default();
        let outcome = options.apply_format_options(&option_map(pairs));
        (options, outcome)
    }

    #[test]
    fn test_orc_read_options_default() {
        let OrcReadOptions {
            batch_size,
            pushdown_predicate,
            metadata_size_hint,
        } = OrcReadOptions::default();

        assert_eq!(batch_size, None);
        assert!(pushdown_predicate);
        assert_eq!(metadata_size_hint, None);
    }

    #[test]
    fn test_orc_read_options_builder() {
        let OrcReadOptions {
            batch_size,
            pushdown_predicate,
            metadata_size_hint,
        } = OrcReadOptions::default()
            .with_batch_size(4096)
            .with_pushdown_predicate(false)
            .with_metadata_size_hint(1024);

        assert_eq!(batch_size, Some(4096));
        assert!(!pushdown_predicate);
        assert_eq!(metadata_size_hint, Some(1024));
    }

    #[test]
    fn test_orc_format_options_default() {
        let read = OrcFormatOptions::default().read;
        assert_eq!(read.batch_size, None);
        assert!(read.pushdown_predicate);
    }

    #[test]
    fn test_apply_format_options_batch_size() {
        let (options, outcome) = apply(&[("orc.batch_size", "8192")]);
        outcome.unwrap();
        assert_eq!(options.read.batch_size, Some(8192));
    }

    #[test]
    fn test_apply_format_options_pushdown_predicate() {
        let (options, outcome) = apply(&[("orc.pushdown_predicate", "false")]);
        outcome.unwrap();
        assert!(!options.read.pushdown_predicate);
    }

    #[test]
    fn test_apply_format_options_metadata_size_hint() {
        let (options, outcome) = apply(&[("orc.metadata_size_hint", "1048576")]);
        outcome.unwrap();
        assert_eq!(options.read.metadata_size_hint, Some(1048576));
    }

    #[test]
    fn test_apply_format_options_invalid_batch_size() {
        let (_, outcome) = apply(&[("orc.batch_size", "not_a_number")]);
        assert!(outcome.is_err());
        let message = outcome.unwrap_err().to_string();
        assert!(message.contains("Invalid value for orc.batch_size"));
    }

    #[test]
    fn test_apply_format_options_invalid_bool() {
        let (_, outcome) = apply(&[("orc.pushdown_predicate", "maybe")]);
        assert!(outcome.is_err());
        let message = outcome.unwrap_err().to_string();
        assert!(message.contains("Invalid value for orc.pushdown_predicate"));
    }

    #[test]
    fn test_apply_format_options_multiple() {
        let (options, outcome) = apply(&[
            ("orc.batch_size", "16384"),
            ("orc.pushdown_predicate", "true"),
            ("orc.metadata_size_hint", "2097152"),
        ]);
        outcome.unwrap();

        assert_eq!(options.read.batch_size, Some(16384));
        assert!(options.read.pushdown_predicate);
        assert_eq!(options.read.metadata_size_hint, Some(2097152));
    }

    #[test]
    fn test_apply_format_options_unknown_key() {
        // Unknown options should be silently ignored for now
        let (_, outcome) = apply(&[("orc.unknown_option", "value")]);
        assert!(outcome.is_ok());
    }
}