datafusion_datasource_orc/
lib.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! ORC datasource for Apache DataFusion.
19//!
20//! This crate provides DataFusion [`FileFormat`] and [`FileSource`] implementations
21//! backed by [`orc-rust`]. It integrates with DataFusion's listing tables and
22//! reads ORC files asynchronously via [`object_store`].
23//!
24//! # Features
25//!
26//! - **Schema Inference**: Automatically infer table schema from ORC files
27//! - **Statistics Extraction**: Extract file statistics (row count, file size)
28//! - **Projection Pushdown**: Read only the columns needed by the query
29//! - **Limit Pushdown**: Stop reading after the required number of rows
30//! - **Predicate Pushdown**: Filter data at stripe level using ORC row indexes
31//! - **Multi-file Support**: Read from multiple ORC files with schema merging
32//! - **Async I/O**: Fully async reading via `object_store`
33//!
34//! # Quick Start
35//!
36//! ## Using with DataFusion SessionContext
37//!
38//! ```rust,ignore
39//! use datafusion::prelude::*;
40//! use datafusion::datasource::listing::{
41//!     ListingOptions, ListingTable, ListingTableConfig, ListingTableUrl,
42//! };
43//! use datafusion_datasource_orc::OrcFormat;
44//! use std::sync::Arc;
45//!
46//! #[tokio::main]
47//! async fn main() -> datafusion::error::Result<()> {
48//!     // Create a SessionContext
49//!     let ctx = SessionContext::new();
50//!
51//!     // Configure listing options with ORC format
52//!     let listing_options = ListingOptions::new(Arc::new(OrcFormat::new()))
53//!         .with_file_extension(".orc");
54//!
55//!     // Create a listing table URL
56//!     let table_path = ListingTableUrl::parse("file:///path/to/orc/files/")?;
57//!
58//!     // Infer schema from the ORC files
59//!     let schema = listing_options
60//!         .infer_schema(&ctx.state(), &table_path)
61//!         .await?;
62//!
63//!     // Create and register the table
64//!     let config = ListingTableConfig::new(table_path)
65//!         .with_listing_options(listing_options)
66//!         .with_schema(schema);
67//!     let table = ListingTable::try_new(config)?;
68//!     ctx.register_table("my_orc_table", Arc::new(table))?;
69//!
70//!     // Query the table
71//!     let df = ctx.sql("SELECT * FROM my_orc_table WHERE id > 100").await?;
72//!     df.show().await?;
73//!
74//!     Ok(())
75//! }
76//! ```
77//!
78//! ## Configuring Read Options
79//!
80//! ```rust
81//! use datafusion_datasource_orc::{OrcFormat, OrcFormatOptions, OrcReadOptions};
82//!
83//! // Create read options
84//! let read_options = OrcReadOptions::default()
85//!     .with_batch_size(16384)           // Custom batch size
86//!     .with_pushdown_predicate(true)    // Enable predicate pushdown
87//!     .with_metadata_size_hint(1048576); // 1 MiB metadata size hint
88//!
89//! // Create format with options
90//! let format_options = OrcFormatOptions { read: read_options };
91//! let format = OrcFormat::new().with_options(format_options);
92//! ```
93//!
94//! # Architecture
95//!
96//! This crate follows DataFusion's file format abstraction:
97//!
98//! ```text
99//! ┌─────────────────────┐
100//! │   OrcFormatFactory  │  Creates OrcFormat instances
101//! └──────────┬──────────┘
102//!            │
103//! ┌──────────▼──────────┐
104//! │      OrcFormat      │  FileFormat trait implementation
105//! └──────────┬──────────┘
106//!            │ create_physical_plan()
107//! ┌──────────▼──────────┐
108//! │      OrcSource      │  FileSource trait implementation
109//! └──────────┬──────────┘
110//!            │ create_file_opener()
111//! ┌──────────▼──────────┐
112//! │      OrcOpener      │  Opens files and creates streams
113//! └──────────┬──────────┘
114//!            │
115//! ┌──────────▼─────────────┐
116//! │ ObjectStoreChunkReader │  Adapts object_store to orc-rust
117//! └──────────┬─────────────┘
118//!            │
119//! ┌──────────▼──────────┐
120//! │  orc-rust Reader    │  ORC file parsing
121//! └─────────────────────┘
122//! ```
123//!
124//! # Predicate Pushdown
125//!
126//! When a filter predicate is provided, this crate converts supported DataFusion
127//! expressions to `orc-rust` predicates for stripe-level filtering:
128//!
129//! - Comparison operators: `=`, `!=`, `<`, `<=`, `>`, `>=`
130//! - Logical operators: `AND`, `OR`, `NOT`
131//! - Null checks: `IS NULL`, `IS NOT NULL`
132//!
133//! Unsupported predicates are gracefully ignored (no error), and filtering
134//! falls back to DataFusion's row-level evaluation.
135//!
136//! # Supported Data Types
137//!
138//! The following ORC types are supported via `orc-rust`:
139//!
140//! | ORC Type | Arrow Type |
141//! |----------|------------|
142//! | BOOLEAN | Boolean |
143//! | BYTE | Int8 |
144//! | SHORT | Int16 |
145//! | INT | Int32 |
146//! | LONG | Int64 |
147//! | FLOAT | Float32 |
148//! | DOUBLE | Float64 |
149//! | STRING | Utf8 |
150//! | BINARY | Binary |
151//! | DECIMAL | Decimal128 |
152//! | DATE | Date32 |
153//! | TIMESTAMP | Timestamp |
154//! | LIST | List |
155//! | MAP | Map |
156//! | STRUCT | Struct |
157//!
158//! [`FileFormat`]: datafusion_datasource::file_format::FileFormat
159//! [`FileSource`]: datafusion_datasource::file::FileSource
160//! [`orc-rust`]: https://github.com/datafusion-contrib/orc-rust
161//! [`object_store`]: https://docs.rs/object_store
162
// Crate-wide lints: reject any public item that lacks documentation, and
// reject rustdoc intra-doc links that fail to resolve.
163#![deny(missing_docs)]
164#![deny(rustdoc::broken_intra_doc_links)]
165
// Public modules. `file_format`, `metrics`, `options` and `source` each have
// their main types re-exported at the crate root below; nothing from
// `metadata` is re-exported here, so it is reachable only by its module path.
// NOTE(review): `metadata` presumably holds the ORC metadata / statistics
// helpers — confirm against that module's own docs.
166pub mod file_format;
167pub mod metadata;
168pub mod metrics;
169pub mod options;
170pub mod source;
171
// Private implementation modules. Only `reader` contributes to the public
// API (through the `ObjectStoreChunkReader` re-export below).
// NOTE(review): `opener` and `predicate` presumably contain the file opener
// and the DataFusion-to-orc-rust predicate conversion described in the crate
// docs; `writer` is not described there — confirm.
172mod opener;
173mod predicate;
174mod reader;
175mod writer;
176
// Crate-root re-exports so users can write `datafusion_datasource_orc::OrcFormat`
// and friends without naming the submodule. Because `reader` is a private
// module, `ObjectStoreChunkReader` is reachable only through this re-export.
177// Re-export main types
178pub use file_format::{OrcFormat, OrcFormatFactory};
179pub use metrics::OrcFileMetrics;
180pub use options::{OrcFormatOptions, OrcReadOptions};
181pub use reader::ObjectStoreChunkReader;
182pub use source::OrcSource;