dsq_formats/
lib.rs

1//! dsq-formats: File format support for dsq
2//!
3//! This crate provides comprehensive support for reading and writing various
4//! structured data formats including CSV, Parquet, JSON, and more.
5//!
6//! # Features
7//!
8//! - **Format Detection**: Automatic format detection from file extensions and content
9//! - **Unified Interface**: Consistent reader/writer traits across all formats
10//! - **Performance**: Optimized implementations using Polars DataFrames
11//! - **Extensibility**: Easy to add new formats with macro-based boilerplate reduction
12//!
13//! # Supported Formats
14//!
15//! ## Input Formats
16//! - **CSV** (`.csv`) - Comma-separated values with customizable options
17//! - **TSV** (`.tsv`) - Tab-separated values
18//! - **Parquet** (`.parquet`) - Columnar storage with compression
19//! - **JSON** (`.json`) - Standard JSON arrays and objects
20//! - **JSON Lines** (`.jsonl`, `.ndjson`) - Newline-delimited JSON
21//! - **Arrow** (`.arrow`) - Apache Arrow IPC format
//! - **Avro** (`.avro`) - Apache Avro serialization
//! - **ADT** - ASCII Delimited Text
23//!
24//! ## Output Formats
25//! All input formats plus:
26//! - **Excel** (`.xlsx`) - Microsoft Excel format
27//! - **ORC** (`.orc`) - Optimized Row Columnar format
28//!
29//! # Architecture
30//!
31//! The format system is built around:
32//! - [`DataFormat`] - Enum representing all supported formats
33//! - [`DataReader`] / [`DataWriter`] - Traits for reading/writing data
34//! - Format-specific implementations with consistent option structs
35//! - Macros to reduce boilerplate for new format implementations
36
37// Re-export shared types
38pub use dsq_shared::{BuildInfo, VERSION};
39
40// Core modules
41/// Error types and result handling
42pub mod error;
43/// File format detection and metadata
44pub mod format;
45
46// Format implementations
47/// ADT (ASCII Delimited Text) format reading and writing
48pub mod adt;
49/// CSV format reading and writing
50pub mod csv;
51/// JSON format reading and writing
52pub mod json;
53/// Parquet format reading and writing
54pub mod parquet;
55
56// Generic reader/writer interfaces
57/// Generic data reader interface
58pub mod reader;
59/// Generic data writer interface
60pub mod writer;
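// NOTE(review): stale leftover — "Old writer implementation (for testing)". No such module is
// declared here; demoted from `///` so rustdoc does not attach it to the re-export below.

// Re-export main types for convenience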
63pub use error::{Error, FormatError, Result};
64pub use format::{detect_format_from_content, DataFormat, FormatOptions};
65#[cfg(any(
66    feature = "csv",
67    feature = "json",
68    feature = "parquet",
69    feature = "avro"
70))]
71pub use reader::{
72    from_memory, from_path, from_path_with_format, DataReader, FileReader, MemoryReader,
73};
74pub use reader::{FormatReadOptions, ReadOptions};
75#[cfg(any(
76    feature = "csv",
77    feature = "json",
78    feature = "parquet",
79    feature = "avro"
80))]
81pub use writer::{to_memory, to_path, to_path_with_format, DataWriter, FileWriter, MemoryWriter};
82pub use writer::{
83    AvroCompression, CompressionLevel, CsvEncoding, FormatWriteOptions, OrcCompression,
84    WriteOptions,
85};
86
87#[cfg(feature = "parquet")]
88pub use writer::ParquetCompression;
89
90// Deserialize/serialize functions
91#[cfg(feature = "csv")]
92pub use reader::deserialize_csv;
93#[cfg(feature = "json")]
94pub use reader::deserialize_json;
95#[cfg(any(
96    feature = "csv",
97    feature = "json",
98    feature = "parquet",
99    feature = "avro"
100))]
101pub use reader::{deserialize, deserialize_adt, from_csv, from_json};
102
103#[cfg(feature = "parquet")]
104pub use reader::deserialize_parquet;
105
106#[cfg(feature = "csv")]
107pub use writer::serialize_csv;
108#[cfg(feature = "json")]
109pub use writer::serialize_json;
110#[cfg(any(
111    feature = "csv",
112    feature = "json",
113    feature = "parquet",
114    feature = "avro"
115))]
116pub use writer::{serialize, serialize_adt};
117
118#[cfg(feature = "parquet")]
119pub use writer::serialize_parquet;
120
121#[cfg(feature = "avro")]
122pub use writer::serialize_avro;
123
124// Format-specific re-exports
125#[cfg(feature = "csv")]
126pub use csv::{
127    detect_csv_format, read_csv_file, read_csv_file_with_options, write_csv_file,
128    write_csv_file_with_options, CsvReader, CsvWriteOptions, CsvWriter,
129    DsqCsvReadOptions as CsvReadOptions,
130};
131
132#[cfg(feature = "json")]
133pub use json::{
134    detect_json_format, read_json_file, read_json_file_with_options, read_jsonl_file,
135    write_json_file, write_json_file_with_options, write_jsonl_file, JsonReadOptions, JsonReader,
136    JsonWriteOptions, JsonWriter,
137};
138
139#[cfg(feature = "parquet")]
140pub use parquet::{
141    detect_parquet_format, read_parquet_file, read_parquet_file_lazy,
142    read_parquet_file_lazy_with_options, read_parquet_file_with_options, write_parquet_file,
143    write_parquet_file_with_options, ParquetReadOptions, ParquetReader, ParquetWriteOptions,
144    ParquetWriter,
145};
146
147#[cfg(any(
148    feature = "csv",
149    feature = "json",
150    feature = "parquet",
151    feature = "avro"
152))]
153pub use adt::{detect_adt_format, AdtReadOptions, AdtWriteOptions};
154
155/// Build information for dsq-formats
156pub const BUILD_INFO: BuildInfo = BuildInfo {
157    version: VERSION,
158    git_hash: option_env!("VERGEN_GIT_SHA"),
159    build_date: option_env!("VERGEN_BUILD_TIMESTAMP"),
160    rust_version: option_env!("VERGEN_RUSTC_SEMVER"),
161    features: &[
162        #[cfg(feature = "csv")]
163        "csv",
164        #[cfg(feature = "json")]
165        "json",
166        #[cfg(feature = "parquet")]
167        "parquet",
168        #[cfg(feature = "avro")]
169        "avro",
170    ],
171};
172
#[cfg(test)]
mod tests {
    use super::*;

    /// `BUILD_INFO` carries the shared version and names every enabled format feature.
    #[test]
    fn test_build_info() {
        assert_eq!(BUILD_INFO.version, VERSION);

        // Each feature compiled into this build must show up in the list.
        let enabled = BUILD_INFO.features;
        #[cfg(feature = "csv")]
        assert!(enabled.contains(&"csv"));
        #[cfg(feature = "json")]
        assert!(enabled.contains(&"json"));
        #[cfg(feature = "parquet")]
        assert!(enabled.contains(&"parquet"));
        #[cfg(feature = "avro")]
        assert!(enabled.contains(&"avro"));
    }

    /// The core types, and the per-format option types, resolve from the crate root.
    #[test]
    fn test_re_exports() {
        // Always-available re-exports.
        let _err: Error = Error::operation("test");
        let _fmt: DataFormat = DataFormat::Csv;
        let _read_opts: ReadOptions = ReadOptions::default();
        let _write_opts: WriteOptions = WriteOptions::default();

        // Per-format option types exist only when their feature is enabled.
        #[cfg(feature = "csv")]
        {
            let _read: CsvReadOptions = CsvReadOptions::default();
            let _write: CsvWriteOptions = CsvWriteOptions::default();
        }

        #[cfg(feature = "json")]
        {
            let _read: JsonReadOptions = JsonReadOptions::default();
            let _write: JsonWriteOptions = JsonWriteOptions::default();
        }

        #[cfg(feature = "parquet")]
        {
            let _read: ParquetReadOptions = ParquetReadOptions::default();
            let _write: ParquetWriteOptions = ParquetWriteOptions::default();
        }
    }

    /// `detect_format_from_content` is reachable from the crate root and recognises JSON.
    #[test]
    fn test_format_detection_re_export() {
        let payload = b"{\"test\": \"data\"}";
        let detected = detect_format_from_content(payload);
        assert_eq!(detected, Some(DataFormat::Json));
    }

    /// The reader/writer constructors are reachable from the crate root.
    #[cfg(any(
        feature = "csv",
        feature = "json",
        feature = "parquet",
        feature = "avro"
    ))]
    #[test]
    fn test_reader_writer_functions_re_export() {
        let read_opts = ReadOptions::default();

        // Building a reader from a path succeeds; the missing file only surfaces on read.
        let opened = from_path("nonexistent.csv");
        assert!(opened.is_ok());
        let mut path_reader = opened.unwrap();
        let first_read = path_reader.read(&read_opts);
        assert!(first_read.is_err());

        // With an explicit format the reader is returned directly (no `Result`).
        let mut explicit_reader = from_path_with_format("nonexistent.csv", DataFormat::Csv);
        let second_read = explicit_reader.read(&read_opts);
        assert!(second_read.is_err());

        // An in-memory reader is likewise returned directly.
        let bytes = Vec::new();
        let _memory_reader = from_memory(bytes, DataFormat::Csv);

        // A recognised extension is enough to build a writer from a path.
        let path_writer = to_path("nonexistent.csv");
        assert!(path_writer.is_ok());

        // Returned directly; the file is only created once something is written.
        let _explicit_writer = to_path_with_format("nonexistent.csv", DataFormat::Csv);

        // Returned directly as a `MemoryWriter`, not wrapped in a `Result`.
        let _memory_writer = to_memory(DataFormat::Csv);
    }

    /// `FormatOptions` is reachable from the crate root and has a default value.
    #[test]
    fn test_format_options_re_export() {
        let _defaults: FormatOptions = Default::default();
    }
}