dsq_formats/
format.rs

1use crate::error::{Error, FormatError, Result};
2use std::path::Path;
3use std::str::FromStr;
4
/// Supported data formats for reading and writing
// NOTE(review): with the `cli` feature this enum also derives `clap::ValueEnum`. clap's derive
// is documented to use variant doc comments as the help text of each possible value, so the
// `///` lines below are likely user-visible CLI output — confirm before rewording them.
// Variants without an explicit `value(...)` attribute get whatever CLI name clap derives.
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
#[cfg_attr(feature = "cli", derive(clap::ValueEnum))]
pub enum DataFormat {
    /// Comma-separated values
    Csv,
    /// Tab-separated values
    Tsv,
    /// ASCII Delimited Text (using ASCII control characters 28-31)
    // CLI spelling is `adt`; `ascii-delimited` is accepted as an alias.
    #[cfg_attr(feature = "cli", value(name = "adt", alias = "ascii-delimited"))]
    Adt,
    /// Apache Parquet columnar format
    Parquet,
    /// Apache Avro row-based format
    Avro,
    /// JSON Lines (newline-delimited JSON)
    // CLI spelling is `json-lines`; `jsonl` and `ndjson` are accepted as aliases.
    #[cfg_attr(
        feature = "cli",
        value(name = "json-lines", alias = "jsonl", alias = "ndjson")
    )]
    JsonLines,
    /// Apache Arrow format
    Arrow,
    /// Standard JSON (array of objects)
    Json,
    /// Compact JSON (no pretty printing)
    // CLI spelling is `jsonc`; `json-compact` is accepted as an alias.
    #[cfg_attr(feature = "cli", value(name = "jsonc", alias = "json-compact"))]
    JsonCompact,
    /// Microsoft Excel format (output only)
    // "Output only": `DataFormat::supports_reading` returns `false` for this variant.
    Excel,
    /// Apache ORC columnar format (output only)
    // "Output only": `DataFormat::supports_reading` returns `false` for this variant.
    Orc,
}
38
39impl DataFormat {
40    /// Detect format from file extension
41    pub fn from_path(path: &Path) -> Result<Self> {
42        let ext = path.extension().and_then(|e| e.to_str()).ok_or_else(|| {
43            Error::Format(FormatError::DetectionFailed(path.display().to_string()))
44        })?;
45
46        Self::from_extension(ext)
47    }
48
49    /// Detect format from file extension string
50    pub fn from_extension(ext: &str) -> Result<Self> {
51        match ext.to_lowercase().as_str() {
52            "csv" => Ok(Self::Csv),
53            "tsv" => Ok(Self::Tsv),
54            "adt" => Ok(Self::Adt),
55            "parquet" => Ok(Self::Parquet),
56            "avro" => Ok(Self::Avro),
57            "jsonl" | "ndjson" => Ok(Self::JsonLines),
58            "arrow" => Ok(Self::Arrow),
59            "json" => Ok(Self::Json),
60            "jsonc" => Ok(Self::JsonCompact),
61            "xlsx" => Ok(Self::Excel),
62            "orc" => Ok(Self::Orc),
63            _ => Err(Error::Format(FormatError::Unknown(ext.to_string()))),
64        }
65    }
66
67    /// Parse format from string (for CLI arguments)
68    pub fn parse(s: &str) -> Result<Self> {
69        match s.to_lowercase().as_str() {
70            "csv" => Ok(Self::Csv),
71            "tsv" => Ok(Self::Tsv),
72            "adt" | "ascii-delimited" => Ok(Self::Adt),
73            "parquet" => Ok(Self::Parquet),
74            "avro" => Ok(Self::Avro),
75            "jsonl" | "json-lines" | "ndjson" => Ok(Self::JsonLines),
76            "arrow" => Ok(Self::Arrow),
77            "json" => Ok(Self::Json),
78            "jsonc" | "json-compact" => Ok(Self::JsonCompact),
79            "excel" | "xlsx" => Ok(Self::Excel),
80            "orc" => Ok(Self::Orc),
81            _ => Err(Error::Format(FormatError::Unknown(s.to_string()))),
82        }
83    }
84
85    /// Get the default file extension for this format
86    pub fn default_extension(&self) -> &'static str {
87        match self {
88            Self::Csv => "csv",
89            Self::Tsv => "tsv",
90            Self::Adt => "adt",
91            Self::Parquet => "parquet",
92            Self::Avro => "avro",
93            Self::JsonLines => "jsonl",
94            Self::Arrow => "arrow",
95            Self::Json => "json",
96            Self::JsonCompact => "jsonc",
97            Self::Excel => "xlsx",
98            Self::Orc => "orc",
99        }
100    }
101
102    /// Check if format supports reading
103    pub fn supports_reading(&self) -> bool {
104        match self {
105            Self::Csv
106            | Self::Tsv
107            | Self::Adt
108            | Self::Parquet
109            | Self::Avro
110            | Self::JsonLines
111            | Self::Arrow
112            | Self::Json
113            | Self::JsonCompact => true,
114            Self::Excel | Self::Orc => false,
115        }
116    }
117
118    /// Check if format supports writing
119    pub fn supports_writing(&self) -> bool {
120        true // All formats support writing
121    }
122
123    /// Check if format supports lazy reading
124    pub fn supports_lazy_reading(&self) -> bool {
125        match self {
126            Self::Csv | Self::Adt | Self::Parquet | Self::JsonLines => true,
127            Self::Tsv
128            | Self::Avro
129            | Self::Arrow
130            | Self::Json
131            | Self::JsonCompact
132            | Self::Excel
133            | Self::Orc => false,
134        }
135    }
136
137    /// Check if format supports streaming
138    pub fn supports_streaming(&self) -> bool {
139        match self {
140            Self::Csv | Self::Tsv | Self::Adt | Self::JsonLines => true,
141            Self::Parquet
142            | Self::Avro
143            | Self::Arrow
144            | Self::Json
145            | Self::JsonCompact
146            | Self::Excel
147            | Self::Orc => false,
148        }
149    }
150
151    /// Get human-readable format name
152    pub fn display_name(&self) -> &'static str {
153        match self {
154            Self::Csv => "CSV",
155            Self::Tsv => "TSV",
156            Self::Adt => "ASCII Delimited Text",
157            Self::Parquet => "Parquet",
158            Self::Avro => "Avro",
159            Self::JsonLines => "JSON Lines",
160            Self::Arrow => "Arrow",
161            Self::Json => "JSON",
162            Self::JsonCompact => "JSON Compact",
163            Self::Excel => "Excel",
164            Self::Orc => "ORC",
165        }
166    }
167}
168
169impl std::fmt::Display for DataFormat {
170    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
171        write!(f, "{}", self.display_name())
172    }
173}
174
175impl FromStr for DataFormat {
176    type Err = String;
177
178    fn from_str(s: &str) -> std::result::Result<Self, Self::Err> {
179        Self::parse(s).map_err(|e| e.to_string())
180    }
181}
182
/// Settings that control how a file's format is detected.
#[derive(Debug, Clone)]
pub struct FormatOptions {
    /// Fall back to inspecting the file content when the extension does not identify the format.
    pub content_detection: bool,
    /// Upper bound on the number of bytes read for content-based detection.
    pub detection_bytes: usize,
}

impl Default for FormatOptions {
    /// Content detection enabled, sampling at most 8192 bytes.
    fn default() -> Self {
        const DEFAULT_DETECTION_BYTES: usize = 8192;

        FormatOptions {
            content_detection: true,
            detection_bytes: DEFAULT_DETECTION_BYTES,
        }
    }
}
200
201/// Detect format from file content (magic bytes)
202pub fn detect_format_from_content(bytes: &[u8]) -> Option<DataFormat> {
203    // Handle empty input
204    if bytes.is_empty() {
205        return Some(DataFormat::Csv);
206    }
207
208    // Parquet magic bytes: "PAR1" at start and end
209    if bytes.len() >= 4 && &bytes[0..4] == b"PAR1" {
210        return Some(DataFormat::Parquet);
211    }
212
213    // Avro magic bytes: "Obj\x01"
214    if bytes.len() >= 4 && &bytes[0..4] == b"Obj\x01" {
215        return Some(DataFormat::Avro);
216    }
217
218    // Arrow magic bytes: "ARROW1\x00\x00"
219    if bytes.len() >= 8 && &bytes[0..6] == b"ARROW1" {
220        return Some(DataFormat::Arrow);
221    }
222
223    // ORC magic bytes: "ORC"
224    if bytes.len() >= 3 && &bytes[0..3] == b"ORC" {
225        return Some(DataFormat::Orc);
226    }
227
228    // Try to detect text-based formats
229    if let Ok(text) = std::str::from_utf8(bytes) {
230        // Try to detect JSON formats first
231        if serde_json::from_str::<serde_json::Value>(text).is_ok() {
232            return Some(DataFormat::Json);
233        }
234
235        // Check for JsonLines format (each line is a JSON value)
236        let lines: Vec<&str> = text.lines().take(5).collect();
237        if !lines.is_empty() {
238            let mut valid_json_lines = 0;
239            let mut total_lines = 0;
240            for line in &lines {
241                let line = line.trim();
242                if !line.is_empty() {
243                    total_lines += 1;
244                    if serde_json::from_str::<serde_json::Value>(line).is_ok() {
245                        valid_json_lines += 1;
246                    }
247                }
248            }
249            if valid_json_lines == total_lines && total_lines > 0 {
250                return Some(DataFormat::JsonLines);
251            }
252        }
253
254        // Try to detect CSV/TSV by counting delimiters in first few lines
255        let lines: Vec<&str> = text.lines().take(5).collect();
256        if lines.len() >= 2 {
257            let comma_counts: Vec<usize> =
258                lines.iter().map(|line| line.matches(',').count()).collect();
259            let tab_counts: Vec<usize> = lines
260                .iter()
261                .map(|line| line.matches('\t').count())
262                .collect();
263
264            // Check consistency of delimiter counts
265            let comma_consistent = comma_counts.windows(2).all(|w| w[0] == w[1] && w[0] > 0);
266            let tab_consistent = tab_counts.windows(2).all(|w| w[0] == w[1] && w[0] > 0);
267
268            if tab_consistent && (!comma_consistent || tab_counts[0] > comma_counts[0]) {
269                return Some(DataFormat::Tsv);
270            } else if comma_consistent {
271                return Some(DataFormat::Csv);
272            }
273        }
274    }
275
276    None
277}
278
#[cfg(test)]
mod tests {
    use super::*;

    /// Every recognised extension maps to its variant, case-insensitively; others are rejected.
    #[test]
    fn test_format_from_extension() {
        let cases = [
            ("csv", DataFormat::Csv),
            ("CSV", DataFormat::Csv),
            ("tsv", DataFormat::Tsv),
            ("adt", DataFormat::Adt),
            ("parquet", DataFormat::Parquet),
            ("avro", DataFormat::Avro),
            ("jsonl", DataFormat::JsonLines),
            ("ndjson", DataFormat::JsonLines),
            ("arrow", DataFormat::Arrow),
            ("json", DataFormat::Json),
            ("jsonc", DataFormat::JsonCompact),
            ("xlsx", DataFormat::Excel),
            ("orc", DataFormat::Orc),
        ];
        for &(ext, expected) in cases.iter() {
            assert_eq!(
                DataFormat::from_extension(ext).unwrap(),
                expected,
                "extension {:?}",
                ext
            );
        }
        assert!(DataFormat::from_extension("unknown").is_err());
    }

    /// `FromStr::from_str` accepts short names, long names and aliases.
    #[test]
    fn test_format_from_str() {
        let cases = [
            ("csv", DataFormat::Csv),
            ("tsv", DataFormat::Tsv),
            ("adt", DataFormat::Adt),
            ("ascii-delimited", DataFormat::Adt),
            ("parquet", DataFormat::Parquet),
            ("avro", DataFormat::Avro),
            ("jsonl", DataFormat::JsonLines),
            ("json-lines", DataFormat::JsonLines),
            ("ndjson", DataFormat::JsonLines),
            ("arrow", DataFormat::Arrow),
            ("json", DataFormat::Json),
            ("jsonc", DataFormat::JsonCompact),
            ("json-compact", DataFormat::JsonCompact),
            ("excel", DataFormat::Excel),
            ("xlsx", DataFormat::Excel),
            ("orc", DataFormat::Orc),
        ];
        for &(name, expected) in cases.iter() {
            assert_eq!(
                DataFormat::from_str(name).unwrap(),
                expected,
                "name {:?}",
                name
            );
        }
        assert!(DataFormat::from_str("invalid").is_err());
    }

    /// Capability matrix: (format, reading, writing, lazy reading, streaming).
    #[test]
    fn test_format_capabilities() {
        let matrix = [
            (DataFormat::Csv, true, true, true, true),
            (DataFormat::Tsv, true, true, false, true),
            (DataFormat::Adt, true, true, true, true),
            (DataFormat::Parquet, true, true, true, false),
            (DataFormat::Avro, true, true, false, false),
            (DataFormat::JsonLines, true, true, true, true),
            (DataFormat::Arrow, true, true, false, false),
            (DataFormat::Json, true, true, false, false),
            (DataFormat::JsonCompact, true, true, false, false),
            (DataFormat::Excel, false, true, false, false),
            (DataFormat::Orc, false, true, false, false),
        ];
        for &(format, reading, writing, lazy, streaming) in matrix.iter() {
            assert_eq!(format.supports_reading(), reading, "reading: {:?}", format);
            assert_eq!(format.supports_writing(), writing, "writing: {:?}", format);
            assert_eq!(
                format.supports_lazy_reading(),
                lazy,
                "lazy reading: {:?}",
                format
            );
            assert_eq!(
                format.supports_streaming(),
                streaming,
                "streaming: {:?}",
                format
            );
        }
    }

    /// Magic bytes, JSON flavours and delimiter heuristics are all recognised.
    #[test]
    fn test_content_detection() {
        fn check(sample: &[u8], expected: Option<DataFormat>) {
            assert_eq!(detect_format_from_content(sample), expected);
        }

        // Binary signatures.
        check(b"PAR1", Some(DataFormat::Parquet));
        check(b"Obj\x01", Some(DataFormat::Avro));
        check(b"ARROW1\x00\x00", Some(DataFormat::Arrow));
        check(b"ORC", Some(DataFormat::Orc));

        // JSON document versus JSON Lines.
        check(b"[{\"a\": 1}]", Some(DataFormat::Json));
        check(b"{\"a\": 1}", Some(DataFormat::Json));
        check(b"{\"a\": 1}\n{\"b\": 2}", Some(DataFormat::JsonLines));

        // Delimited text.
        check(b"a,b,c\n1,2,3\n4,5,6", Some(DataFormat::Csv));
        check(b"a\tb\tc\n1\t2\t3\n4\t5\t6", Some(DataFormat::Tsv));

        // Nothing recognisable.
        check(b"random data", None);
    }

    /// Detection by path uses the extension and fails without a known one.
    #[test]
    fn test_from_path() {
        use std::path::Path;

        let cases = [
            ("file.csv", DataFormat::Csv),
            ("file.CSV", DataFormat::Csv),
            ("file.tsv", DataFormat::Tsv),
            ("file.parquet", DataFormat::Parquet),
            ("file.jsonl", DataFormat::JsonLines),
            ("file.json", DataFormat::Json),
            ("file.xlsx", DataFormat::Excel),
        ];
        for &(name, expected) in cases.iter() {
            assert_eq!(
                DataFormat::from_path(Path::new(name)).unwrap(),
                expected,
                "path {:?}",
                name
            );
        }

        for name in ["file", "file.unknown"].iter() {
            assert!(
                DataFormat::from_path(Path::new(name)).is_err(),
                "path {:?}",
                name
            );
        }
    }

    #[test]
    fn test_default_extension() {
        let cases = [
            (DataFormat::Csv, "csv"),
            (DataFormat::Tsv, "tsv"),
            (DataFormat::Adt, "adt"),
            (DataFormat::Parquet, "parquet"),
            (DataFormat::Avro, "avro"),
            (DataFormat::JsonLines, "jsonl"),
            (DataFormat::Arrow, "arrow"),
            (DataFormat::Json, "json"),
            (DataFormat::JsonCompact, "jsonc"),
            (DataFormat::Excel, "xlsx"),
            (DataFormat::Orc, "orc"),
        ];
        for &(format, extension) in cases.iter() {
            assert_eq!(format.default_extension(), extension);
        }
    }

    #[test]
    fn test_display_name() {
        let cases = [
            (DataFormat::Csv, "CSV"),
            (DataFormat::Tsv, "TSV"),
            (DataFormat::Adt, "ASCII Delimited Text"),
            (DataFormat::Parquet, "Parquet"),
            (DataFormat::Avro, "Avro"),
            (DataFormat::JsonLines, "JSON Lines"),
            (DataFormat::Arrow, "Arrow"),
            (DataFormat::Json, "JSON"),
            (DataFormat::JsonCompact, "JSON Compact"),
            (DataFormat::Excel, "Excel"),
            (DataFormat::Orc, "ORC"),
        ];
        for &(format, name) in cases.iter() {
            assert_eq!(format.display_name(), name);
        }
    }

    /// `Display` prints the same text as `display_name`.
    #[test]
    fn test_display_trait() {
        assert_eq!(DataFormat::Csv.to_string(), "CSV");
        assert_eq!(DataFormat::JsonLines.to_string(), "JSON Lines");
        assert_eq!(DataFormat::Excel.to_string(), "Excel");
    }

    /// `str::parse` goes through the `FromStr` impl.
    #[test]
    fn test_from_str_trait() {
        let csv: DataFormat = "csv".parse().unwrap();
        assert_eq!(csv, DataFormat::Csv);

        let json_lines: DataFormat = "json-lines".parse().unwrap();
        assert_eq!(json_lines, DataFormat::JsonLines);

        let excel: DataFormat = "excel".parse().unwrap();
        assert_eq!(excel, DataFormat::Excel);

        assert!("invalid".parse::<DataFormat>().is_err());
    }

    #[test]
    fn test_format_options_default() {
        let FormatOptions {
            content_detection,
            detection_bytes,
        } = FormatOptions::default();
        assert!(content_detection);
        assert_eq!(detection_bytes, 8192);
    }
}