1use crate::error::{Error, FormatError, Result};
2use std::path::Path;
3use std::str::FromStr;
4
5#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
7#[cfg_attr(feature = "cli", derive(clap::ValueEnum))]
8pub enum DataFormat {
9 Csv,
11 Tsv,
13 #[cfg_attr(feature = "cli", value(name = "adt", alias = "ascii-delimited"))]
15 Adt,
16 Parquet,
18 Avro,
20 #[cfg_attr(
22 feature = "cli",
23 value(name = "json-lines", alias = "jsonl", alias = "ndjson")
24 )]
25 JsonLines,
26 Arrow,
28 Json,
30 #[cfg_attr(feature = "cli", value(name = "jsonc", alias = "json-compact"))]
32 JsonCompact,
33 Excel,
35 Orc,
37}
38
39impl DataFormat {
40 pub fn from_path(path: &Path) -> Result<Self> {
42 let ext = path.extension().and_then(|e| e.to_str()).ok_or_else(|| {
43 Error::Format(FormatError::DetectionFailed(path.display().to_string()))
44 })?;
45
46 Self::from_extension(ext)
47 }
48
49 pub fn from_extension(ext: &str) -> Result<Self> {
51 match ext.to_lowercase().as_str() {
52 "csv" => Ok(Self::Csv),
53 "tsv" => Ok(Self::Tsv),
54 "adt" => Ok(Self::Adt),
55 "parquet" => Ok(Self::Parquet),
56 "avro" => Ok(Self::Avro),
57 "jsonl" | "ndjson" => Ok(Self::JsonLines),
58 "arrow" => Ok(Self::Arrow),
59 "json" => Ok(Self::Json),
60 "jsonc" => Ok(Self::JsonCompact),
61 "xlsx" => Ok(Self::Excel),
62 "orc" => Ok(Self::Orc),
63 _ => Err(Error::Format(FormatError::Unknown(ext.to_string()))),
64 }
65 }
66
67 pub fn parse(s: &str) -> Result<Self> {
69 match s.to_lowercase().as_str() {
70 "csv" => Ok(Self::Csv),
71 "tsv" => Ok(Self::Tsv),
72 "adt" | "ascii-delimited" => Ok(Self::Adt),
73 "parquet" => Ok(Self::Parquet),
74 "avro" => Ok(Self::Avro),
75 "jsonl" | "json-lines" | "ndjson" => Ok(Self::JsonLines),
76 "arrow" => Ok(Self::Arrow),
77 "json" => Ok(Self::Json),
78 "jsonc" | "json-compact" => Ok(Self::JsonCompact),
79 "excel" | "xlsx" => Ok(Self::Excel),
80 "orc" => Ok(Self::Orc),
81 _ => Err(Error::Format(FormatError::Unknown(s.to_string()))),
82 }
83 }
84
85 pub fn default_extension(&self) -> &'static str {
87 match self {
88 Self::Csv => "csv",
89 Self::Tsv => "tsv",
90 Self::Adt => "adt",
91 Self::Parquet => "parquet",
92 Self::Avro => "avro",
93 Self::JsonLines => "jsonl",
94 Self::Arrow => "arrow",
95 Self::Json => "json",
96 Self::JsonCompact => "jsonc",
97 Self::Excel => "xlsx",
98 Self::Orc => "orc",
99 }
100 }
101
102 pub fn supports_reading(&self) -> bool {
104 match self {
105 Self::Csv
106 | Self::Tsv
107 | Self::Adt
108 | Self::Parquet
109 | Self::Avro
110 | Self::JsonLines
111 | Self::Arrow
112 | Self::Json
113 | Self::JsonCompact => true,
114 Self::Excel | Self::Orc => false,
115 }
116 }
117
118 pub fn supports_writing(&self) -> bool {
120 true }
122
123 pub fn supports_lazy_reading(&self) -> bool {
125 match self {
126 Self::Csv | Self::Adt | Self::Parquet | Self::JsonLines => true,
127 Self::Tsv
128 | Self::Avro
129 | Self::Arrow
130 | Self::Json
131 | Self::JsonCompact
132 | Self::Excel
133 | Self::Orc => false,
134 }
135 }
136
137 pub fn supports_streaming(&self) -> bool {
139 match self {
140 Self::Csv | Self::Tsv | Self::Adt | Self::JsonLines => true,
141 Self::Parquet
142 | Self::Avro
143 | Self::Arrow
144 | Self::Json
145 | Self::JsonCompact
146 | Self::Excel
147 | Self::Orc => false,
148 }
149 }
150
151 pub fn display_name(&self) -> &'static str {
153 match self {
154 Self::Csv => "CSV",
155 Self::Tsv => "TSV",
156 Self::Adt => "ASCII Delimited Text",
157 Self::Parquet => "Parquet",
158 Self::Avro => "Avro",
159 Self::JsonLines => "JSON Lines",
160 Self::Arrow => "Arrow",
161 Self::Json => "JSON",
162 Self::JsonCompact => "JSON Compact",
163 Self::Excel => "Excel",
164 Self::Orc => "ORC",
165 }
166 }
167}
168
169impl std::fmt::Display for DataFormat {
170 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
171 write!(f, "{}", self.display_name())
172 }
173}
174
175impl FromStr for DataFormat {
176 type Err = String;
177
178 fn from_str(s: &str) -> std::result::Result<Self, Self::Err> {
179 Self::parse(s).map_err(|e| e.to_string())
180 }
181}
182
/// Options for format detection.
///
/// NOTE(review): the field descriptions below are inferred from the field names;
/// confirm them against the code that consumes these options.
///
/// Derives `PartialEq`/`Eq` so two option sets can be compared directly.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct FormatOptions {
    /// Presumably toggles content-based detection. Defaults to `true`.
    pub content_detection: bool,
    /// Presumably the number of leading bytes sampled for content-based detection.
    /// Defaults to `8192`.
    pub detection_bytes: usize,
}

impl Default for FormatOptions {
    /// Returns options with `content_detection` set to `true` and `detection_bytes`
    /// set to `8192`.
    fn default() -> Self {
        Self {
            content_detection: true,
            detection_bytes: 8192,
        }
    }
}
200
201pub fn detect_format_from_content(bytes: &[u8]) -> Option<DataFormat> {
203 if bytes.is_empty() {
205 return Some(DataFormat::Csv);
206 }
207
208 if bytes.len() >= 4 && &bytes[0..4] == b"PAR1" {
210 return Some(DataFormat::Parquet);
211 }
212
213 if bytes.len() >= 4 && &bytes[0..4] == b"Obj\x01" {
215 return Some(DataFormat::Avro);
216 }
217
218 if bytes.len() >= 8 && &bytes[0..6] == b"ARROW1" {
220 return Some(DataFormat::Arrow);
221 }
222
223 if bytes.len() >= 3 && &bytes[0..3] == b"ORC" {
225 return Some(DataFormat::Orc);
226 }
227
228 if let Ok(text) = std::str::from_utf8(bytes) {
230 if serde_json::from_str::<serde_json::Value>(text).is_ok() {
232 return Some(DataFormat::Json);
233 }
234
235 let lines: Vec<&str> = text.lines().take(5).collect();
237 if !lines.is_empty() {
238 let mut valid_json_lines = 0;
239 let mut total_lines = 0;
240 for line in &lines {
241 let line = line.trim();
242 if !line.is_empty() {
243 total_lines += 1;
244 if serde_json::from_str::<serde_json::Value>(line).is_ok() {
245 valid_json_lines += 1;
246 }
247 }
248 }
249 if valid_json_lines == total_lines && total_lines > 0 {
250 return Some(DataFormat::JsonLines);
251 }
252 }
253
254 let lines: Vec<&str> = text.lines().take(5).collect();
256 if lines.len() >= 2 {
257 let comma_counts: Vec<usize> =
258 lines.iter().map(|line| line.matches(',').count()).collect();
259 let tab_counts: Vec<usize> = lines
260 .iter()
261 .map(|line| line.matches('\t').count())
262 .collect();
263
264 let comma_consistent = comma_counts.windows(2).all(|w| w[0] == w[1] && w[0] > 0);
266 let tab_consistent = tab_counts.windows(2).all(|w| w[0] == w[1] && w[0] > 0);
267
268 if tab_consistent && (!comma_consistent || tab_counts[0] > comma_counts[0]) {
269 return Some(DataFormat::Tsv);
270 } else if comma_consistent {
271 return Some(DataFormat::Csv);
272 }
273 }
274 }
275
276 None
277}
278
#[cfg(test)]
mod tests {
    use super::*;

    /// Recognised extensions map to the expected format, case-insensitively.
    #[test]
    fn test_format_from_extension() {
        let recognised = [
            ("csv", DataFormat::Csv),
            ("CSV", DataFormat::Csv),
            ("tsv", DataFormat::Tsv),
            ("adt", DataFormat::Adt),
            ("parquet", DataFormat::Parquet),
            ("avro", DataFormat::Avro),
            ("jsonl", DataFormat::JsonLines),
            ("ndjson", DataFormat::JsonLines),
            ("arrow", DataFormat::Arrow),
            ("json", DataFormat::Json),
            ("jsonc", DataFormat::JsonCompact),
            ("xlsx", DataFormat::Excel),
            ("orc", DataFormat::Orc),
        ];
        for &(ext, expected) in recognised.iter() {
            assert_eq!(
                DataFormat::from_extension(ext).unwrap(),
                expected,
                "extension {:?}",
                ext
            );
        }
        assert!(DataFormat::from_extension("unknown").is_err());
    }

    /// `FromStr::from_str` accepts both short names and long aliases.
    #[test]
    fn test_format_from_str() {
        let recognised = [
            ("csv", DataFormat::Csv),
            ("tsv", DataFormat::Tsv),
            ("adt", DataFormat::Adt),
            ("ascii-delimited", DataFormat::Adt),
            ("parquet", DataFormat::Parquet),
            ("avro", DataFormat::Avro),
            ("jsonl", DataFormat::JsonLines),
            ("json-lines", DataFormat::JsonLines),
            ("ndjson", DataFormat::JsonLines),
            ("arrow", DataFormat::Arrow),
            ("json", DataFormat::Json),
            ("jsonc", DataFormat::JsonCompact),
            ("json-compact", DataFormat::JsonCompact),
            ("excel", DataFormat::Excel),
            ("xlsx", DataFormat::Excel),
            ("orc", DataFormat::Orc),
        ];
        for &(name, expected) in recognised.iter() {
            assert_eq!(
                DataFormat::from_str(name).unwrap(),
                expected,
                "name {:?}",
                name
            );
        }
        assert!(DataFormat::from_str("invalid").is_err());
    }

    /// Capability matrix: (format, reading, writing, lazy reading, streaming).
    #[test]
    fn test_format_capabilities() {
        let matrix = [
            (DataFormat::Csv, true, true, true, true),
            (DataFormat::Tsv, true, true, false, true),
            (DataFormat::Adt, true, true, true, true),
            (DataFormat::Parquet, true, true, true, false),
            (DataFormat::Avro, true, true, false, false),
            (DataFormat::JsonLines, true, true, true, true),
            (DataFormat::Arrow, true, true, false, false),
            (DataFormat::Json, true, true, false, false),
            (DataFormat::JsonCompact, true, true, false, false),
            (DataFormat::Excel, false, true, false, false),
            (DataFormat::Orc, false, true, false, false),
        ];
        for &(format, reading, writing, lazy, streaming) in matrix.iter() {
            assert_eq!(format.supports_reading(), reading, "reading: {:?}", format);
            assert_eq!(format.supports_writing(), writing, "writing: {:?}", format);
            assert_eq!(format.supports_lazy_reading(), lazy, "lazy: {:?}", format);
            assert_eq!(
                format.supports_streaming(),
                streaming,
                "streaming: {:?}",
                format
            );
        }
    }

    /// Magic bytes, JSON, JSON Lines and delimiter heuristics.
    #[test]
    fn test_content_detection() {
        let samples: &[(&[u8], Option<DataFormat>)] = &[
            // Binary formats identified by magic prefix.
            (b"PAR1", Some(DataFormat::Parquet)),
            (b"Obj\x01", Some(DataFormat::Avro)),
            (b"ARROW1\x00\x00", Some(DataFormat::Arrow)),
            (b"ORC", Some(DataFormat::Orc)),
            // JSON documents and JSON Lines.
            (b"[{\"a\": 1}]", Some(DataFormat::Json)),
            (b"{\"a\": 1}", Some(DataFormat::Json)),
            (b"{\"a\": 1}\n{\"b\": 2}", Some(DataFormat::JsonLines)),
            // Delimited text.
            (b"a,b,c\n1,2,3\n4,5,6", Some(DataFormat::Csv)),
            (b"a\tb\tc\n1\t2\t3\n4\t5\t6", Some(DataFormat::Tsv)),
            // Nothing recognisable.
            (b"random data", None),
        ];
        for &(content, expected) in samples.iter() {
            assert_eq!(
                detect_format_from_content(content),
                expected,
                "content {:?}",
                content
            );
        }
    }

    /// Detection from a path uses the extension and fails without a usable one.
    #[test]
    fn test_from_path() {
        use std::path::Path;

        let recognised = [
            ("file.csv", DataFormat::Csv),
            ("file.CSV", DataFormat::Csv),
            ("file.tsv", DataFormat::Tsv),
            ("file.parquet", DataFormat::Parquet),
            ("file.jsonl", DataFormat::JsonLines),
            ("file.json", DataFormat::Json),
            ("file.xlsx", DataFormat::Excel),
        ];
        for &(name, expected) in recognised.iter() {
            assert_eq!(
                DataFormat::from_path(Path::new(name)).unwrap(),
                expected,
                "path {:?}",
                name
            );
        }

        for &name in ["file", "file.unknown"].iter() {
            assert!(
                DataFormat::from_path(Path::new(name)).is_err(),
                "path {:?}",
                name
            );
        }
    }

    #[test]
    fn test_default_extension() {
        let expected = [
            (DataFormat::Csv, "csv"),
            (DataFormat::Tsv, "tsv"),
            (DataFormat::Adt, "adt"),
            (DataFormat::Parquet, "parquet"),
            (DataFormat::Avro, "avro"),
            (DataFormat::JsonLines, "jsonl"),
            (DataFormat::Arrow, "arrow"),
            (DataFormat::Json, "json"),
            (DataFormat::JsonCompact, "jsonc"),
            (DataFormat::Excel, "xlsx"),
            (DataFormat::Orc, "orc"),
        ];
        for &(format, ext) in expected.iter() {
            assert_eq!(format.default_extension(), ext, "format {:?}", format);
        }
    }

    #[test]
    fn test_display_name() {
        let expected = [
            (DataFormat::Csv, "CSV"),
            (DataFormat::Tsv, "TSV"),
            (DataFormat::Adt, "ASCII Delimited Text"),
            (DataFormat::Parquet, "Parquet"),
            (DataFormat::Avro, "Avro"),
            (DataFormat::JsonLines, "JSON Lines"),
            (DataFormat::Arrow, "Arrow"),
            (DataFormat::Json, "JSON"),
            (DataFormat::JsonCompact, "JSON Compact"),
            (DataFormat::Excel, "Excel"),
            (DataFormat::Orc, "ORC"),
        ];
        for &(format, name) in expected.iter() {
            assert_eq!(format.display_name(), name, "format {:?}", format);
        }
    }

    #[test]
    fn test_display_trait() {
        let expected = [
            (DataFormat::Csv, "CSV"),
            (DataFormat::JsonLines, "JSON Lines"),
            (DataFormat::Excel, "Excel"),
        ];
        for &(format, rendered) in expected.iter() {
            assert_eq!(format.to_string(), rendered, "format {:?}", format);
        }
    }

    #[test]
    fn test_from_str_trait() {
        let recognised = [
            ("csv", DataFormat::Csv),
            ("json-lines", DataFormat::JsonLines),
            ("excel", DataFormat::Excel),
        ];
        for &(name, expected) in recognised.iter() {
            assert_eq!(
                name.parse::<DataFormat>().unwrap(),
                expected,
                "name {:?}",
                name
            );
        }
        assert!("invalid".parse::<DataFormat>().is_err());
    }

    #[test]
    fn test_format_options_default() {
        let FormatOptions {
            content_detection,
            detection_bytes,
        } = FormatOptions::default();
        assert!(content_detection);
        assert_eq!(detection_bytes, 8192);
    }
}