1use std::fmt;
2use std::path::Path;
3
4use crate::error::{DonglerError, Result};
5
/// Input file formats distinguished by [`InputFormat::detect_path`].
///
/// The extensions listed on each variant are the ones matched (case-insensitively)
/// by `detect_path`. The "legacy" variants share a display label with their modern
/// counterpart in `as_str` but report a different `extraction_status`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum InputFormat {
    /// `txt`, `text`, `md`, `markdown`, `tex`, `latex`, `ltx`.
    Text,
    /// `pdf`.
    Pdf,
    /// `xlsx`.
    Excel,
    /// `xls`.
    LegacyExcel,
    /// `docx`.
    Word,
    /// `doc`.
    LegacyWord,
    /// `pptx`.
    Presentation,
    /// `ppt`.
    LegacyPresentation,
    /// `odt`, `ods`, `odp`.
    OpenDocument,
    /// `tar`, `tgz`, `gz`, `zip`.
    Archive,
    /// `html`, `htm`.
    Html,
    /// `png`, `jpg`, `jpeg`, `gif`, `bmp`, `tif`, `tiff`, `webp`.
    Image,
    /// `eml`.
    Email,
    /// `json`, `jsonl`, `ndjson`.
    Json,
    /// `csv`, `tsv`.
    Csv,
    /// `xml`, `nxml`, `tei`.
    Xml,
    /// `msg`.
    LegacyEmail,
}
30
31impl InputFormat {
32 pub fn detect_path(path: impl AsRef<Path>) -> Result<Self> {
33 let path = path.as_ref();
34 let extension = effective_extension(path).ok_or_else(|| DonglerError::UnknownFormat {
35 path: path.display().to_string(),
36 })?;
37
38 match extension.as_str() {
39 "txt" | "text" | "md" | "markdown" => Ok(Self::Text),
40 "pdf" => Ok(Self::Pdf),
41 "xlsx" => Ok(Self::Excel),
42 "xls" => Ok(Self::LegacyExcel),
43 "docx" => Ok(Self::Word),
44 "doc" => Ok(Self::LegacyWord),
45 "pptx" => Ok(Self::Presentation),
46 "ppt" => Ok(Self::LegacyPresentation),
47 "odt" | "ods" | "odp" => Ok(Self::OpenDocument),
48 "tar" | "tgz" | "gz" | "zip" => Ok(Self::Archive),
49 "html" | "htm" => Ok(Self::Html),
50 "png" | "jpg" | "jpeg" | "gif" | "bmp" | "tif" | "tiff" | "webp" => Ok(Self::Image),
51 "eml" => Ok(Self::Email),
52 "json" | "jsonl" | "ndjson" => Ok(Self::Json),
53 "csv" | "tsv" => Ok(Self::Csv),
54 "xml" | "nxml" | "tei" => Ok(Self::Xml),
55 "tex" | "latex" | "ltx" => Ok(Self::Text),
56 "msg" => Ok(Self::LegacyEmail),
57 _ => Err(DonglerError::UnknownFormat {
58 path: path.display().to_string(),
59 }),
60 }
61 }
62
63 pub fn as_str(self) -> &'static str {
64 match self {
65 Self::Text => "text",
66 Self::Pdf => "pdf",
67 Self::Excel | Self::LegacyExcel => "excel",
68 Self::Word | Self::LegacyWord => "word",
69 Self::Presentation | Self::LegacyPresentation => "presentation",
70 Self::OpenDocument => "opendocument",
71 Self::Archive => "archive",
72 Self::Html => "html",
73 Self::Image => "image",
74 Self::Email | Self::LegacyEmail => "email",
75 Self::Json => "json",
76 Self::Csv => "csv",
77 Self::Xml => "xml",
78 }
79 }
80
81 pub fn extraction_status(self) -> ExtractionStatus {
82 match self {
83 Self::Text
84 | Self::Pdf
85 | Self::Excel
86 | Self::Word
87 | Self::Presentation
88 | Self::OpenDocument
89 | Self::Archive
90 | Self::Html
91 | Self::Image
92 | Self::Email
93 | Self::Json
94 | Self::Csv
95 | Self::Xml => ExtractionStatus::Supported,
96 Self::LegacyExcel | Self::LegacyWord | Self::LegacyPresentation | Self::LegacyEmail => {
97 ExtractionStatus::Planned
98 }
99 }
100 }
101}
102
/// Returns the lowercased extension that should drive format detection for `path`.
///
/// For most paths this is simply the final extension. A trailing `gz` is looked
/// through when the extension underneath it is one of a fixed set of text-like
/// formats or `tar` (e.g. `data.json.gz` yields `json`, `backup.tar.gz` yields
/// `tar`); any other `*.gz` path yields `gz`.
///
/// Returns `None` when the path has no extension or the final extension is not
/// valid UTF-8.
fn effective_extension(path: &Path) -> Option<String> {
    let extension = path.extension()?.to_str()?.to_ascii_lowercase();
    // Anything other than a bare `gz` suffix (including `tgz`) is already final.
    if extension != "gz" {
        return Some(extension);
    }

    // Peel off `.gz` and inspect the extension underneath. The stem is examined
    // as an `OsStr` so that a stem containing non-UTF-8 bytes does not abort
    // detection: it simply falls back to `gz` unless the inner extension itself
    // is readable and recognized.
    let inner_extension = path
        .file_stem()
        .and_then(|stem| Path::new(stem).extension())
        .and_then(|inner| inner.to_str())
        .map(str::to_ascii_lowercase);

    match inner_extension.as_deref() {
        Some(
            "txt" | "text" | "md" | "markdown" | "json" | "jsonl" | "ndjson" | "csv" | "tsv"
            | "xml" | "nxml" | "tei" | "tex" | "latex" | "ltx" | "tar",
        ) => inner_extension,
        _ => Some(extension),
    }
}
128
129impl fmt::Display for InputFormat {
130 fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
131 formatter.write_str(self.as_str())
132 }
133}
134
/// Status value returned by `InputFormat::extraction_status`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ExtractionStatus {
    /// Reported for every `InputFormat` variant except the legacy ones.
    Supported,
    /// Reported for `LegacyExcel`, `LegacyWord`, `LegacyPresentation`, and
    /// `LegacyEmail`.
    // NOTE(review): presumably means extraction is not implemented yet — confirm.
    Planned,
}