// dongler_core/format.rs

1use std::fmt;
2use std::path::Path;
3
4use crate::error::{DonglerError, Result};
5
/// File formats Dongler can identify from a path.
///
/// File formats are modelled explicitly so loaders and engines can be added
/// without changing the public format names returned by `detect_format`.
///
/// `Hash` is derived alongside `Eq` so a format can be used as a key in a
/// `HashMap` or `HashSet` (for example, a per-format lookup table).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum InputFormat {
    /// Plain text, Markdown, and LaTeX sources (`txt`, `text`, `md`,
    /// `markdown`, `tex`, `latex`, `ltx`).
    Text,
    /// PDF documents (`pdf`).
    Pdf,
    /// Excel workbooks (`xlsx`).
    Excel,
    /// Legacy Excel workbooks (`xls`).
    LegacyExcel,
    /// Word documents (`docx`).
    Word,
    /// Legacy Word documents (`doc`).
    LegacyWord,
    /// Presentations (`pptx`).
    Presentation,
    /// Legacy presentations (`ppt`).
    LegacyPresentation,
    /// OpenDocument files (`odt`, `ods`, `odp`).
    OpenDocument,
    /// Archives and otherwise unclassified gzip files (`tar`, `tgz`, `gz`, `zip`).
    Archive,
    /// HTML pages (`html`, `htm`).
    Html,
    /// Raster images (`png`, `jpg`, `jpeg`, `gif`, `bmp`, `tif`, `tiff`, `webp`).
    Image,
    /// Email messages (`eml`).
    Email,
    /// JSON documents and line-delimited JSON (`json`, `jsonl`, `ndjson`).
    Json,
    /// Delimited tables (`csv`, `tsv`).
    Csv,
    /// XML documents (`xml`, `nxml`, `tei`).
    Xml,
    /// Legacy email messages (`msg`).
    LegacyEmail,
}
30
31impl InputFormat {
32    pub fn detect_path(path: impl AsRef<Path>) -> Result<Self> {
33        let path = path.as_ref();
34        let extension = effective_extension(path).ok_or_else(|| DonglerError::UnknownFormat {
35            path: path.display().to_string(),
36        })?;
37
38        match extension.as_str() {
39            "txt" | "text" | "md" | "markdown" => Ok(Self::Text),
40            "pdf" => Ok(Self::Pdf),
41            "xlsx" => Ok(Self::Excel),
42            "xls" => Ok(Self::LegacyExcel),
43            "docx" => Ok(Self::Word),
44            "doc" => Ok(Self::LegacyWord),
45            "pptx" => Ok(Self::Presentation),
46            "ppt" => Ok(Self::LegacyPresentation),
47            "odt" | "ods" | "odp" => Ok(Self::OpenDocument),
48            "tar" | "tgz" | "gz" | "zip" => Ok(Self::Archive),
49            "html" | "htm" => Ok(Self::Html),
50            "png" | "jpg" | "jpeg" | "gif" | "bmp" | "tif" | "tiff" | "webp" => Ok(Self::Image),
51            "eml" => Ok(Self::Email),
52            "json" | "jsonl" | "ndjson" => Ok(Self::Json),
53            "csv" | "tsv" => Ok(Self::Csv),
54            "xml" | "nxml" | "tei" => Ok(Self::Xml),
55            "tex" | "latex" | "ltx" => Ok(Self::Text),
56            "msg" => Ok(Self::LegacyEmail),
57            _ => Err(DonglerError::UnknownFormat {
58                path: path.display().to_string(),
59            }),
60        }
61    }
62
63    pub fn as_str(self) -> &'static str {
64        match self {
65            Self::Text => "text",
66            Self::Pdf => "pdf",
67            Self::Excel | Self::LegacyExcel => "excel",
68            Self::Word | Self::LegacyWord => "word",
69            Self::Presentation | Self::LegacyPresentation => "presentation",
70            Self::OpenDocument => "opendocument",
71            Self::Archive => "archive",
72            Self::Html => "html",
73            Self::Image => "image",
74            Self::Email | Self::LegacyEmail => "email",
75            Self::Json => "json",
76            Self::Csv => "csv",
77            Self::Xml => "xml",
78        }
79    }
80
81    pub fn extraction_status(self) -> ExtractionStatus {
82        match self {
83            Self::Text
84            | Self::Pdf
85            | Self::Excel
86            | Self::Word
87            | Self::Presentation
88            | Self::OpenDocument
89            | Self::Archive
90            | Self::Html
91            | Self::Image
92            | Self::Email
93            | Self::Json
94            | Self::Csv
95            | Self::Xml => ExtractionStatus::Supported,
96            Self::LegacyExcel | Self::LegacyWord | Self::LegacyPresentation | Self::LegacyEmail => {
97                ExtractionStatus::Planned
98            }
99        }
100    }
101}
102
/// Resolves the lowercase extension that decides a path's format.
///
/// For most paths this is simply the final extension, ASCII-lowercased. A
/// final `gz` extension is treated as a compression wrapper: when the
/// extension underneath it is one of the text-like formats listed below (or
/// `tar`), that inner extension is returned instead, so `rows.csv.gz` resolves
/// to `csv` and `bundle.tar.gz` to `tar`. Any other `.gz` path resolves to
/// `gz`.
///
/// Returns `None` when the path has no extension or the final extension is
/// not valid UTF-8.
fn effective_extension(path: &Path) -> Option<String> {
    let extension = path.extension()?.to_str()?.to_ascii_lowercase();
    if extension != "gz" {
        return Some(extension);
    }

    // Read the inner extension straight from the `OsStr` stem. Converting the
    // whole stem to `&str` first would make a `.gz` path with a non-UTF-8 stem
    // resolve to `None` even though its extension is perfectly readable.
    let inner_extension = path
        .file_stem()
        .and_then(|stem| Path::new(stem).extension())
        .and_then(|inner| inner.to_str())
        .map(|inner| inner.to_ascii_lowercase());

    match inner_extension.as_deref() {
        Some(
            "txt" | "text" | "md" | "markdown" | "json" | "jsonl" | "ndjson" | "csv" | "tsv"
            | "xml" | "nxml" | "tei" | "tex" | "latex" | "ltx" | "tar",
        ) => inner_extension,
        // `extension` is known to be "gz" here; reuse it rather than allocating.
        _ => Some(extension),
    }
}
128
129impl fmt::Display for InputFormat {
130    fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
131        formatter.write_str(self.as_str())
132    }
133}
134
/// Extraction status of an input format, as reported by
/// `InputFormat::extraction_status`.
///
/// `Hash` is derived alongside `Eq` so a status can be used as a key in a
/// `HashMap` or `HashSet`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum ExtractionStatus {
    /// Reported for every non-legacy format.
    Supported,
    /// Reported for the legacy Excel, Word, presentation, and email formats;
    /// presumably extraction for these is not implemented yet.
    Planned,
}