Skip to main content

dongler_core/
source.rs

1use std::fs;
2use std::io::Read;
3use std::path::Path;
4
5use flate2::read::GzDecoder;
6
7use crate::error::Result;
8use crate::format::InputFormat;
9
/// A loaded input document: its text view plus, when available, the raw
/// bytes it was built from and the place it came from.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Source {
    /// Text view of the input.
    pub content: String,
    /// Raw bytes backing the input, if any were retained.
    pub bytes: Option<Vec<u8>>,
    /// Label of the input format.
    pub format: String,
    /// Path or file name the input originated from, if known.
    pub path: Option<String>,
}
17
18impl Source {
19    pub fn from_text(text: impl Into<String>) -> Self {
20        Self {
21            content: text.into(),
22            bytes: None,
23            format: InputFormat::Text.as_str().to_owned(),
24            path: None,
25        }
26    }
27
28    /// Build a source from in-memory bytes without touching the filesystem.
29    ///
30    /// Binary formats keep their raw bytes and expose a lossy UTF-8 view as
31    /// `content`; this is the entry point used by the wasm bindings, where no
32    /// path-based loader is available.
33    pub fn from_bytes(bytes: Vec<u8>, format: impl Into<String>) -> Self {
34        Self {
35            content: String::from_utf8_lossy(&bytes).into_owned(),
36            bytes: Some(bytes),
37            format: format.into(),
38            path: None,
39        }
40    }
41
42    /// Build a source from in-memory bytes, mirroring how the path-based
43    /// loaders decode each format. `name` is the original file name (used for
44    /// gzip and markdown/latex detection) but is never read from disk.
45    pub fn from_bytes_for_format(bytes: &[u8], name: &str, format: InputFormat) -> Result<Self> {
46        if is_gzip_path(Path::new(name)) {
47            let mut decoder = GzDecoder::new(bytes);
48            let mut content = String::new();
49            decoder.read_to_string(&mut content)?;
50            return Ok(Self {
51                bytes: Some(bytes.to_vec()),
52                content,
53                format: format.as_str().to_owned(),
54                path: Some(name.to_owned()),
55            });
56        }
57
58        let is_text = matches!(
59            format,
60            InputFormat::Text
61                | InputFormat::Html
62                | InputFormat::Email
63                | InputFormat::Json
64                | InputFormat::Csv
65                | InputFormat::Xml
66        );
67        let content = String::from_utf8_lossy(bytes).into_owned();
68        let stored = if is_text {
69            content.as_bytes().to_vec()
70        } else {
71            bytes.to_vec()
72        };
73        Ok(Self {
74            content,
75            bytes: Some(stored),
76            format: format.as_str().to_owned(),
77            path: Some(name.to_owned()),
78        })
79    }
80
81    pub fn from_path(path: impl AsRef<Path>, format: impl Into<String>) -> Result<Self> {
82        let path = path.as_ref();
83
84        let content = fs::read_to_string(path)?;
85
86        Ok(Self {
87            bytes: Some(content.as_bytes().to_vec()),
88            content,
89            format: format.into(),
90            path: Some(path.display().to_string()),
91        })
92    }
93
94    pub fn from_text_or_gzip_path(
95        path: impl AsRef<Path>,
96        format: impl Into<String>,
97    ) -> Result<Self> {
98        let path = path.as_ref();
99        if !is_gzip_path(path) {
100            return Self::from_path(path, format);
101        }
102
103        let bytes = fs::read(path)?;
104        let mut decoder = GzDecoder::new(bytes.as_slice());
105        let mut content = String::new();
106        decoder.read_to_string(&mut content)?;
107
108        Ok(Self {
109            bytes: Some(bytes),
110            content,
111            format: format.into(),
112            path: Some(path.display().to_string()),
113        })
114    }
115
116    pub fn from_pdf_path(path: impl AsRef<Path>) -> Result<Self> {
117        Self::from_binary_path(path, InputFormat::Pdf.as_str())
118    }
119
120    pub fn from_binary_path(path: impl AsRef<Path>, format: impl Into<String>) -> Result<Self> {
121        let path = path.as_ref();
122        let bytes = fs::read(path)?;
123
124        Ok(Self {
125            content: String::from_utf8_lossy(&bytes).into_owned(),
126            bytes: Some(bytes),
127            format: format.into(),
128            path: Some(path.display().to_string()),
129        })
130    }
131}
132
133pub trait SourceLoader {
134    fn load(&self, path: &Path) -> Result<Source>;
135}
136
137#[derive(Debug, Default, Clone, Copy)]
138pub struct TextSourceLoader;
139
140impl SourceLoader for TextSourceLoader {
141    fn load(&self, path: &Path) -> Result<Source> {
142        Source::from_text_or_gzip_path(path, InputFormat::Text.as_str())
143    }
144}
145
146#[derive(Debug, Default, Clone, Copy)]
147pub struct PdfSourceLoader;
148
149impl SourceLoader for PdfSourceLoader {
150    fn load(&self, path: &Path) -> Result<Source> {
151        Source::from_pdf_path(path)
152    }
153}
154
155#[derive(Debug, Default, Clone, Copy)]
156pub struct ImageSourceLoader;
157
158impl SourceLoader for ImageSourceLoader {
159    fn load(&self, path: &Path) -> Result<Source> {
160        Source::from_binary_path(path, InputFormat::Image.as_str())
161    }
162}
163
164#[derive(Debug, Clone, Copy)]
165pub struct FormatSourceLoader {
166    format: InputFormat,
167}
168
169impl FormatSourceLoader {
170    pub fn new(format: InputFormat) -> Self {
171        Self { format }
172    }
173}
174
175impl SourceLoader for FormatSourceLoader {
176    fn load(&self, path: &Path) -> Result<Source> {
177        match self.format {
178            InputFormat::Text
179            | InputFormat::Html
180            | InputFormat::Email
181            | InputFormat::Json
182            | InputFormat::Csv
183            | InputFormat::Xml => Source::from_text_or_gzip_path(path, self.format.as_str()),
184            InputFormat::Pdf
185            | InputFormat::Image
186            | InputFormat::Word
187            | InputFormat::Excel
188            | InputFormat::Presentation
189            | InputFormat::OpenDocument
190            | InputFormat::Archive => Source::from_binary_path(path, self.format.as_str()),
191            InputFormat::LegacyWord
192            | InputFormat::LegacyExcel
193            | InputFormat::LegacyPresentation
194            | InputFormat::LegacyEmail => Source::from_binary_path(path, self.format.as_str()),
195        }
196    }
197}
198
/// Report whether `path` has a `gz` extension, compared ASCII
/// case-insensitively.
///
/// Paths without an extension, or whose extension is not valid UTF-8, are
/// not considered gzip.
fn is_gzip_path(path: &Path) -> bool {
    match path.extension().and_then(|ext| ext.to_str()) {
        Some(ext) => ext.eq_ignore_ascii_case("gz"),
        None => false,
    }
}