1use std::fs;
2use std::io::Read;
3use std::path::Path;
4
5use flate2::read::GzDecoder;
6
7use crate::error::Result;
8use crate::format::InputFormat;
9
/// An input document held in memory together with its format label and origin.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Source {
    /// Textual form of the input. Depending on the constructor this is the text
    /// passed in directly, file contents read as UTF-8, gunzipped text, or a
    /// lossy UTF-8 decoding of raw bytes.
    pub content: String,
    /// Bytes kept alongside `content`, when the constructor stored any
    /// (`None` for sources built with `from_text`).
    pub bytes: Option<Vec<u8>>,
    /// Format label; the constructors fill it from `InputFormat::as_str()` or
    /// from a caller-supplied string.
    pub format: String,
    /// Path or name the input was loaded from, if one was given.
    pub path: Option<String>,
}
17
18impl Source {
19 pub fn from_text(text: impl Into<String>) -> Self {
20 Self {
21 content: text.into(),
22 bytes: None,
23 format: InputFormat::Text.as_str().to_owned(),
24 path: None,
25 }
26 }
27
28 pub fn from_bytes(bytes: Vec<u8>, format: impl Into<String>) -> Self {
34 Self {
35 content: String::from_utf8_lossy(&bytes).into_owned(),
36 bytes: Some(bytes),
37 format: format.into(),
38 path: None,
39 }
40 }
41
42 pub fn from_bytes_for_format(bytes: &[u8], name: &str, format: InputFormat) -> Result<Self> {
46 if is_gzip_path(Path::new(name)) {
47 let mut decoder = GzDecoder::new(bytes);
48 let mut content = String::new();
49 decoder.read_to_string(&mut content)?;
50 return Ok(Self {
51 bytes: Some(bytes.to_vec()),
52 content,
53 format: format.as_str().to_owned(),
54 path: Some(name.to_owned()),
55 });
56 }
57
58 let is_text = matches!(
59 format,
60 InputFormat::Text
61 | InputFormat::Html
62 | InputFormat::Email
63 | InputFormat::Json
64 | InputFormat::Csv
65 | InputFormat::Xml
66 );
67 let content = String::from_utf8_lossy(bytes).into_owned();
68 let stored = if is_text {
69 content.as_bytes().to_vec()
70 } else {
71 bytes.to_vec()
72 };
73 Ok(Self {
74 content,
75 bytes: Some(stored),
76 format: format.as_str().to_owned(),
77 path: Some(name.to_owned()),
78 })
79 }
80
81 pub fn from_path(path: impl AsRef<Path>, format: impl Into<String>) -> Result<Self> {
82 let path = path.as_ref();
83
84 let content = fs::read_to_string(path)?;
85
86 Ok(Self {
87 bytes: Some(content.as_bytes().to_vec()),
88 content,
89 format: format.into(),
90 path: Some(path.display().to_string()),
91 })
92 }
93
94 pub fn from_text_or_gzip_path(
95 path: impl AsRef<Path>,
96 format: impl Into<String>,
97 ) -> Result<Self> {
98 let path = path.as_ref();
99 if !is_gzip_path(path) {
100 return Self::from_path(path, format);
101 }
102
103 let bytes = fs::read(path)?;
104 let mut decoder = GzDecoder::new(bytes.as_slice());
105 let mut content = String::new();
106 decoder.read_to_string(&mut content)?;
107
108 Ok(Self {
109 bytes: Some(bytes),
110 content,
111 format: format.into(),
112 path: Some(path.display().to_string()),
113 })
114 }
115
116 pub fn from_pdf_path(path: impl AsRef<Path>) -> Result<Self> {
117 Self::from_binary_path(path, InputFormat::Pdf.as_str())
118 }
119
120 pub fn from_binary_path(path: impl AsRef<Path>, format: impl Into<String>) -> Result<Self> {
121 let path = path.as_ref();
122 let bytes = fs::read(path)?;
123
124 Ok(Self {
125 content: String::from_utf8_lossy(&bytes).into_owned(),
126 bytes: Some(bytes),
127 format: format.into(),
128 path: Some(path.display().to_string()),
129 })
130 }
131}
132
/// Strategy for turning a filesystem path into a [`Source`].
pub trait SourceLoader {
    /// Loads the input found at `path`, returning the resulting [`Source`] or
    /// the error the implementation encountered.
    fn load(&self, path: &Path) -> Result<Source>;
}
136
/// Loader that labels its input as `InputFormat::Text`.
#[derive(Debug, Default, Clone, Copy)]
pub struct TextSourceLoader;

impl SourceLoader for TextSourceLoader {
    /// Delegates to `Source::from_text_or_gzip_path`, so files whose path ends
    /// in `.gz` are decompressed before being used as text.
    fn load(&self, path: &Path) -> Result<Source> {
        Source::from_text_or_gzip_path(path, InputFormat::Text.as_str())
    }
}
145
/// Loader for PDF inputs.
#[derive(Debug, Default, Clone, Copy)]
pub struct PdfSourceLoader;

impl SourceLoader for PdfSourceLoader {
    /// Delegates to `Source::from_pdf_path`, which reads the file as raw bytes
    /// labelled `InputFormat::Pdf`.
    fn load(&self, path: &Path) -> Result<Source> {
        Source::from_pdf_path(path)
    }
}
154
/// Loader for image inputs.
#[derive(Debug, Default, Clone, Copy)]
pub struct ImageSourceLoader;

impl SourceLoader for ImageSourceLoader {
    /// Reads the file as raw bytes via `Source::from_binary_path`, labelled
    /// `InputFormat::Image`.
    fn load(&self, path: &Path) -> Result<Source> {
        Source::from_binary_path(path, InputFormat::Image.as_str())
    }
}
163
/// Loader whose behaviour is selected by an [`InputFormat`] chosen at construction.
#[derive(Debug, Clone, Copy)]
pub struct FormatSourceLoader {
    /// Format used both to pick the loading strategy and to label the result.
    format: InputFormat,
}

impl FormatSourceLoader {
    /// Creates a loader bound to `format`.
    pub fn new(format: InputFormat) -> Self {
        Self { format }
    }
}
174
175impl SourceLoader for FormatSourceLoader {
176 fn load(&self, path: &Path) -> Result<Source> {
177 match self.format {
178 InputFormat::Text
179 | InputFormat::Html
180 | InputFormat::Email
181 | InputFormat::Json
182 | InputFormat::Csv
183 | InputFormat::Xml => Source::from_text_or_gzip_path(path, self.format.as_str()),
184 InputFormat::Pdf
185 | InputFormat::Image
186 | InputFormat::Word
187 | InputFormat::Excel
188 | InputFormat::Presentation
189 | InputFormat::OpenDocument
190 | InputFormat::Archive => Source::from_binary_path(path, self.format.as_str()),
191 InputFormat::LegacyWord
192 | InputFormat::LegacyExcel
193 | InputFormat::LegacyPresentation
194 | InputFormat::LegacyEmail => Source::from_binary_path(path, self.format.as_str()),
195 }
196 }
197}
198
/// Reports whether `path` has a `gz` extension, compared ASCII case-insensitively.
///
/// Paths with no extension, or with an extension that is not valid UTF-8,
/// yield `false`.
fn is_gzip_path(path: &Path) -> bool {
    let extension = path.extension().and_then(|raw| raw.to_str());
    matches!(extension, Some(ext) if ext.eq_ignore_ascii_case("gz"))
}