1use std::collections::BTreeMap;
2
3use serde::Serialize;
4
/// A warning attached to extraction output (see `Extraction::warnings`).
///
/// Only free-form text is stored. The machine-readable [`WarningCode`] and
/// [`WarningCategory`] are not fields: [`OutputWarning::code`] and
/// [`OutputWarning::category`] recover them from `message` on every call.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct OutputWarning {
    /// Location the warning refers to, exactly as supplied by the caller.
    pub path: String,
    /// Human-readable description; also the only input to [`OutputWarning::code`].
    pub message: String,
}

/// Coarse grouping of warnings, derived from their [`WarningCode`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum WarningCategory {
    /// Assigned to [`WarningCode::MalformedXml`].
    Parser,
    /// Assigned to the workbook-sheet and shared-string warning codes.
    Data,
    /// Assigned to [`WarningCode::Custom`].
    Custom,
}

impl WarningCategory {
    /// Returns the lowercase label of this category: `"parser"`, `"data"` or `"custom"`.
    pub fn as_str(self) -> &'static str {
        match self {
            WarningCategory::Parser => "parser",
            WarningCategory::Data => "data",
            WarningCategory::Custom => "custom",
        }
    }
}

/// Identifier for each kind of warning the named constructors of
/// [`OutputWarning`] can produce, plus `Custom` for everything else.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum WarningCode {
    /// Produced by [`OutputWarning::malformed_xml`] (`W001`).
    MalformedXml,
    /// Produced by [`OutputWarning::ignored_workbook_sheet`] (`W002`).
    IgnoredWorkbookSheet,
    /// Produced by [`OutputWarning::shared_string_index_out_of_bounds`] (`W003`).
    SharedStringIndexOutOfBounds,
    /// Produced by [`OutputWarning::invalid_shared_string_index`] (`W004`).
    InvalidSharedStringIndex,
    /// Any message that matches none of the known shapes (`W999`).
    Custom,
}

impl WarningCode {
    /// Returns the short code string (`"W001"` … `"W004"`, or `"W999"` for `Custom`).
    pub fn as_str(self) -> &'static str {
        match self {
            WarningCode::MalformedXml => "W001",
            WarningCode::IgnoredWorkbookSheet => "W002",
            WarningCode::SharedStringIndexOutOfBounds => "W003",
            WarningCode::InvalidSharedStringIndex => "W004",
            WarningCode::Custom => "W999",
        }
    }
}

impl OutputWarning {
    // The message fragments below are shared by the constructors and by
    // `code()`, so the text that is produced and the text that is recognised
    // cannot drift apart.
    const MALFORMED_XML_PREFIX: &'static str = "stopped after malformed XML: ";
    const IGNORED_WORKBOOK_SHEET: &'static str =
        "ignored workbook sheet without name or relationship id";
    const OUT_OF_BOUNDS_PREFIX: &'static str = "shared string index ";
    const OUT_OF_BOUNDS_SUFFIX: &'static str = " is out of bounds";
    const INVALID_INDEX_PREFIX: &'static str = "invalid shared string index '";

    /// Creates a warning with an arbitrary message.
    ///
    /// The resulting code is [`WarningCode::Custom`] unless `message` happens
    /// to have the exact shape produced by one of the named constructors.
    pub fn new(path: impl Into<String>, message: impl Into<String>) -> Self {
        Self {
            path: path.into(),
            message: message.into(),
        }
    }

    /// Creates a `W001` warning whose message embeds `source` via its `Display` output.
    pub fn malformed_xml(path: impl Into<String>, source: impl std::fmt::Display) -> Self {
        Self::new(path, format!("{}{source}", Self::MALFORMED_XML_PREFIX))
    }

    /// Creates a `W002` warning with the fixed "ignored workbook sheet" message.
    pub fn ignored_workbook_sheet(path: impl Into<String>) -> Self {
        Self::new(path, Self::IGNORED_WORKBOOK_SHEET)
    }

    /// Creates a `W003` warning naming the out-of-range shared string `index`.
    pub fn shared_string_index_out_of_bounds(path: impl Into<String>, index: usize) -> Self {
        Self::new(
            path,
            format!(
                "{}{index}{}",
                Self::OUT_OF_BOUNDS_PREFIX,
                Self::OUT_OF_BOUNDS_SUFFIX
            ),
        )
    }

    /// Creates a `W004` warning quoting the shared string index text `value`.
    pub fn invalid_shared_string_index(path: impl Into<String>, value: impl Into<String>) -> Self {
        let value: String = value.into();
        Self::new(path, format!("{}{value}'", Self::INVALID_INDEX_PREFIX))
    }

    /// Returns the category that corresponds to [`OutputWarning::code`].
    pub fn category(&self) -> WarningCategory {
        match self.code() {
            WarningCode::MalformedXml => WarningCategory::Parser,
            WarningCode::IgnoredWorkbookSheet
            | WarningCode::SharedStringIndexOutOfBounds
            | WarningCode::InvalidSharedStringIndex => WarningCategory::Data,
            WarningCode::Custom => WarningCategory::Custom,
        }
    }

    /// Classifies the warning by inspecting the text of `message`.
    ///
    /// Messages built by the named constructors map back to their code;
    /// anything else is [`WarningCode::Custom`]. The out-of-bounds shape is
    /// only recognised when a decimal index sits between its prefix and
    /// suffix, which is what
    /// [`OutputWarning::shared_string_index_out_of_bounds`] always emits.
    pub fn code(&self) -> WarningCode {
        let message = self.message.as_str();

        if message.starts_with(Self::MALFORMED_XML_PREFIX) {
            WarningCode::MalformedXml
        } else if message == Self::IGNORED_WORKBOOK_SHEET {
            WarningCode::IgnoredWorkbookSheet
        } else if Self::is_out_of_bounds_message(message) {
            WarningCode::SharedStringIndexOutOfBounds
        } else if message.starts_with(Self::INVALID_INDEX_PREFIX) {
            WarningCode::InvalidSharedStringIndex
        } else {
            WarningCode::Custom
        }
    }

    /// Reports whether `message` is exactly prefix + decimal digits + suffix.
    ///
    /// Stripping the prefix before looking for the suffix matters: the prefix
    /// ends with a space and the suffix starts with one, so independent
    /// `starts_with`/`ends_with` checks would accept a message that contains
    /// no index at all by letting the two share that single space.
    fn is_out_of_bounds_message(message: &str) -> bool {
        let index = message
            .strip_prefix(Self::OUT_OF_BOUNDS_PREFIX)
            .and_then(|rest| rest.strip_suffix(Self::OUT_OF_BOUNDS_SUFFIX));

        match index {
            Some(digits) => !digits.is_empty() && digits.bytes().all(|b| b.is_ascii_digit()),
            None => false,
        }
    }
}
113
/// Document kinds distinguished by this module: DOCX, PPTX and XLSX, plus `Unknown`.
///
/// NOTE(review): `Unknown` presumably stands for anything not recognised as
/// one of the three formats — confirm against the detection code, which is
/// not part of this file.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DocumentType {
    Docx,
    Pptx,
    Xlsx,
    Unknown,
}
121
/// Describes one workbook sheet by index, name and visibility.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct XlsxSheet {
    /// Position of the sheet.
    /// NOTE(review): whether this is zero- or one-based is not visible in
    /// this file — confirm with the code that fills it in.
    pub index: usize,
    /// Name of the sheet.
    pub name: String,
    /// Visibility state of the sheet.
    pub visibility: XlsxSheetVisibility,
}
128
/// Visibility state of a workbook sheet.
///
/// NOTE(review): the labels returned by [`XlsxSheetVisibility::as_str`]
/// coincide with the OOXML sheet `state` attribute values — presumably on
/// purpose; confirm against the workbook parser.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum XlsxSheetVisibility {
    Visible,
    Hidden,
    VeryHidden,
}

impl XlsxSheetVisibility {
    /// Returns the camelCase label for this state: `"visible"`, `"hidden"` or `"veryHidden"`.
    pub fn as_str(self) -> &'static str {
        let label = match self {
            XlsxSheetVisibility::VeryHidden => "veryHidden",
            XlsxSheetVisibility::Hidden => "hidden",
            XlsxSheetVisibility::Visible => "visible",
        };
        label
    }
}
145
/// Serializable structured-text result: a document type label plus an ordered list of blocks.
///
/// The derived `Default` yields an empty `document_type` and no blocks.
#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize)]
pub struct StructuredText {
    /// Document type label, stored as free-form text.
    /// NOTE(review): the set of values used is decided by the producer, which
    /// is not in this file.
    pub document_type: String,
    /// Text blocks, in the order the producer pushed them.
    pub blocks: Vec<TextBlock>,
}
151
/// One serializable unit of extracted text, tagged with the part it came from.
#[derive(Debug, Clone, PartialEq, Eq, Serialize)]
pub struct TextBlock {
    /// Kind of part the text belongs to, as free-form text.
    pub part_type: String,
    /// Path of the part the text belongs to.
    pub part_path: String,
    /// Position of this block.
    /// NOTE(review): the numbering base and scope (per part or per document)
    /// are not visible in this file — confirm with the producer.
    pub ordinal: usize,
    /// The text content itself.
    pub text: String,
}
159
160impl TextBlock {
161 pub fn new(
162 part_type: impl Into<String>,
163 part_path: impl Into<String>,
164 ordinal: usize,
165 text: impl Into<String>,
166 ) -> Self {
167 Self {
168 part_type: part_type.into(),
169 part_path: part_path.into(),
170 ordinal,
171 text: text.into(),
172 }
173 }
174}
175
/// Selects between raw and formatted cell values; `Raw` is the default.
///
/// NOTE(review): what "formatted" applies (number formats, dates, …) is
/// decided by the consumer of this enum, which is not in this file.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub enum XlsxValueMode {
    #[default]
    Raw,
    Formatted,
}
182
/// An extracted value of type `T` together with the warnings collected while producing it.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Extraction<T> {
    /// The extracted payload.
    pub value: T,
    /// Warnings that accompany `value`; empty when nothing was reported.
    pub warnings: Vec<OutputWarning>,
}
188
189impl<T> Extraction<T> {
190 pub fn new(value: T) -> Self {
191 Self {
192 value,
193 warnings: Vec::new(),
194 }
195 }
196
197 pub fn with_warnings(value: T, warnings: Vec<OutputWarning>) -> Self {
198 Self { value, warnings }
199 }
200
201 pub fn map<U>(self, f: impl FnOnce(T) -> U) -> Extraction<U> {
202 Extraction {
203 value: f(self.value),
204 warnings: self.warnings,
205 }
206 }
207}
208
/// Options for turning a workbook sheet into CSV.
///
/// NOTE(review): how `sheet_name` and `sheet_index` interact when both are
/// set is decided by the consumer, which is not in this file.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct XlsxCsvOptions<'a> {
    /// Sheet selected by name, if any.
    pub sheet_name: Option<&'a str>,
    /// Sheet selected by index, if any.
    pub sheet_index: Option<usize>,
    /// Flag concerning hidden sheets; `false` by default.
    pub include_hidden: bool,
    /// Field delimiter as a single byte; a comma by default.
    pub delimiter: u8,
}

impl Default for XlsxCsvOptions<'_> {
    /// No sheet selection, hidden sheets not included, comma as the delimiter.
    fn default() -> Self {
        let delimiter = b',';
        let include_hidden = false;

        XlsxCsvOptions {
            delimiter,
            include_hidden,
            sheet_index: None,
            sheet_name: None,
        }
    }
}
227
/// Serializable document metadata.
///
/// `file` and `has_macros` are always serialized. Every `Option` field is
/// left out of the serialized output when it is `None`.
///
/// NOTE(review): the field descriptions below are read off the field names;
/// the code that populates them is not in this file.
#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize)]
pub struct DocumentInfo {
    /// The file this metadata describes.
    pub file: String,
    /// Author, when available.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub author: Option<String>,
    /// Last person to modify the document, when available.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub last_modified_by: Option<String>,
    /// Creation timestamp kept as text, when available.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub created_at: Option<String>,
    /// Modification timestamp kept as text, when available.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub modified_at: Option<String>,
    /// Application name, when available.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub application: Option<String>,
    /// Company name, when available.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub company: Option<String>,
    /// Custom properties as name/value pairs; the `BTreeMap` keeps them sorted by name.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub custom_properties: Option<BTreeMap<String, String>>,
    /// Macro flag; `false` in the derived `Default`.
    pub has_macros: bool,
    /// Word count, when available.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub word_count: Option<u64>,
    /// Page count, when available.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub page_count: Option<u64>,
    /// Slide count, when available.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub slide_count: Option<u64>,
    /// Worksheet count, when available.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub worksheet_count: Option<u64>,
    /// Revision kept as text, when available.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub revision: Option<String>,
}
257
/// Serializable audit report: the file, its type label, its metadata and a list of signals.
///
/// The derived `Default` yields empty strings, default metadata and no signals.
#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize)]
pub struct DocumentAudit {
    /// The file this report describes.
    pub file: String,
    /// Document type label, stored as free-form text.
    pub document_type: String,
    /// Metadata reported alongside the signals.
    pub metadata: DocumentInfo,
    /// Signals raised for the document, in the order the producer pushed them.
    pub signals: Vec<AuditSignal>,
}
265
/// One serializable audit finding. All four fields are free-form text.
///
/// NOTE(review): the vocabularies used for `kind` and `severity` are decided
/// by the producer, which is not in this file.
#[derive(Debug, Clone, PartialEq, Eq, Serialize)]
pub struct AuditSignal {
    /// What sort of finding this is.
    pub kind: String,
    /// How serious the finding is.
    pub severity: String,
    /// Location the finding refers to.
    pub path: String,
    /// Human-readable description of the finding.
    pub message: String,
}
273
274impl AuditSignal {
275 pub fn new(
276 kind: impl Into<String>,
277 severity: impl Into<String>,
278 path: impl Into<String>,
279 message: impl Into<String>,
280 ) -> Self {
281 Self {
282 kind: kind.into(),
283 severity: severity.into(),
284 path: path.into(),
285 message: message.into(),
286 }
287 }
288}
289
#[cfg(test)]
mod tests {
    use super::{Extraction, OutputWarning, WarningCategory, WarningCode, XlsxCsvOptions};

    /// Mapping an extraction transforms the value and leaves the warnings untouched.
    #[test]
    fn builds_and_maps_extractions() {
        let expected_warning = OutputWarning::malformed_xml("word/document.xml", "parse error");
        let attached = vec![expected_warning.clone()];
        let original = Extraction::with_warnings("hello".to_owned(), attached);

        let Extraction {
            value: length,
            warnings: carried,
        } = original.map(|text| text.len());

        assert_eq!(length, 5);
        assert_eq!(carried, vec![expected_warning]);
        assert!(Extraction::new(()).warnings.is_empty());
    }

    /// Each named constructor maps back to its own code, category and label.
    #[test]
    fn classifies_warning_codes_and_categories() {
        let classified = [
            (
                OutputWarning::malformed_xml("word/document.xml", "parse error"),
                WarningCode::MalformedXml,
            ),
            (
                OutputWarning::ignored_workbook_sheet("xl/workbook.xml"),
                WarningCode::IgnoredWorkbookSheet,
            ),
            (
                OutputWarning::shared_string_index_out_of_bounds("xl/sheet.xml", 7),
                WarningCode::SharedStringIndexOutOfBounds,
            ),
            (
                OutputWarning::invalid_shared_string_index("xl/sheet.xml", "abc"),
                WarningCode::InvalidSharedStringIndex,
            ),
        ];
        for (warning, expected_code) in classified.iter() {
            assert_eq!(warning.code(), *expected_code);
        }
        assert_eq!(classified[0].0.category(), WarningCategory::Parser);
        assert_eq!(classified[1].0.category(), WarningCategory::Data);

        let category_labels = [
            (WarningCategory::Parser, "parser"),
            (WarningCategory::Data, "data"),
        ];
        for (category, label) in category_labels.iter() {
            assert_eq!(category.as_str(), *label);
        }

        let code_labels = [
            (WarningCode::MalformedXml, "W001"),
            (WarningCode::IgnoredWorkbookSheet, "W002"),
            (WarningCode::SharedStringIndexOutOfBounds, "W003"),
            (WarningCode::InvalidSharedStringIndex, "W004"),
        ];
        for (code, label) in code_labels.iter() {
            assert_eq!(code.as_str(), *label);
        }
    }

    /// A message with no recognised shape falls back to the custom code and category.
    #[test]
    fn classifies_unknown_warnings_as_custom() {
        let unrecognised = OutputWarning::new("custom.xml", "partial extraction");
        let category = unrecognised.category();
        let code = unrecognised.code();

        assert_eq!(category, WarningCategory::Custom);
        assert_eq!(code, WarningCode::Custom);
        assert_eq!(category.as_str(), "custom");
        assert_eq!(code.as_str(), "W999");
    }

    /// Defaults: no sheet selection, hidden sheets excluded, comma delimiter.
    #[test]
    fn defaults_xlsx_csv_options() {
        let XlsxCsvOptions {
            sheet_name,
            sheet_index,
            include_hidden,
            delimiter,
        } = XlsxCsvOptions::default();

        assert_eq!(sheet_name, None);
        assert_eq!(sheet_index, None);
        assert!(!include_hidden);
        assert_eq!(delimiter, b',');
    }
}