1use std::collections::BTreeMap;
2
3use serde::Serialize;
4
/// A warning record: the path it refers to plus a human-readable message.
///
/// Both fields are public, so a warning can be built with a struct literal
/// as well as through the constructor functions on this type.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct OutputWarning {
    /// Path the warning is attached to (the tests in this file use part
    /// names such as `word/document.xml`).
    pub path: String,
    /// Warning text. [`OutputWarning::code`] classifies a warning purely by
    /// inspecting this string.
    pub message: String,
}
10
/// Coarse grouping assigned to a warning by `OutputWarning::category`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum WarningCategory {
    /// Rendered as `"parser"`.
    Parser,
    /// Rendered as `"data"`.
    Data,
    /// Rendered as `"custom"`.
    Custom,
}

impl WarningCategory {
    /// Returns the fixed lowercase label for this category.
    pub fn as_str(self) -> &'static str {
        match self {
            Self::Parser => "parser",
            Self::Data => "data",
            Self::Custom => "custom",
        }
    }
}
27
/// Identifier for a kind of warning, rendered as a short `W###` string.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum WarningCode {
    /// Rendered as `"W001"`.
    MalformedXml,
    /// Rendered as `"W002"`.
    IgnoredWorkbookSheet,
    /// Rendered as `"W003"`.
    SharedStringIndexOutOfBounds,
    /// Rendered as `"W004"`.
    InvalidSharedStringIndex,
    /// Rendered as `"W999"`.
    Custom,
}

impl WarningCode {
    /// Returns the fixed textual code for this variant.
    pub fn as_str(self) -> &'static str {
        match self {
            Self::MalformedXml => "W001",
            Self::IgnoredWorkbookSheet => "W002",
            Self::SharedStringIndexOutOfBounds => "W003",
            Self::InvalidSharedStringIndex => "W004",
            Self::Custom => "W999",
        }
    }
}
48
49impl OutputWarning {
50 pub fn new(path: impl Into<String>, message: impl Into<String>) -> Self {
51 Self {
52 path: path.into(),
53 message: message.into(),
54 }
55 }
56
57 pub fn malformed_xml(path: impl Into<String>, source: impl std::fmt::Display) -> Self {
58 Self::new(path, format!("stopped after malformed XML: {source}"))
59 }
60
61 pub fn ignored_workbook_sheet(path: impl Into<String>) -> Self {
62 Self::new(
63 path,
64 "ignored workbook sheet without name or relationship id",
65 )
66 }
67
68 pub fn shared_string_index_out_of_bounds(path: impl Into<String>, index: usize) -> Self {
69 Self::new(
70 path,
71 format!("shared string index {index} is out of bounds"),
72 )
73 }
74
75 pub fn invalid_shared_string_index(path: impl Into<String>, value: impl Into<String>) -> Self {
76 Self::new(
77 path,
78 format!("invalid shared string index '{}'", value.into()),
79 )
80 }
81
82 pub fn category(&self) -> WarningCategory {
83 match self.code() {
84 WarningCode::MalformedXml => WarningCategory::Parser,
85 WarningCode::IgnoredWorkbookSheet
86 | WarningCode::SharedStringIndexOutOfBounds
87 | WarningCode::InvalidSharedStringIndex => WarningCategory::Data,
88 WarningCode::Custom => WarningCategory::Custom,
89 }
90 }
91
92 pub fn code(&self) -> WarningCode {
93 match self.message.as_str() {
94 message if message.starts_with("stopped after malformed XML: ") => {
95 WarningCode::MalformedXml
96 }
97 "ignored workbook sheet without name or relationship id" => {
98 WarningCode::IgnoredWorkbookSheet
99 }
100 message
101 if message.starts_with("shared string index ")
102 && message.ends_with(" is out of bounds") =>
103 {
104 WarningCode::SharedStringIndexOutOfBounds
105 }
106 message if message.starts_with("invalid shared string index '") => {
107 WarningCode::InvalidSharedStringIndex
108 }
109 _ => WarningCode::Custom,
110 }
111 }
112}
113
/// Kind of document being handled.
///
/// How a value is chosen is not shown in this part of the file; only the
/// set of variants is defined here.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DocumentType {
    /// The `Docx` kind.
    Docx,
    /// The `Pptx` kind.
    Pptx,
    /// The `Xlsx` kind.
    Xlsx,
    /// Fallback variant for anything that is none of the above.
    Unknown,
}
121
/// A worksheet entry described by a numeric index and a name.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct XlsxSheet {
    /// Numeric index of the sheet.
    // NOTE(review): whether this is zero- or one-based is not visible here — confirm at the producer.
    pub index: usize,
    /// Name of the sheet.
    pub name: String,
}
127
128#[derive(Debug, Clone, PartialEq, Eq)]
129pub struct Extraction<T> {
130 pub value: T,
131 pub warnings: Vec<OutputWarning>,
132}
133
134impl<T> Extraction<T> {
135 pub fn new(value: T) -> Self {
136 Self {
137 value,
138 warnings: Vec::new(),
139 }
140 }
141
142 pub fn with_warnings(value: T, warnings: Vec<OutputWarning>) -> Self {
143 Self { value, warnings }
144 }
145
146 pub fn map<U>(self, f: impl FnOnce(T) -> U) -> Extraction<U> {
147 Extraction {
148 value: f(self.value),
149 warnings: self.warnings,
150 }
151 }
152}
153
/// Options bundle for XLSX-to-CSV conversion.
///
/// The code that consumes these options is not in this part of the file, so
/// the field notes below describe only what the types and defaults show.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct XlsxCsvOptions<'a> {
    /// Optional borrowed sheet name; `None` by default.
    // NOTE(review): presumably selects the sheet by name — confirm against the consumer.
    pub sheet_name: Option<&'a str>,
    /// Optional sheet index; `None` by default.
    // NOTE(review): precedence relative to `sheet_name` is not visible here.
    pub sheet_index: Option<usize>,
    /// Delimiter byte; `b','` by default.
    pub delimiter: u8,
}

impl Default for XlsxCsvOptions<'_> {
    /// No sheet name, no sheet index, comma delimiter.
    fn default() -> Self {
        const COMMA: u8 = b',';
        XlsxCsvOptions {
            delimiter: COMMA,
            sheet_index: None,
            sheet_name: None,
        }
    }
}
170
171#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize)]
172pub struct DocumentInfo {
173 pub file: String,
174 #[serde(skip_serializing_if = "Option::is_none")]
175 pub author: Option<String>,
176 #[serde(skip_serializing_if = "Option::is_none")]
177 pub last_modified_by: Option<String>,
178 #[serde(skip_serializing_if = "Option::is_none")]
179 pub created_at: Option<String>,
180 #[serde(skip_serializing_if = "Option::is_none")]
181 pub modified_at: Option<String>,
182 #[serde(skip_serializing_if = "Option::is_none")]
183 pub application: Option<String>,
184 #[serde(skip_serializing_if = "Option::is_none")]
185 pub company: Option<String>,
186 #[serde(skip_serializing_if = "Option::is_none")]
187 pub custom_properties: Option<BTreeMap<String, String>>,
188 pub has_macros: bool,
189 #[serde(skip_serializing_if = "Option::is_none")]
190 pub word_count: Option<u64>,
191 #[serde(skip_serializing_if = "Option::is_none")]
192 pub page_count: Option<u64>,
193 #[serde(skip_serializing_if = "Option::is_none")]
194 pub slide_count: Option<u64>,
195 #[serde(skip_serializing_if = "Option::is_none")]
196 pub worksheet_count: Option<u64>,
197 #[serde(skip_serializing_if = "Option::is_none")]
198 pub revision: Option<String>,
199}
200
#[cfg(test)]
mod tests {
    use super::{Extraction, OutputWarning, WarningCategory, WarningCode, XlsxCsvOptions};

    /// `Extraction::map` transforms the value and keeps the warnings as-is;
    /// `Extraction::new` starts with no warnings.
    #[test]
    fn builds_and_maps_extractions() {
        let warning = OutputWarning::malformed_xml("word/document.xml", "parse error");
        let extraction = Extraction::with_warnings("hello".to_owned(), vec![warning.clone()]);

        let mapped = extraction.map(|value| value.len());

        assert_eq!(mapped.value, 5);
        assert_eq!(mapped.warnings, vec![warning]);
        assert!(Extraction::new(()).warnings.is_empty());
    }

    /// Each constructor yields a warning that classifies back to its own
    /// code and category, and the codes/categories render as expected.
    #[test]
    fn classifies_warning_codes_and_categories() {
        let malformed = OutputWarning::malformed_xml("word/document.xml", "parse error");
        let sheet = OutputWarning::ignored_workbook_sheet("xl/workbook.xml");
        let shared = OutputWarning::shared_string_index_out_of_bounds("xl/sheet.xml", 7);
        let invalid = OutputWarning::invalid_shared_string_index("xl/sheet.xml", "abc");

        assert_eq!(malformed.category(), WarningCategory::Parser);
        assert_eq!(malformed.code(), WarningCode::MalformedXml);
        assert_eq!(sheet.category(), WarningCategory::Data);
        assert_eq!(sheet.code(), WarningCode::IgnoredWorkbookSheet);
        // Every Data-category code is checked, not just the first one.
        assert_eq!(shared.category(), WarningCategory::Data);
        assert_eq!(shared.code(), WarningCode::SharedStringIndexOutOfBounds);
        assert_eq!(invalid.category(), WarningCategory::Data);
        assert_eq!(invalid.code(), WarningCode::InvalidSharedStringIndex);
        assert_eq!(WarningCategory::Parser.as_str(), "parser");
        assert_eq!(WarningCategory::Data.as_str(), "data");
        assert_eq!(WarningCode::MalformedXml.as_str(), "W001");
        assert_eq!(WarningCode::IgnoredWorkbookSheet.as_str(), "W002");
        assert_eq!(WarningCode::SharedStringIndexOutOfBounds.as_str(), "W003");
        assert_eq!(WarningCode::InvalidSharedStringIndex.as_str(), "W004");
    }

    /// A message that matches no known shape falls back to the custom
    /// code and category.
    #[test]
    fn classifies_unknown_warnings_as_custom() {
        let warning = OutputWarning::new("custom.xml", "partial extraction");

        assert_eq!(warning.category(), WarningCategory::Custom);
        assert_eq!(warning.code(), WarningCode::Custom);
        assert_eq!(warning.category().as_str(), "custom");
        assert_eq!(warning.code().as_str(), "W999");
    }

    /// Default options select no sheet and use a comma delimiter.
    #[test]
    fn defaults_xlsx_csv_options() {
        let options = XlsxCsvOptions::default();

        assert_eq!(options.sheet_name, None);
        assert_eq!(options.sheet_index, None);
        assert_eq!(options.delimiter, b',');
    }
}