anomalyx_normalize/parsers/
xlsx.rs1use crate::parser::{Confidence, FormatParser, MAGIC};
14use crate::table::TableBuilder;
15use ax_core::{AxError, Column, Value};
16use calamine::{open_workbook_auto_from_rs, Data, Reader};
17use std::collections::BTreeMap;
18use std::io::Cursor;
19
20#[derive(Debug, Default, Clone)]
21pub struct XlsxParser;
22
23fn contains_seq(haystack: &[u8], needle: &[u8]) -> bool {
25 haystack.windows(needle.len()).any(|w| w == needle)
26}
27
28fn data_to_value(cell: &Data) -> Value {
30 match cell {
31 Data::Int(i) => Value::Int(*i),
32 Data::Float(f) => {
33 if f.is_finite() {
34 Value::Float(*f)
35 } else {
36 Value::Null
37 }
38 }
39 Data::String(s) => Value::Str(s.clone()),
40 Data::Bool(b) => Value::Bool(*b),
41 Data::DateTime(dt) => Value::Float(dt.as_f64()),
43 Data::DateTimeIso(s) | Data::DurationIso(s) => Value::Str(s.clone()),
44 Data::Error(_) | Data::Empty => Value::Null,
45 }
46}
47
48fn header_name(cell: &Data, index: usize) -> String {
51 match cell {
52 Data::String(s) if !s.trim().is_empty() => s.trim().to_string(),
53 _ => format!("col{index}"),
54 }
55}
56
57impl XlsxParser {
58 fn err(&self, msg: impl std::fmt::Display) -> AxError {
59 AxError::Parse {
60 format: self.id().to_string(),
61 message: msg.to_string(),
62 }
63 }
64}
65
66impl FormatParser for XlsxParser {
67 fn id(&self) -> &'static str {
68 "xlsx"
69 }
70 fn extensions(&self) -> &'static [&'static str] {
71 &["xlsx", "xls", "xlsb", "ods"]
72 }
73 fn sniff(&self, bytes: &[u8]) -> Option<Confidence> {
74 if !bytes.starts_with(b"PK\x03\x04") {
75 return None; }
77 (contains_seq(bytes, b"xl/") || contains_seq(bytes, b"opendocument.spreadsheet"))
81 .then_some(MAGIC)
82 }
83 fn parse(&self, _source: &str, bytes: &[u8]) -> Result<Vec<Column>, AxError> {
84 let mut workbook =
85 open_workbook_auto_from_rs(Cursor::new(bytes.to_vec())).map_err(|e| self.err(e))?;
86 let sheet = workbook
87 .sheet_names()
88 .first()
89 .cloned()
90 .ok_or_else(|| self.err("workbook has no sheets"))?;
91 let range = workbook.worksheet_range(&sheet).map_err(|e| self.err(e))?;
92
93 let mut rows = range.rows();
94 let Some(header) = rows.next() else {
95 return Ok(Vec::new()); };
97 let names: Vec<String> = header
98 .iter()
99 .enumerate()
100 .map(|(i, cell)| header_name(cell, i))
101 .collect();
102
103 let mut builder = TableBuilder::new();
104 for row in rows {
105 let mut record: BTreeMap<String, Value> = BTreeMap::new();
106 for (name, cell) in names.iter().zip(row) {
107 record.insert(name.clone(), data_to_value(cell));
108 }
109 builder.push_row(record);
110 }
111 Ok(builder.finish())
112 }
113}
114
115#[cfg(test)]
116mod tests {
117 use super::*;
118 use calamine::{CellErrorType, ExcelDateTime, ExcelDateTimeType};
119 use rust_xlsxwriter::Workbook;
120
121 fn build_xlsx() -> Vec<u8> {
123 let mut wb = Workbook::new();
124 let ws = wb.add_worksheet();
125 for (c, h) in ["name", "score", "active"].iter().enumerate() {
126 ws.write(0, c as u16, *h).unwrap();
127 }
128 ws.write(1, 0, "alice").unwrap();
129 ws.write(1, 1, 95).unwrap();
130 ws.write(1, 2, true).unwrap();
131 ws.write(2, 0, "bob").unwrap();
132 ws.write(2, 1, 42.5).unwrap();
133 ws.write(2, 2, false).unwrap();
134 wb.save_to_buffer().unwrap()
135 }
136
137 fn col<'a>(cols: &'a [Column], name: &str) -> &'a Column {
138 cols.iter()
139 .find(|c| c.name == name)
140 .unwrap_or_else(|| panic!("missing column {name}"))
141 }
142
143 #[test]
144 fn roundtrip_first_sheet_to_records() {
145 let bytes = build_xlsx();
146 let cols = XlsxParser.parse("book.xlsx", &bytes).unwrap();
147 assert_eq!(
148 col(&cols, "name").cells,
149 vec![Value::Str("alice".into()), Value::Str("bob".into())]
150 );
151 let score = col(&cols, "score");
152 assert_eq!(score.numeric(), vec![95.0, 42.5]);
153 assert_eq!(
154 col(&cols, "active").cells,
155 vec![Value::Bool(true), Value::Bool(false)]
156 );
157 }
158
159 #[test]
160 fn data_to_value_units() {
161 assert_eq!(data_to_value(&Data::Int(7)), Value::Int(7));
162 assert_eq!(data_to_value(&Data::Float(1.5)), Value::Float(1.5));
163 assert_eq!(data_to_value(&Data::Float(f64::NAN)), Value::Null); assert_eq!(
165 data_to_value(&Data::String("x".into())),
166 Value::Str("x".into())
167 );
168 assert_eq!(data_to_value(&Data::Bool(true)), Value::Bool(true));
169 assert_eq!(data_to_value(&Data::Empty), Value::Null);
170 assert_eq!(
171 data_to_value(&Data::Error(CellErrorType::Div0)),
172 Value::Null
173 );
174 assert_eq!(
175 data_to_value(&Data::DateTimeIso("2021-01-01".into())),
176 Value::Str("2021-01-01".into())
177 );
178 let dt = ExcelDateTime::new(44197.0, ExcelDateTimeType::DateTime, false);
180 assert_eq!(data_to_value(&Data::DateTime(dt)), Value::Float(44197.0));
181 }
182
183 #[test]
184 fn header_name_units() {
185 assert_eq!(header_name(&Data::String("score".into()), 1), "score");
186 assert_eq!(header_name(&Data::String(" ".into()), 1), "col1"); assert_eq!(header_name(&Data::Empty, 2), "col2");
188 assert_eq!(header_name(&Data::Int(5), 0), "col0"); }
190
191 #[test]
192 fn malformed_input_errors() {
193 assert!(matches!(
194 XlsxParser.parse("book.xlsx", b"not a spreadsheet"),
195 Err(AxError::Parse { .. })
196 ));
197 assert!(matches!(
199 XlsxParser.parse("book.xlsx", b"PK\x03\x04 garbage"),
200 Err(AxError::Parse { .. })
201 ));
202 }
203
204 #[test]
205 fn sniff_keys_on_zip_plus_spreadsheet_marker() {
206 assert_eq!(XlsxParser.sniff(&build_xlsx()), Some(MAGIC));
207 assert_eq!(XlsxParser.sniff(b"PK\x03\x04....word/document.xml"), None);
209 assert_eq!(XlsxParser.sniff(b"not a zip"), None);
210 assert_eq!(XlsxParser.sniff(b"PK"), None); }
212
213 #[test]
214 fn contains_seq_units() {
215 assert!(contains_seq(b"hello xl/ world", b"xl/"));
216 assert!(!contains_seq(b"hello world", b"xl/"));
217 assert!(!contains_seq(b"ab", b"abc")); }
219
220 #[test]
221 fn claims_spreadsheet_extensions() {
222 assert_eq!(XlsxParser.extensions(), &["xlsx", "xls", "xlsb", "ods"]);
223 }
224
225 #[test]
226 fn resolves_by_extension_and_magic() {
227 let reg = crate::parser::ParserRegistry::default();
228 assert_eq!(reg.resolve("book.xlsx", b"zz").unwrap().id(), "xlsx");
229 assert_eq!(reg.resolve("sheet.ods", b"zz").unwrap().id(), "xlsx");
230 assert_eq!(reg.resolve("-", &build_xlsx()).unwrap().id(), "xlsx");
231 }
232}