Skip to main content

anomalyx_normalize/parsers/
xlsx.rs

1//! Excel / OpenDocument spreadsheet parser — the universal business handoff.
2//!
3//! The first worksheet of a workbook (`.xlsx`/`.xls`/`.xlsb`/`.ods`) becomes a
4//! RecordSet: the first row is the header (column names), each subsequent row is
5//! a record, and every cell maps to the closed [`Value`] set — so all detectors
6//! apply with no special-casing. A date/time cell keeps its Excel serial number
7//! (numeric, deterministic); blanks and error cells are `Null` (honest absence).
8//!
9//! Reading is delegated to `calamine` (pure Rust, all four formats). Detected by
10//! the ZIP magic plus an `xl/` or OpenDocument-spreadsheet marker; extensions
11//! `.xlsx`/`.xls`/`.xlsb`/`.ods`. Behind the default-on `xlsx` feature.
12
13use crate::parser::{Confidence, FormatParser, MAGIC};
14use crate::table::TableBuilder;
15use ax_core::{AxError, Column, Value};
16use calamine::{open_workbook_auto_from_rs, Data, Reader};
17use std::collections::BTreeMap;
18use std::io::Cursor;
19
20#[derive(Debug, Default, Clone)]
21pub struct XlsxParser;
22
23/// True if `needle` appears anywhere in `haystack`.
24fn contains_seq(haystack: &[u8], needle: &[u8]) -> bool {
25    haystack.windows(needle.len()).any(|w| w == needle)
26}
27
28/// Maps a calamine cell to the closed [`Value`] set.
29fn data_to_value(cell: &Data) -> Value {
30    match cell {
31        Data::Int(i) => Value::Int(*i),
32        Data::Float(f) => {
33            if f.is_finite() {
34                Value::Float(*f)
35            } else {
36                Value::Null
37            }
38        }
39        Data::String(s) => Value::Str(s.clone()),
40        Data::Bool(b) => Value::Bool(*b),
41        // Keep the Excel serial number — numeric and deterministic.
42        Data::DateTime(dt) => Value::Float(dt.as_f64()),
43        Data::DateTimeIso(s) | Data::DurationIso(s) => Value::Str(s.clone()),
44        Data::Error(_) | Data::Empty => Value::Null,
45    }
46}
47
48/// The column name for a header cell: a non-empty string cell verbatim, else a
49/// positional `col{index}`.
50fn header_name(cell: &Data, index: usize) -> String {
51    match cell {
52        Data::String(s) if !s.trim().is_empty() => s.trim().to_string(),
53        _ => format!("col{index}"),
54    }
55}
56
57impl XlsxParser {
58    fn err(&self, msg: impl std::fmt::Display) -> AxError {
59        AxError::Parse {
60            format: self.id().to_string(),
61            message: msg.to_string(),
62        }
63    }
64}
65
66impl FormatParser for XlsxParser {
67    fn id(&self) -> &'static str {
68        "xlsx"
69    }
70    fn extensions(&self) -> &'static [&'static str] {
71        &["xlsx", "xls", "xlsb", "ods"]
72    }
73    fn sniff(&self, bytes: &[u8]) -> Option<Confidence> {
74        if !bytes.starts_with(b"PK\x03\x04") {
75            return None; // not a ZIP (xls's OLE2 magic is handled by extension)
76        }
77        // A ZIP that is specifically a spreadsheet: xlsx/xlsb have an `xl/` part,
78        // ODS declares the OpenDocument-spreadsheet mimetype. (A docx/jar/plain
79        // zip has neither, so it is not claimed.)
80        (contains_seq(bytes, b"xl/") || contains_seq(bytes, b"opendocument.spreadsheet"))
81            .then_some(MAGIC)
82    }
83    fn parse(&self, _source: &str, bytes: &[u8]) -> Result<Vec<Column>, AxError> {
84        let mut workbook =
85            open_workbook_auto_from_rs(Cursor::new(bytes.to_vec())).map_err(|e| self.err(e))?;
86        let sheet = workbook
87            .sheet_names()
88            .first()
89            .cloned()
90            .ok_or_else(|| self.err("workbook has no sheets"))?;
91        let range = workbook.worksheet_range(&sheet).map_err(|e| self.err(e))?;
92
93        let mut rows = range.rows();
94        let Some(header) = rows.next() else {
95            return Ok(Vec::new()); // empty sheet → no columns
96        };
97        let names: Vec<String> = header
98            .iter()
99            .enumerate()
100            .map(|(i, cell)| header_name(cell, i))
101            .collect();
102
103        let mut builder = TableBuilder::new();
104        for row in rows {
105            let mut record: BTreeMap<String, Value> = BTreeMap::new();
106            for (name, cell) in names.iter().zip(row) {
107                record.insert(name.clone(), data_to_value(cell));
108            }
109            builder.push_row(record);
110        }
111        Ok(builder.finish())
112    }
113}
114
115#[cfg(test)]
116mod tests {
117    use super::*;
118    use calamine::{CellErrorType, ExcelDateTime, ExcelDateTimeType};
119    use rust_xlsxwriter::Workbook;
120
121    /// Writes a tiny .xlsx in-memory: header + two records (string / number / bool).
122    fn build_xlsx() -> Vec<u8> {
123        let mut wb = Workbook::new();
124        let ws = wb.add_worksheet();
125        for (c, h) in ["name", "score", "active"].iter().enumerate() {
126            ws.write(0, c as u16, *h).unwrap();
127        }
128        ws.write(1, 0, "alice").unwrap();
129        ws.write(1, 1, 95).unwrap();
130        ws.write(1, 2, true).unwrap();
131        ws.write(2, 0, "bob").unwrap();
132        ws.write(2, 1, 42.5).unwrap();
133        ws.write(2, 2, false).unwrap();
134        wb.save_to_buffer().unwrap()
135    }
136
137    fn col<'a>(cols: &'a [Column], name: &str) -> &'a Column {
138        cols.iter()
139            .find(|c| c.name == name)
140            .unwrap_or_else(|| panic!("missing column {name}"))
141    }
142
143    #[test]
144    fn roundtrip_first_sheet_to_records() {
145        let bytes = build_xlsx();
146        let cols = XlsxParser.parse("book.xlsx", &bytes).unwrap();
147        assert_eq!(
148            col(&cols, "name").cells,
149            vec![Value::Str("alice".into()), Value::Str("bob".into())]
150        );
151        let score = col(&cols, "score");
152        assert_eq!(score.numeric(), vec![95.0, 42.5]);
153        assert_eq!(
154            col(&cols, "active").cells,
155            vec![Value::Bool(true), Value::Bool(false)]
156        );
157    }
158
159    #[test]
160    fn data_to_value_units() {
161        assert_eq!(data_to_value(&Data::Int(7)), Value::Int(7));
162        assert_eq!(data_to_value(&Data::Float(1.5)), Value::Float(1.5));
163        assert_eq!(data_to_value(&Data::Float(f64::NAN)), Value::Null); // non-finite → null
164        assert_eq!(
165            data_to_value(&Data::String("x".into())),
166            Value::Str("x".into())
167        );
168        assert_eq!(data_to_value(&Data::Bool(true)), Value::Bool(true));
169        assert_eq!(data_to_value(&Data::Empty), Value::Null);
170        assert_eq!(
171            data_to_value(&Data::Error(CellErrorType::Div0)),
172            Value::Null
173        );
174        assert_eq!(
175            data_to_value(&Data::DateTimeIso("2021-01-01".into())),
176            Value::Str("2021-01-01".into())
177        );
178        // A date cell keeps its Excel serial number.
179        let dt = ExcelDateTime::new(44197.0, ExcelDateTimeType::DateTime, false);
180        assert_eq!(data_to_value(&Data::DateTime(dt)), Value::Float(44197.0));
181    }
182
183    #[test]
184    fn header_name_units() {
185        assert_eq!(header_name(&Data::String("score".into()), 1), "score");
186        assert_eq!(header_name(&Data::String("  ".into()), 1), "col1"); // blank → positional
187        assert_eq!(header_name(&Data::Empty, 2), "col2");
188        assert_eq!(header_name(&Data::Int(5), 0), "col0"); // non-string → positional
189    }
190
191    #[test]
192    fn malformed_input_errors() {
193        assert!(matches!(
194            XlsxParser.parse("book.xlsx", b"not a spreadsheet"),
195            Err(AxError::Parse { .. })
196        ));
197        // ZIP magic but not a valid workbook.
198        assert!(matches!(
199            XlsxParser.parse("book.xlsx", b"PK\x03\x04 garbage"),
200            Err(AxError::Parse { .. })
201        ));
202    }
203
204    #[test]
205    fn sniff_keys_on_zip_plus_spreadsheet_marker() {
206        assert_eq!(XlsxParser.sniff(&build_xlsx()), Some(MAGIC));
207        // A ZIP that is not a spreadsheet (e.g. a .docx has `word/`, not `xl/`).
208        assert_eq!(XlsxParser.sniff(b"PK\x03\x04....word/document.xml"), None);
209        assert_eq!(XlsxParser.sniff(b"not a zip"), None);
210        assert_eq!(XlsxParser.sniff(b"PK"), None); // too short for the magic
211    }
212
213    #[test]
214    fn contains_seq_units() {
215        assert!(contains_seq(b"hello xl/ world", b"xl/"));
216        assert!(!contains_seq(b"hello world", b"xl/"));
217        assert!(!contains_seq(b"ab", b"abc")); // needle longer than haystack
218    }
219
220    #[test]
221    fn claims_spreadsheet_extensions() {
222        assert_eq!(XlsxParser.extensions(), &["xlsx", "xls", "xlsb", "ods"]);
223    }
224
225    #[test]
226    fn resolves_by_extension_and_magic() {
227        let reg = crate::parser::ParserRegistry::default();
228        assert_eq!(reg.resolve("book.xlsx", b"zz").unwrap().id(), "xlsx");
229        assert_eq!(reg.resolve("sheet.ods", b"zz").unwrap().id(), "xlsx");
230        assert_eq!(reg.resolve("-", &build_xlsx()).unwrap().id(), "xlsx");
231    }
232}