Skip to main content

wolfxl_core/
sheet.rs

1use std::fs::File;
2use std::io::BufReader;
3
4use calamine_styles::{Data, Reader, Sheets};
5use chrono::{NaiveDate, NaiveDateTime, NaiveTime};
6
7use crate::cell::{Cell, CellValue};
8use crate::error::{Error, Result};
9use crate::workbook::WorkbookStyles;
10
11/// The calamine-styles reader bundle dispatch-wraps Xlsx/Xls/Xlsb/Ods
12/// behind a single enum. All four implement the `Reader` trait, so
13/// `worksheet_range` and `worksheet_style` work uniformly — xls/xlsb/ods
14/// return an empty `StyleRange` (styles walker is xlsx-only), which
15/// is the expected behavior.
16pub(crate) type SheetsReader = Sheets<BufReader<File>>;
17
18pub struct Sheet {
19    pub name: String,
20    rows: Vec<Vec<Cell>>,
21}
22
23impl Sheet {
24    pub(crate) fn load(
25        wb: &mut SheetsReader,
26        name: &str,
27        mut styles: Option<&mut WorkbookStyles>,
28    ) -> Result<Self> {
29        let value_range = wb
30            .worksheet_range(name)
31            .map_err(|e| Error::Xlsx(format!("read range for {name:?}: {e}")))?;
32
33        let style_range = wb.worksheet_style(name).ok();
34
35        // Pre-populate the per-cell styleId map once so we don't re-walk
36        // the worksheet XML per cell. Failure here (e.g. missing sheet
37        // part in the zip) degrades gracefully to the calamine-only path.
38        if let Some(s) = styles.as_mut() {
39            let _ = s.sheet_style_ids_mut(name);
40        }
41
42        let (h, w) = value_range.get_size();
43        let (start_row, start_col) = value_range.start().unwrap_or((0, 0));
44        let mut rows: Vec<Vec<Cell>> = Vec::with_capacity(h);
45        for r in 0..h {
46            let mut row: Vec<Cell> = Vec::with_capacity(w);
47            for c in 0..w {
48                let value = value_range
49                    .get((r, c))
50                    .map(data_to_cell_value)
51                    .unwrap_or(CellValue::Empty);
52                let absolute_position = absolute_position(start_row, start_col, r, c);
53                let number_format = style_range
54                    .as_ref()
55                    .and_then(|sr| {
56                        let (row, col) = absolute_position?;
57                        style_at_absolute_position(sr, row, col)
58                    })
59                    .and_then(extract_number_format)
60                    .or_else(|| {
61                        // Calamine fast path missed. Fall back to the
62                        // cellXfs walker for openpyxl-style workbooks
63                        // where Style::get_number_format returns None.
64                        styles
65                            .as_ref()
66                            .and_then(|s| walker_number_format(s, name, start_row, start_col, r, c))
67                    });
68                row.push(Cell {
69                    value,
70                    number_format,
71                });
72            }
73            rows.push(row);
74        }
75
76        Ok(Self {
77            name: name.to_string(),
78            rows,
79        })
80    }
81
82    pub fn dimensions(&self) -> (usize, usize) {
83        let h = self.rows.len();
84        let w = self.rows.first().map(|r| r.len()).unwrap_or(0);
85        (h, w)
86    }
87
88    /// Test-only constructor: build a `Sheet` from a pre-shaped grid without
89    /// round-tripping through xlsx. Lets crate-internal tests (e.g. the
90    /// classifier in `map.rs`) cover branches the committed fixtures don't
91    /// exercise.
92    #[cfg(test)]
93    pub(crate) fn from_rows_for_test(name: &str, rows: Vec<Vec<Cell>>) -> Self {
94        Self {
95            name: name.to_string(),
96            rows,
97        }
98    }
99
100    /// Build a `Sheet` from a pre-shaped grid. Used by the CSV backend
101    /// internally; also public so third-party callers (notably the
102    /// PyO3 bridge in the sibling `wolfxl` cdylib) can feed externally-
103    /// sourced rows through `infer_sheet_schema` / `classify_sheet`
104    /// without reading from disk. No styles / number formats are
105    /// attached - callers with that information should set
106    /// `Cell::number_format` on the cells they build.
107    pub fn from_rows(name: String, rows: Vec<Vec<Cell>>) -> Self {
108        Self { name, rows }
109    }
110
111    pub fn rows(&self) -> &[Vec<Cell>] {
112        &self.rows
113    }
114
115    pub fn row(&self, idx: usize) -> Option<&[Cell]> {
116        self.rows.get(idx).map(|r| r.as_slice())
117    }
118
119    /// First row stringified - the conventional "header" row for table-shaped
120    /// sheets. Empty cells become empty strings so position is preserved.
121    pub fn headers(&self) -> Vec<String> {
122        self.rows
123            .first()
124            .map(|row| {
125                row.iter()
126                    .map(|c| match &c.value {
127                        CellValue::String(s) => s.clone(),
128                        CellValue::Empty => String::new(),
129                        other => format_value_plain(other),
130                    })
131                    .collect()
132            })
133            .unwrap_or_default()
134    }
135}
136
137fn absolute_position(
138    start_row: u32,
139    start_col: u32,
140    row_offset: usize,
141    col_offset: usize,
142) -> Option<(u32, u32)> {
143    let row = start_row.checked_add(u32::try_from(row_offset).ok()?)?;
144    let col = start_col.checked_add(u32::try_from(col_offset).ok()?)?;
145    Some((row, col))
146}
147
148fn style_at_absolute_position(
149    range: &calamine_styles::StyleRange,
150    row: u32,
151    col: u32,
152) -> Option<&calamine_styles::Style> {
153    let (start_row, start_col) = range.start()?;
154    if row < start_row || col < start_col {
155        return None;
156    }
157    let rel_row = usize::try_from(row - start_row).ok()?;
158    let rel_col = usize::try_from(col - start_col).ok()?;
159    range.get((rel_row, rel_col))
160}
161
162fn format_value_plain(v: &CellValue) -> String {
163    match v {
164        CellValue::Empty => String::new(),
165        CellValue::String(s) => s.clone(),
166        CellValue::Int(n) => n.to_string(),
167        CellValue::Float(n) => n.to_string(),
168        CellValue::Bool(b) => if *b { "TRUE" } else { "FALSE" }.to_string(),
169        CellValue::Date(d) => d.format("%Y-%m-%d").to_string(),
170        CellValue::DateTime(dt) => dt.format("%Y-%m-%d %H:%M:%S").to_string(),
171        CellValue::Time(t) => t.format("%H:%M:%S").to_string(),
172        CellValue::Error(e) => e.clone(),
173    }
174}
175
176fn data_to_cell_value(d: &Data) -> CellValue {
177    match d {
178        Data::Empty => CellValue::Empty,
179        Data::String(s) => CellValue::String(s.clone()),
180        Data::Int(i) => CellValue::Int(*i),
181        Data::Float(f) => {
182            if f.fract() == 0.0 && f.abs() < (i64::MAX as f64) {
183                CellValue::Int(*f as i64)
184            } else {
185                CellValue::Float(*f)
186            }
187        }
188        Data::Bool(b) => CellValue::Bool(*b),
189        Data::DateTime(dt) => excel_serial_to_datetime(dt.as_f64()),
190        Data::DateTimeIso(s) => parse_iso_datetime_or_string(s),
191        Data::DurationIso(s) => CellValue::String(s.clone()),
192        Data::Error(e) => CellValue::Error(format!("{e:?}")),
193        Data::RichText(rt) => CellValue::String(rt.plain_text().to_string()),
194    }
195}
196
197/// Excel serial date → chrono. Sub-day fractions become Time; ≥1.0 with no
198/// fractional part becomes Date; otherwise DateTime. Serial 0.0 is the Excel
199/// epoch but for time-formatted cells means midnight; route it to Time(0,0,0)
200/// to match openpyxl rather than returning the epoch date.
201fn excel_serial_to_datetime(serial: f64) -> CellValue {
202    if serial < 1.0 && serial >= 0.0 {
203        let mut secs = (serial * 86_400.0).round() as u32;
204        // 0.99999999 rounds to 86_400, which makes h=24 and `from_hms_opt`
205        // returns None — without this carry, the prior fallback emitted
206        // `CellValue::Float(serial)` and silently demoted a time-typed
207        // cell to a numeric. Mirror the day-carry the date+time branch
208        // does below: for a pure sub-day value, "next midnight" is just
209        // 00:00:00.
210        if secs >= 86_400 {
211            secs -= 86_400;
212        }
213        let h = secs / 3600;
214        let m = (secs % 3600) / 60;
215        let s = secs % 60;
216        return NaiveTime::from_hms_opt(h, m, s)
217            .map(CellValue::Time)
218            .unwrap_or_else(|| CellValue::Float(serial));
219    }
220    let mut days = serial.trunc() as i64;
221    let frac = serial - (days as f64);
222    // Excel's 1900 leap-year bug: serial 60 maps to the non-existent
223    // 1900-02-29. openpyxl uses base 1899-12-30 (instead of 1899-12-31) to
224    // dodge the bug for serials >= 60, but that leaves serials 1..59 off by
225    // one day. The +1 correction restores serial 1 -> 1900-01-01 etc., which
226    // matches openpyxl.utils.datetime.from_excel.
227    if serial > 0.0 && serial < 60.0 {
228        days += 1;
229    }
230    if frac.abs() < f64::EPSILON {
231        return CellValue::Date(days_to_date_from_excel_base(days));
232    }
233    // Keep the day-fraction arithmetic signed until normalized into
234    // [0, 86_400) — a negative serial like -0.5 produces frac = -0.5 here,
235    // and the prior `(frac * 86_400).round() as u32` would wrap a negative
236    // f64 to a huge positive u32, then the next-day carry branch would
237    // emit a corrupted pre-1900 datetime. Borrow whole days off `days`
238    // until secs lands in the valid range; then carry forward the same
239    // way the existing 0.99999999 → 86_400 case does.
240    let mut secs_signed = (frac * 86_400.0).round() as i64;
241    if secs_signed < 0 {
242        let borrow_days = (-secs_signed + 86_399) / 86_400; // ceil division
243        secs_signed += borrow_days * 86_400;
244        days -= borrow_days;
245    } else if secs_signed >= 86_400 {
246        let carry_days = secs_signed / 86_400;
247        secs_signed -= carry_days * 86_400;
248        days += carry_days;
249    }
250    let secs = secs_signed as u32; // now in [0, 86_400)
251    let date = days_to_date_from_excel_base(days);
252    let h = secs / 3600;
253    let m = (secs % 3600) / 60;
254    let s = secs % 60;
255    let time = NaiveTime::from_hms_opt(h, m, s)
256        .unwrap_or_else(|| NaiveTime::from_hms_opt(0, 0, 0).unwrap());
257    CellValue::DateTime(NaiveDateTime::new(date, time))
258}
259
260fn days_to_date_from_excel_base(days: i64) -> NaiveDate {
261    let base = NaiveDate::from_ymd_opt(1899, 12, 30).expect("static date");
262    if days >= 0 {
263        base.checked_add_days(chrono::Days::new(days as u64))
264    } else {
265        // u64 cast on negative i64 wraps; subtract the absolute value.
266        base.checked_sub_days(chrono::Days::new((-days) as u64))
267    }
268    .unwrap_or(base)
269}
270
271fn parse_iso_datetime_or_string(s: &str) -> CellValue {
272    if let Ok(dt) = NaiveDateTime::parse_from_str(s, "%Y-%m-%dT%H:%M:%S%.f") {
273        return CellValue::DateTime(dt);
274    }
275    if let Ok(dt) = NaiveDateTime::parse_from_str(s, "%Y-%m-%dT%H:%M:%S") {
276        return CellValue::DateTime(dt);
277    }
278    if let Ok(d) = NaiveDate::parse_from_str(s, "%Y-%m-%d") {
279        return CellValue::Date(d);
280    }
281    CellValue::String(s.to_string())
282}
283
284/// Pull the resolved format-code string off a calamine-styles `Style`. The
285/// upstream crate handles built-in vs custom (>=164) resolution; we just
286/// normalize the result and skip the no-op "General".
287fn extract_number_format(style: &calamine_styles::Style) -> Option<String> {
288    let nf = style.get_number_format()?;
289    let code = nf.format_code.trim();
290    if code.is_empty() || code.eq_ignore_ascii_case("general") {
291        None
292    } else {
293        Some(code.to_string())
294    }
295}
296
297/// Walker fallback: look up the cell's styleId in the pre-parsed map and
298/// resolve it against cellXfs + numFmts. Returns `None` when the cell has
299/// no style override (the common case) or when the referenced format is
300/// `General` / absent.
301fn walker_number_format(
302    styles: &WorkbookStyles,
303    sheet_name: &str,
304    start_row: u32,
305    start_col: u32,
306    r: usize,
307    c: usize,
308) -> Option<String> {
309    let row = start_row.checked_add(u32::try_from(r).ok()?)?;
310    let col = start_col.checked_add(u32::try_from(c).ok()?)?;
311    let style_id = styles
312        .sheet_style_ids(sheet_name)?
313        .get(&(row, col))
314        .copied()?;
315    styles
316        .number_format_for_style_id(style_id)
317        .map(|s| s.to_string())
318}
319
320#[cfg(test)]
321mod tests {
322    use chrono::Datelike;
323
324    use super::*;
325
326    fn date(value: CellValue) -> NaiveDate {
327        match value {
328            CellValue::Date(d) => d,
329            other => panic!("expected Date, got {other:?}"),
330        }
331    }
332
333    #[test]
334    fn absolute_position_overflow_returns_none() {
335        assert_eq!(absolute_position(10, 20, 2, 3), Some((12, 23)));
336        assert_eq!(absolute_position(u32::MAX, 20, 1, 0), None);
337        assert_eq!(absolute_position(10, u32::MAX, 0, 1), None);
338        assert_eq!(absolute_position(10, 20, usize::MAX, 0), None);
339    }
340
341    #[test]
342    fn excel_serial_matches_openpyxl_for_pre_leap_serials() {
343        // openpyxl maps serial 1 -> 1900-01-01 thanks to its +1 correction
344        // for serials in (0, 60). Serial 59 -> 1900-02-28.
345        assert_eq!(
346            date(excel_serial_to_datetime(1.0)),
347            NaiveDate::from_ymd_opt(1900, 1, 1).unwrap()
348        );
349        assert_eq!(
350            date(excel_serial_to_datetime(59.0)),
351            NaiveDate::from_ymd_opt(1900, 2, 28).unwrap()
352        );
353        // Serial 61 -> 1900-03-01 (Excel's fake serial-60 leap day is skipped).
354        assert_eq!(
355            date(excel_serial_to_datetime(61.0)),
356            NaiveDate::from_ymd_opt(1900, 3, 1).unwrap()
357        );
358        // A modern serial: 44197 -> 2021-01-01.
359        assert_eq!(
360            date(excel_serial_to_datetime(44197.0)),
361            NaiveDate::from_ymd_opt(2021, 1, 1).unwrap()
362        );
363    }
364
365    #[test]
366    fn excel_serial_negative_does_not_wrap() {
367        // Bad/sentinel serials shouldn't panic or produce a date in the
368        // far future via u64 wrap. Fall back to the epoch.
369        let value = excel_serial_to_datetime(-100.0);
370        let d = date(value);
371        assert!(d.year() < 1900, "got {d}");
372    }
373
374    #[test]
375    fn excel_serial_sub_day_near_midnight_carries_to_zero_time() {
376        // 0.99999999 rounds to 86_400 secs (h=24 is invalid). The prior
377        // fallback emitted CellValue::Float(serial), silently demoting a
378        // time-typed cell to a numeric. The carry should land on
379        // Time(00:00:00) — equivalent of "next midnight" with no date to
380        // carry into.
381        let value = excel_serial_to_datetime(0.99999999);
382        match value {
383            CellValue::Time(t) => {
384                assert_eq!(t, NaiveTime::from_hms_opt(0, 0, 0).unwrap());
385            }
386            other => panic!("expected Time(00:00:00), got {other:?}"),
387        }
388    }
389
390    #[test]
391    fn excel_serial_zero_returns_midnight_time() {
392        // Serial 0 is the Excel epoch (1899-12-30) but for time-formatted
393        // cells means midnight. openpyxl returns Time(0,0,0) here; we
394        // match that rather than emitting the epoch date.
395        let value = excel_serial_to_datetime(0.0);
396        match value {
397            CellValue::Time(t) => {
398                assert_eq!(t, NaiveTime::from_hms_opt(0, 0, 0).unwrap());
399            }
400            other => panic!("expected Time(00:00:00), got {other:?}"),
401        }
402    }
403
404    #[test]
405    fn excel_serial_negative_fractional_borrows_into_prior_day() {
406        // Serial -0.5 means "12:00 the day before 1899-12-30", i.e.
407        // 1899-12-29 12:00:00. The prior code computed
408        // `(frac * 86_400).round() as u32` where frac was -0.5; the
409        // negative→u32 cast wrapped to a huge positive, and the
410        // "carry into next day" branch then emitted a corrupted
411        // far-future datetime. Signed arithmetic with a borrow keeps
412        // the result in chrono's representable range and on the
413        // correct calendar day.
414        let value = excel_serial_to_datetime(-0.5);
415        match value {
416            CellValue::DateTime(dt) => {
417                assert_eq!(
418                    dt.date(),
419                    NaiveDate::from_ymd_opt(1899, 12, 29).unwrap(),
420                    "expected borrow into prior day, got {dt}",
421                );
422                assert_eq!(dt.time(), NaiveTime::from_hms_opt(12, 0, 0).unwrap());
423            }
424            other => panic!("expected DateTime, got {other:?}"),
425        }
426    }
427
428    #[test]
429    fn excel_serial_carries_near_midnight_fraction_to_next_day() {
430        // 44197 + 0.99999999 rounds up to 86_400 secs in the day-fraction
431        // calc; that must carry into 2021-01-02 00:00:00 instead of clamping
432        // to 23:00:00 on 2021-01-01.
433        let value = excel_serial_to_datetime(44197.0 + 0.99999999);
434        match value {
435            CellValue::DateTime(dt) => {
436                assert_eq!(dt.date(), NaiveDate::from_ymd_opt(2021, 1, 2).unwrap(),);
437                assert_eq!(dt.time(), NaiveTime::from_hms_opt(0, 0, 0).unwrap());
438            }
439            other => panic!("expected DateTime, got {other:?}"),
440        }
441    }
442}