Skip to main content

xl3_core/
source.rs

1//! Source data reader. Reads the data workbook (`data.xlsx`) into row
2//! records keyed by the header row.
3//!
4//! Phase 1 P1-A scope: single source per workbook (the one named by
5//! `__config__.source_sheet`). Multi-source declaration via `__sources__`
6//! lands later.
7
8use std::collections::HashMap;
9use std::io::Cursor;
10use std::path::Path;
11
12use anyhow::{anyhow, bail, Context, Result};
13
14use crate::calamine::{Data as CData, Reader, Xlsx};
15use crate::plan::SourceTable;
16use crate::styles::{self, NumFmtKind, TemplateStyles};
17use crate::value::Value;
18
19#[derive(Debug, Clone)]
20pub struct SourceData {
21    pub name: String,
22    pub headers: Vec<String>,
23    pub rows: Vec<HashMap<String, Value>>,
24}
25
26pub trait SourceReader {
27    fn read(&mut self, sheet: &str, table: &SourceTable) -> Result<SourceData>;
28}
29
30/// Default `SourceReader` backed by `calamine`. Assumes the source is
31/// laid out as a single header row at row 1, followed by data rows.
32/// (xl3 0.x's `source_table = 1` convention.) The workbook is held as
33/// an in-memory byte buffer so the styles pass can re-open the same
34/// archive cheaply — and so a `Vec<u8>` host (browser/Node) and a
35/// file host share the exact same code path.
36pub struct CalamineSourceReader {
37    workbook: Xlsx<Cursor<Vec<u8>>>,
38    bytes: Vec<u8>,
39    /// Source workbook styles — only parsed lazily on first need.
40    /// Used to render numeric cells whose numFmt is a date format as
41    /// canonical ISO strings (ADR-0017).
42    styles: Option<TemplateStyles>,
43}
44
45impl CalamineSourceReader {
46    pub fn open(path: &Path) -> Result<Self> {
47        let bytes = std::fs::read(path)
48            .with_context(|| format!("read data workbook at {}", path.display()))?;
49        Self::open_bytes(bytes)
50    }
51
52    /// In-memory variant — the WASM `convert()` entry point feeds the
53    /// data workbook in this way.
54    pub fn open_bytes(bytes: Vec<u8>) -> Result<Self> {
55        let cursor = Cursor::new(bytes.clone());
56        let workbook: Xlsx<_> =
57            Xlsx::new(cursor).context("open data workbook from bytes")?;
58        Ok(CalamineSourceReader {
59            workbook,
60            bytes,
61            styles: None,
62        })
63    }
64
65    fn ensure_styles(&mut self) -> &TemplateStyles {
66        if self.styles.is_none() {
67            self.styles =
68                Some(styles::parse_template_styles_bytes(&self.bytes).unwrap_or_default());
69        }
70        self.styles.as_ref().unwrap()
71    }
72
73    /// Convenience: when only one source is named in `__config__`, load
74    /// the first non-empty sheet.
75    pub fn first_sheet(&self) -> Option<String> {
76        self.workbook.sheet_names().into_iter().next()
77    }
78
79    /// Resolve a `source_sheet` configuration value to an actual sheet
80    /// name in the workbook. Accepts:
81    /// - exact name (`"Data"`) — returned as-is iff present
82    /// - prefix glob (`"Data_*"`) — first sheet in workbook order whose
83    ///   name starts with the literal prefix (xl3 evaluation.md
84    ///   "Source Data Model")
85    pub fn resolve_sheet_name(&self, pattern: &str) -> Option<String> {
86        if let Some(prefix) = pattern.strip_suffix('*') {
87            self.workbook
88                .sheet_names()
89                .into_iter()
90                .find(|n| n.starts_with(prefix))
91        } else {
92            let pattern = pattern.to_string();
93            self.workbook
94                .sheet_names()
95                .into_iter()
96                .find(|n| n == &pattern)
97        }
98    }
99}
100
101/// True if a value should be treated as blank.
102/// Matches xl3's ADR-0007: explicit `Empty` plus strings that contain
103/// only whitespace. Other types — numbers (including 0), booleans
104/// (including `false`) — are NOT blank.
105///
106/// Used by:
107/// - source reader (skip blank rows)
108/// - `COUNT([Field])` row aggregate (only non-blank values count)
109/// - any future code that needs the same notion of "missing data"
110pub fn is_blank_value(v: &Value) -> bool {
111    match v {
112        Value::Empty => true,
113        Value::String(s) => s.chars().all(|c| c.is_whitespace()),
114        _ => false,
115    }
116}
117
118impl SourceReader for CalamineSourceReader {
119    fn read(&mut self, sheet: &str, table: &SourceTable) -> Result<SourceData> {
120        // Styles & merge info — pulled before worksheet_range so the
121        // two `&mut self.workbook` borrows don't overlap.
122        self.ensure_styles();
123        let styles = self.styles.clone().unwrap_or_default();
124        let header_merges = self
125            .workbook
126            .worksheet_merge_cells(sheet)
127            .and_then(|r| r.ok())
128            .unwrap_or_default();
129
130        let range = self
131            .workbook
132            .worksheet_range(sheet)
133            .with_context(|| format!("read source sheet {sheet:?}"))?;
134
135        if range.get_size() == (0, 0) {
136            return Ok(SourceData {
137                name: sheet.to_string(),
138                headers: vec![],
139                rows: vec![],
140            });
141        }
142
143        // calamine returns a `Range` whose `(0, 0)` is the first *used*
144        // cell, not sheet A1. The xl3 conventions (and the SourceTable
145        // values we receive) are in absolute 1-based A1 coordinates, so
146        // we read via `Range::get_value(absolute)` and never use the
147        // relative `get`. `end()` gives the absolute bottom-right (also
148        // 0-based) so we know how far down to walk.
149        let (last_row_abs, last_col_abs) = range
150            .end()
151            .map(|(r, c)| (r as usize, c as usize))
152            .unwrap_or((0, 0));
153
154        // Resolve the (header_row, data_row_range, col_range) tuple
155        // from the SourceTable. All indices below are absolute, 0-based.
156        let (header_row, data_first, data_last_excl, col_first, col_last_excl) = match table {
157            SourceTable::HeaderRow(n) => {
158                let header = n.saturating_sub(1);
159                let row_end_excl = last_row_abs + 1;
160                let col_end_excl = last_col_abs + 1;
161                (header, header + 1, row_end_excl, 0usize, col_end_excl)
162            }
163            SourceTable::Range {
164                first_row,
165                last_row,
166                first_col,
167                last_col,
168            } => {
169                let header = first_row.saturating_sub(1);
170                let data_first = header + 1;
171                let data_last_excl = match last_row {
172                    Some(lr) => (*lr).min(last_row_abs + 1),
173                    None => last_row_abs + 1,
174                };
175                let col_first0 = first_col.saturating_sub(1);
176                let col_last_excl0 = match last_col {
177                    Some(lc) => (*lc).min(last_col_abs + 1),
178                    None => last_col_abs + 1,
179                };
180                (header, data_first, data_last_excl, col_first0, col_last_excl0)
181            }
182        };
183
184        if header_row > last_row_abs {
185            return Err(anyhow!(
186                "source sheet {sheet:?} header row {} is past the last used row {}",
187                header_row + 1,
188                last_row_abs + 1
189            ));
190        }
191
192        // Reverse map: any cell *inside* a merge that isn't the master
193        // borrows the master's value (ADR-0033 vertical slave rule).
194        let slave_to_master: HashMap<(usize, usize), (u32, u32)> = header_merges
195            .iter()
196            .flat_map(|d| {
197                let master = (d.start.0, d.start.1);
198                let mut out: Vec<((usize, usize), (u32, u32))> = Vec::new();
199                for r in d.start.0..=d.end.0 {
200                    for c in d.start.1..=d.end.1 {
201                        if (r, c) != master {
202                            out.push(((r as usize, c as usize), master));
203                        }
204                    }
205                }
206                out
207            })
208            .collect();
209        // Per-master end column lookup — used to jump past all
210        // horizontal slave columns once we've emitted a master.
211        let master_end_col: HashMap<(u32, u32), usize> = header_merges
212            .iter()
213            .map(|d| ((d.start.0, d.start.1), d.end.1 as usize))
214            .collect();
215        let cell_at = |r: usize, c: usize| -> Option<&CData> {
216            if let Some(&(mr, mc)) = slave_to_master.get(&(r, c)) {
217                range.get_value((mr, mc))
218            } else {
219                range.get_value((r as u32, c as u32))
220            }
221        };
222
223        // Header span: cells in (header_row, col_first..col_last_excl)
224        // up until the first blank. A merged-header master claims its
225        // whole run as one logical column.
226        let mut headers: Vec<String> = Vec::new();
227        let mut header_cols: Vec<usize> = Vec::new();
228        let mut c = col_first;
229        let mut seen_masters: std::collections::HashSet<(u32, u32)> =
230            std::collections::HashSet::new();
231        while c < col_last_excl {
232            // The effective master position for (header_row, c): if the
233            // cell is a merge slave, follow it to the master; otherwise
234            // the cell is its own master.
235            let master_pos = slave_to_master
236                .get(&(header_row, c))
237                .copied()
238                .unwrap_or((header_row as u32, c as u32));
239
240            // ADR-0033: a master claims its whole horizontal run as one
241            // logical column. Subsequent slave columns sharing the same
242            // master must not contribute additional headers.
243            if !seen_masters.insert(master_pos) {
244                c += 1;
245                continue;
246            }
247
248            let cell = range.get_value(master_pos);
249            match cell {
250                Some(CData::String(s)) if !s.is_empty() => {
251                    headers.push(s.clone());
252                    header_cols.push(c);
253                    // Jump past the merge's horizontal extent so we
254                    // resume scanning at the next logical column.
255                    if let Some(&end_col) = master_end_col.get(&master_pos) {
256                        c = end_col + 1;
257                    } else {
258                        c += 1;
259                    }
260                }
261                None | Some(CData::Empty) => break,
262                Some(other) => {
263                    bail!(
264                        "source header at column {c} is not a string: {other:?} (xl3 expects text headers)"
265                    );
266                }
267            }
268        }
269
270        if headers.is_empty() {
271            return Err(anyhow!("source sheet {sheet:?} has no header row"));
272        }
273
274        let mut data_rows = Vec::with_capacity(data_last_excl.saturating_sub(data_first));
275        for r in data_first..data_last_excl {
276            let mut record = HashMap::with_capacity(headers.len());
277            let mut row_blank = true;
278            for (i, header) in headers.iter().enumerate() {
279                let c = header_cols[i];
280                let raw = cell_at(r, c)
281                    .map(Value::from_calamine)
282                    .unwrap_or(Value::Empty);
283                // ADR-0017: a numeric source cell whose style is a date
284                // numFmt becomes Value::DateNumber. Arithmetic and
285                // comparison treat it like Number; only `canonical()`
286                // (used by string concat) renders the ISO form.
287                let v = match &raw {
288                    Value::Number(n) => {
289                        let kind = styles
290                            .format_code(sheet, r as u32, c as u32)
291                            .as_deref()
292                            .map(styles::classify_num_fmt)
293                            .unwrap_or(NumFmtKind::General);
294                        if kind == NumFmtKind::Date {
295                            Value::DateNumber(*n)
296                        } else {
297                            raw
298                        }
299                    }
300                    _ => raw,
301                };
302                if !is_blank_value(&v) {
303                    row_blank = false;
304                }
305                record.insert(header.clone(), v);
306            }
307            if row_blank {
308                continue;
309            }
310            data_rows.push(record);
311        }
312
313        Ok(SourceData {
314            name: sheet.to_string(),
315            headers,
316            rows: data_rows,
317        })
318    }
319}