Skip to main content

wolfxl_core/
workbook.rs

1use std::collections::HashMap;
2use std::fs::File;
3use std::path::{Path, PathBuf};
4
5use calamine_styles::{open_workbook_auto, Reader, Sheets};
6use zip::ZipArchive;
7
8use crate::csv_reader::CsvBackend;
9use crate::error::{Error, Result};
10use crate::map::{classify_sheet, SheetMap, WorkbookMap};
11use crate::ooxml::{
12    join_and_normalize, parse_relationship_targets, parse_workbook_sheet_rids, zip_read_to_string,
13    zip_read_to_string_opt,
14};
15use crate::sheet::{Sheet, SheetsReader};
16use crate::styles::{parse_cellxfs, parse_num_fmts, XfEntry};
17use crate::worksheet_xml::parse_cell_style_ids;
18
/// Source format detected from the file extension. Drives which calamine
/// backend (or CSV reader) handles the workbook and gates xlsx-only
/// features like the styles walker and table parsing.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum SourceFormat {
    /// OOXML workbook; selected for the `.xlsx`, `.xlsm`, and `.xlam` extensions.
    Xlsx,
    /// Selected for the `.xls` and `.xla` extensions.
    Xls,
    /// Selected for the `.xlsb` extension.
    Xlsb,
    /// Selected for the `.ods` extension.
    Ods,
    /// Delimited/plain text; selected for the `.csv`, `.tsv`, and `.txt` extensions.
    Csv,
}
30
31impl SourceFormat {
32    fn from_extension(path: &Path) -> Result<Self> {
33        let ext = path
34            .extension()
35            .and_then(|e| e.to_str())
36            .map(|s| s.to_ascii_lowercase());
37        match ext.as_deref() {
38            Some("xlsx" | "xlsm" | "xlam") => Ok(SourceFormat::Xlsx),
39            Some("xls" | "xla") => Ok(SourceFormat::Xls),
40            Some("xlsb") => Ok(SourceFormat::Xlsb),
41            Some("ods") => Ok(SourceFormat::Ods),
42            Some("csv" | "tsv" | "txt") => Ok(SourceFormat::Csv),
43            Some(other) => Err(Error::Format(format!(
44                "unsupported file extension: .{other} (supported: xlsx, xlsm, xlam, xls, xla, xlsb, ods, csv, tsv, txt)"
45            ))),
46            None => Err(Error::Format(
47                "cannot detect format: file has no extension".to_string(),
48            )),
49        }
50    }
51}
52
/// Internal backend dispatch. Xlsx/Xls/Xlsb/Ods all flow through calamine's
/// `Sheets` enum (which already abstracts them); CSV gets its own minimal
/// value-only backend because it isn't a calamine-supported format.
pub(crate) enum Backend {
    /// Calamine-backed reader used for the xlsx, xls, xlsb, and ods formats.
    Sheets(SheetsReader),
    /// Value-only reader used when the source format is CSV.
    Csv(CsvBackend),
}
60
/// Pre-parsed style tables and worksheet-path lookup for a workbook.
///
/// Populated lazily on first sheet load and then shared across sheets.
/// Each sheet's per-cell `(row, col) → styleId` map is populated on demand
/// via [`WorkbookStyles::sheet_style_ids_mut`] to avoid walking every
/// worksheet XML up-front on workbooks where only one sheet is read.
pub struct WorkbookStyles {
    /// Entries parsed from `xl/styles.xml` by `parse_cellxfs`; indexed by styleId.
    cell_xfs: Vec<XfEntry>,
    /// Custom number formats parsed from `xl/styles.xml`, keyed by numFmt id.
    num_fmts: HashMap<u32, String>,
    /// Sheet name → path of that sheet's worksheet XML part inside the zip.
    sheet_xml_paths: HashMap<String, String>,
    /// Sheet name → cached per-cell styleId map; filled one sheet at a time.
    per_sheet_style_ids: HashMap<String, HashMap<(u32, u32), u32>>,
    /// Path of the workbook file, kept so the zip can be reopened on demand.
    zip_path: PathBuf,
}
74
75impl WorkbookStyles {
76    fn load(zip_path: &Path) -> Result<Self> {
77        let file = File::open(zip_path)?;
78        let mut zip = ZipArchive::new(file)
79            .map_err(|e| Error::Xlsx(format!("failed to open xlsx zip: {e}")))?;
80
81        let styles_xml = zip_read_to_string_opt(&mut zip, "xl/styles.xml")?;
82        let cell_xfs = match styles_xml.as_deref() {
83            Some(xml) => parse_cellxfs(xml),
84            None => Vec::new(),
85        };
86        let num_fmts = match styles_xml.as_deref() {
87            Some(xml) => parse_num_fmts(xml)?,
88            None => HashMap::new(),
89        };
90
91        let workbook_xml = zip_read_to_string(&mut zip, "xl/workbook.xml")?;
92        let rels_xml = zip_read_to_string(&mut zip, "xl/_rels/workbook.xml.rels")?;
93        let sheet_rids = parse_workbook_sheet_rids(&workbook_xml)?;
94        let rel_targets = parse_relationship_targets(&rels_xml)?;
95        let mut sheet_xml_paths: HashMap<String, String> = HashMap::new();
96        for (name, rid) in sheet_rids {
97            if let Some(target) = rel_targets.get(&rid) {
98                sheet_xml_paths.insert(name, join_and_normalize("xl/", target));
99            }
100        }
101
102        Ok(Self {
103            cell_xfs,
104            num_fmts,
105            sheet_xml_paths,
106            per_sheet_style_ids: HashMap::new(),
107            zip_path: zip_path.to_path_buf(),
108        })
109    }
110
111    /// Resolve a styleId to a number-format string, consulting custom
112    /// numFmts first and the Excel built-in table second. Returns `None`
113    /// for styleId 0 (default), unknown IDs, or when the resolved code is
114    /// the no-op `"General"`.
115    pub fn number_format_for_style_id(&self, style_id: u32) -> Option<&str> {
116        if style_id == 0 {
117            return None;
118        }
119        let xf = self.cell_xfs.get(style_id as usize)?;
120        let code = crate::styles::resolve_num_fmt(xf.num_fmt_id, &self.num_fmts)?;
121        if code.trim().is_empty() || code.eq_ignore_ascii_case("General") {
122            None
123        } else {
124            Some(code)
125        }
126    }
127
128    /// Read-only access to the per-cell styleId map for a sheet. Returns
129    /// `None` until [`WorkbookStyles::sheet_style_ids_mut`] has populated
130    /// it. Used on the per-cell fast path where mutation would require
131    /// exclusive access.
132    pub fn sheet_style_ids(&self, sheet_name: &str) -> Option<&HashMap<(u32, u32), u32>> {
133        self.per_sheet_style_ids.get(sheet_name)
134    }
135
136    /// Lazily populate the per-cell styleId map for a sheet. Returns a
137    /// reference to the cached map. Reading the XML is the expensive part;
138    /// `&mut self` makes caching explicit.
139    pub fn sheet_style_ids_mut(&mut self, sheet_name: &str) -> Result<&HashMap<(u32, u32), u32>> {
140        if !self.per_sheet_style_ids.contains_key(sheet_name) {
141            let Some(path) = self.sheet_xml_paths.get(sheet_name).cloned() else {
142                self.per_sheet_style_ids
143                    .insert(sheet_name.to_string(), HashMap::new());
144                return Ok(self.per_sheet_style_ids.get(sheet_name).unwrap());
145            };
146            let file = File::open(&self.zip_path)?;
147            let mut zip = ZipArchive::new(file)
148                .map_err(|e| Error::Xlsx(format!("failed to open xlsx zip: {e}")))?;
149            let map = match zip_read_to_string_opt(&mut zip, &path)? {
150                Some(xml) => parse_cell_style_ids(&xml)?,
151                None => HashMap::new(),
152            };
153            self.per_sheet_style_ids.insert(sheet_name.to_string(), map);
154        }
155        Ok(self.per_sheet_style_ids.get(sheet_name).unwrap())
156    }
157
158    /// Test-only access to the parsed cellXfs table.
159    #[cfg(test)]
160    pub fn cell_xfs(&self) -> &[XfEntry] {
161        &self.cell_xfs
162    }
163}
164
165pub struct Workbook {
166    inner: Backend,
167    sheet_names: Vec<String>,
168    path: PathBuf,
169    format: SourceFormat,
170    styles: Option<WorkbookStyles>,
171}
172
173impl Workbook {
174    /// Open a workbook, dispatching to the right backend by file extension.
175    ///
176    /// Supported: `.xlsx` / `.xlsm` / `.xlam` (primary, full style resolution via
177    /// calamine fast-path + cellXfs walker), `.xls` / `.xla` / `.xlsb` / `.ods`
178    /// (values + defined names via calamine; styles come back empty -
179    /// calamine-styles doesn't parse them for these formats yet), and
180    /// `.csv` / `.tsv` / `.txt` (single synthetic sheet, value-only, schema
181    /// inference is the source of truth for column types).
182    pub fn open(path: impl AsRef<Path>) -> Result<Self> {
183        let path = path.as_ref().to_path_buf();
184        let format = SourceFormat::from_extension(&path)?;
185
186        match format {
187            SourceFormat::Xlsx | SourceFormat::Xls | SourceFormat::Xlsb | SourceFormat::Ods => {
188                let mut inner: SheetsReader = open_workbook_auto(&path)
189                    .map_err(|e| Error::Xlsx(format!("failed to open workbook: {e}")))?;
190                // Tables only exist on xlsx; load_tables is xlsx-specific
191                // and panics later in `table_names_in_sheet` if skipped.
192                if let Sheets::Xlsx(ref mut x) = inner {
193                    let _ = x.load_tables();
194                }
195                let sheet_names = inner.sheet_names().to_vec();
196                Ok(Self {
197                    inner: Backend::Sheets(inner),
198                    sheet_names,
199                    path,
200                    format,
201                    styles: None,
202                })
203            }
204            SourceFormat::Csv => {
205                let backend = CsvBackend::open(&path)?;
206                let sheet_names = backend.sheet_names();
207                Ok(Self {
208                    inner: Backend::Csv(backend),
209                    sheet_names,
210                    path,
211                    format,
212                    styles: None,
213                })
214            }
215        }
216    }
217
218    pub fn path(&self) -> &Path {
219        &self.path
220    }
221
222    pub fn format(&self) -> SourceFormat {
223        self.format
224    }
225
226    pub fn sheet_names(&self) -> &[String] {
227        &self.sheet_names
228    }
229
230    /// Lazy accessor for the pre-parsed styles bundle. First call walks
231    /// `xl/styles.xml` + `xl/workbook.xml` + rels; subsequent calls reuse
232    /// the cached [`WorkbookStyles`].
233    ///
234    /// Only meaningful for `.xlsx` / `.xlsm` / `.xlam` - for other formats returns
235    /// an error since there is no OOXML styles part to parse.
236    pub fn styles(&mut self) -> Result<&mut WorkbookStyles> {
237        if self.format != SourceFormat::Xlsx {
238            return Err(Error::Xlsx(format!(
239                "styles walker only supports xlsx/xlsm/xlam; workbook format is {:?}",
240                self.format
241            )));
242        }
243        if self.styles.is_none() {
244            self.styles = Some(WorkbookStyles::load(&self.path)?);
245        }
246        Ok(self.styles.as_mut().unwrap())
247    }
248
249    /// Load a sheet by name. Reads the entire range eagerly; for huge sheets,
250    /// callers should pass a row cap downstream rather than load everything.
251    pub fn sheet(&mut self, name: &str) -> Result<Sheet> {
252        if !self.sheet_names.iter().any(|n| n == name) {
253            return Err(Error::SheetNotFound(name.to_string()));
254        }
255        match &mut self.inner {
256            Backend::Sheets(sheets) => {
257                // Styles walker is xlsx-only; other formats skip the lazy
258                // load entirely so `styles` stays None and the fallback
259                // path in `Sheet::load` never fires.
260                if self.format == SourceFormat::Xlsx && self.styles.is_none() {
261                    self.styles = WorkbookStyles::load(&self.path).ok();
262                }
263                Sheet::load(sheets, name, self.styles.as_mut())
264            }
265            Backend::Csv(csv) => csv.load_sheet(name),
266        }
267    }
268
269    /// Convenience: first sheet in workbook order.
270    pub fn first_sheet(&mut self) -> Result<Sheet> {
271        let name = self
272            .sheet_names
273            .first()
274            .ok_or_else(|| Error::SheetNotFound("(workbook has no sheets)".to_string()))?
275            .clone();
276        self.sheet(&name)
277    }
278
279    /// Workbook-level defined names as `(name, formula)` pairs. Empty for
280    /// CSV (no concept of named ranges).
281    pub fn named_ranges(&self) -> Vec<(String, String)> {
282        match &self.inner {
283            Backend::Sheets(s) => s.defined_names().to_vec(),
284            Backend::Csv(_) => Vec::new(),
285        }
286    }
287
288    /// Names of workbook tables anchored on the given sheet. Xlsx-only;
289    /// returns empty on other formats since tables are an xlsx feature.
290    pub fn table_names_in_sheet(&self, sheet_name: &str) -> Vec<String> {
291        match &self.inner {
292            Backend::Sheets(Sheets::Xlsx(x)) => x
293                .table_names_in_sheet(sheet_name)
294                .into_iter()
295                .cloned()
296                .collect(),
297            _ => Vec::new(),
298        }
299    }
300
301    /// Build a one-page summary: every sheet's dimensions, headers,
302    /// classification, and anchored tables, plus workbook-level defined
303    /// names. Loads each sheet eagerly to compute density for the
304    /// classifier — for huge workbooks the caller bears that IO cost.
305    pub fn map(&mut self) -> Result<WorkbookMap> {
306        let path = self.path.to_string_lossy().into_owned();
307        let named_ranges = self.named_ranges();
308        let names = self.sheet_names.clone();
309        let mut sheets = Vec::with_capacity(names.len());
310        for name in &names {
311            let tables = self.table_names_in_sheet(name);
312            let sheet = self.sheet(name)?;
313            let (rows, cols) = sheet.dimensions();
314            sheets.push(SheetMap {
315                name: name.clone(),
316                rows,
317                cols,
318                class: classify_sheet(&sheet),
319                headers: sheet.headers(),
320                tables,
321            });
322        }
323        Ok(WorkbookMap {
324            path,
325            sheets,
326            named_ranges,
327        })
328    }
329}