Skip to main content

sheetkit_core/workbook/
mod.rs

1//! Workbook file I/O: reading and writing `.xlsx` files.
2//!
3//! An `.xlsx` file is a ZIP archive containing XML parts. This module provides
4//! [`Workbook`] which holds the parsed XML structures in memory and can
5//! serialize them back to a valid `.xlsx` file.
6
7use std::collections::{HashMap, HashSet};
8use std::io::{Read as _, Write as _};
9use std::path::Path;
10use std::sync::OnceLock;
11
12use serde::Serialize;
13use sheetkit_xml::chart::ChartSpace;
14use sheetkit_xml::comments::Comments;
15use sheetkit_xml::content_types::{
16    mime_types, ContentTypeDefault, ContentTypeOverride, ContentTypes,
17};
18
/// Flavor of OOXML spreadsheet package a workbook represents.
///
/// The flavor corresponds to the content type registered for the workbook
/// part in `[Content_Types].xml`, and selects the content type string that is
/// emitted for `xl/workbook.xml` when the package is saved.
///
/// The default flavor is [`WorkbookFormat::Xlsx`].
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
pub enum WorkbookFormat {
    /// Plain spreadsheet (`.xlsx`).
    #[default]
    Xlsx,
    /// Spreadsheet with macros enabled (`.xlsm`).
    Xlsm,
    /// Spreadsheet template (`.xltx`).
    Xltx,
    /// Spreadsheet template with macros enabled (`.xltm`).
    Xltm,
    /// Add-in with macros enabled (`.xlam`).
    Xlam,
}
36
37impl WorkbookFormat {
38    /// Infer the format from a workbook content type string found in
39    /// `[Content_Types].xml`.
40    pub fn from_content_type(ct: &str) -> Option<Self> {
41        match ct {
42            mime_types::WORKBOOK => Some(Self::Xlsx),
43            mime_types::WORKBOOK_MACRO => Some(Self::Xlsm),
44            mime_types::WORKBOOK_TEMPLATE => Some(Self::Xltx),
45            mime_types::WORKBOOK_TEMPLATE_MACRO => Some(Self::Xltm),
46            mime_types::WORKBOOK_ADDIN_MACRO => Some(Self::Xlam),
47            _ => None,
48        }
49    }
50
51    /// Infer the format from a file extension (case-insensitive, without the
52    /// leading dot). Returns `None` for unrecognized extensions.
53    pub fn from_extension(ext: &str) -> Option<Self> {
54        match ext.to_ascii_lowercase().as_str() {
55            "xlsx" => Some(Self::Xlsx),
56            "xlsm" => Some(Self::Xlsm),
57            "xltx" => Some(Self::Xltx),
58            "xltm" => Some(Self::Xltm),
59            "xlam" => Some(Self::Xlam),
60            _ => None,
61        }
62    }
63
64    /// Return the OOXML content type string for this format.
65    pub fn content_type(self) -> &'static str {
66        match self {
67            Self::Xlsx => mime_types::WORKBOOK,
68            Self::Xlsm => mime_types::WORKBOOK_MACRO,
69            Self::Xltx => mime_types::WORKBOOK_TEMPLATE,
70            Self::Xltm => mime_types::WORKBOOK_TEMPLATE_MACRO,
71            Self::Xlam => mime_types::WORKBOOK_ADDIN_MACRO,
72        }
73    }
74}
75
76use sheetkit_xml::drawing::{MarkerType, WsDr};
77use sheetkit_xml::relationships::{self, rel_types, Relationship, Relationships};
78use sheetkit_xml::shared_strings::Sst;
79use sheetkit_xml::styles::StyleSheet;
80use sheetkit_xml::workbook::{WorkbookProtection, WorkbookXml};
81use sheetkit_xml::worksheet::{Cell, CellFormula, CellTypeTag, DrawingRef, Row, WorksheetXml};
82use zip::write::SimpleFileOptions;
83use zip::CompressionMethod;
84
85use crate::cell::CellValue;
86use crate::cell_ref_shift::shift_cell_references_in_text;
87use crate::chart::ChartConfig;
88use crate::comment::CommentConfig;
89use crate::conditional::ConditionalFormatRule;
90use crate::error::{Error, Result};
91use crate::image::ImageConfig;
92use crate::pivot::{PivotTableConfig, PivotTableInfo};
93use crate::protection::WorkbookProtectionConfig;
94use crate::sst::SharedStringTable;
95use crate::threaded_comment::{PersonData, PersonInput, ThreadedCommentData, ThreadedCommentInput};
96use crate::utils::cell_ref::{cell_name_to_coordinates, column_name_to_number};
97use crate::utils::constants::MAX_CELL_CHARS;
98use crate::validation::DataValidationConfig;
99use crate::workbook_paths::{
100    default_relationships, relationship_part_path, relative_relationship_target,
101    resolve_relationship_target,
102};
103
104#[path = "aux_parts.rs"]
105pub(crate) mod aux;
106mod cell_ops;
107mod data;
108mod drawing;
109mod features;
110mod io;
111pub mod open_options;
112mod sheet_ops;
113mod source;
114
115pub use open_options::{AuxParts, DateInterpretation, OpenOptions, ReadMode};
116pub(crate) use source::PackageSource;
117
118/// Helper to initialize an `OnceLock<WorksheetXml>` with a value at
119/// construction time. Avoids repeating the `set`+`unwrap` pattern.
120pub(crate) fn initialized_lock(ws: WorksheetXml) -> OnceLock<WorksheetXml> {
121    let lock = OnceLock::new();
122    let _ = lock.set(ws);
123    lock
124}
125
/// XML declaration prepended to every XML part in the package.
///
/// Declares XML version 1.0, UTF-8 encoding and `standalone="yes"`. The
/// constant carries no trailing newline.
const XML_DECLARATION: &str = r#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>"#;
128
/// In-memory representation of an OOXML spreadsheet package (`.xlsx` and the
/// other flavors described by [`WorkbookFormat`]).
///
/// Several per-sheet vectors below are documented as parallel to
/// `worksheets`: entry `i` in each of them belongs to sheet `i`.
pub struct Workbook {
    /// Package flavor; determines the content type written for
    /// `xl/workbook.xml` on save.
    format: WorkbookFormat,
    /// Parsed content type declarations.
    /// NOTE(review): presumably the `[Content_Types].xml` part — confirm.
    content_types: ContentTypes,
    /// NOTE(review): presumably the package-level relationships — confirm.
    package_rels: Relationships,
    /// Parsed workbook part. Its sheet list supplies the relationship id
    /// used to resolve each sheet's part path.
    workbook_xml: WorkbookXml,
    /// Relationships of the workbook part; searched for worksheet
    /// relationships when resolving sheet part paths.
    workbook_rels: Relationships,
    /// Per-sheet worksheet XML, stored as `(name, OnceLock<WorksheetXml>)`.
    /// When a sheet is eagerly parsed, the `OnceLock` is initialized at open
    /// time. When a sheet is deferred (Lazy/Stream mode), the lock is empty
    /// and `raw_sheet_xml[i]` holds the raw bytes; the first call to
    /// [`Workbook::worksheet_ref`] or [`Workbook::worksheet_mut`] hydrates
    /// the lock on demand. A sheet filtered out at open time is handled by
    /// `ensure_hydrated` as an initialized lock holding a placeholder
    /// `WorksheetXml` while its raw bytes are still present in
    /// `raw_sheet_xml[i]`.
    worksheets: Vec<(String, OnceLock<WorksheetXml>)>,
    /// Parsed stylesheet; consulted when deciding which cell formats are
    /// date formats for the streaming readers.
    stylesheet: StyleSheet,
    /// Shared string table, exposed read-only through [`Workbook::sst_ref`]
    /// and handed to the streaming sheet readers.
    sst_runtime: SharedStringTable,
    /// Per-sheet comments, parallel to the `worksheets` vector.
    sheet_comments: Vec<Option<Comments>>,
    /// Chart parts: (zip path like "xl/charts/chart1.xml", ChartSpace data).
    charts: Vec<(String, ChartSpace)>,
    /// Chart parts preserved as raw XML when typed parsing is not supported.
    raw_charts: Vec<(String, Vec<u8>)>,
    /// Drawing parts: (zip path like "xl/drawings/drawing1.xml", WsDr data).
    drawings: Vec<(String, WsDr)>,
    /// Image parts: (zip path like "xl/media/image1.png", raw bytes).
    images: Vec<(String, Vec<u8>)>,
    /// Maps sheet index -> drawing index in `drawings`.
    #[allow(dead_code)]
    worksheet_drawings: HashMap<usize, usize>,
    /// Per-sheet worksheet relationship files, keyed by sheet index.
    worksheet_rels: HashMap<usize, Relationships>,
    /// Per-drawing relationship files: drawing_index -> Relationships.
    drawing_rels: HashMap<usize, Relationships>,
    /// Core document properties (docProps/core.xml).
    core_properties: Option<sheetkit_xml::doc_props::CoreProperties>,
    /// Extended/application properties (docProps/app.xml).
    app_properties: Option<sheetkit_xml::doc_props::ExtendedProperties>,
    /// Custom properties (docProps/custom.xml).
    custom_properties: Option<sheetkit_xml::doc_props::CustomProperties>,
    /// Pivot table parts: (zip path, PivotTableDefinition data).
    pivot_tables: Vec<(String, sheetkit_xml::pivot_table::PivotTableDefinition)>,
    /// Pivot cache definition parts: (zip path, PivotCacheDefinition data).
    pivot_cache_defs: Vec<(String, sheetkit_xml::pivot_cache::PivotCacheDefinition)>,
    /// Pivot cache records parts: (zip path, PivotCacheRecords data).
    pivot_cache_records: Vec<(String, sheetkit_xml::pivot_cache::PivotCacheRecords)>,
    /// Raw theme XML bytes from xl/theme/theme1.xml (preserved for round-trip).
    theme_xml: Option<Vec<u8>>,
    /// Parsed theme colors from the theme XML.
    theme_colors: sheetkit_xml::theme::ThemeColors,
    /// Per-sheet sparkline configurations, parallel to the `worksheets` vector.
    sheet_sparklines: Vec<Vec<crate::sparkline::SparklineConfig>>,
    /// Per-sheet VML drawing bytes (for legacy comment rendering), parallel to `worksheets`.
    /// `None` means no VML part exists for that sheet.
    sheet_vml: Vec<Option<Vec<u8>>>,
    /// ZIP entries not recognized by the parser, preserved for round-trip fidelity.
    /// Each entry is (zip_path, raw_bytes).
    unknown_parts: Vec<(String, Vec<u8>)>,
    /// Typed index of auxiliary parts deferred during Lazy/Stream open.
    /// Stores raw bytes grouped by category (comments, charts, doc props, etc.)
    /// and supports on-demand hydration with dirty tracking.
    deferred_parts: aux::DeferredAuxParts,
    /// Raw VBA project binary blob (`xl/vbaProject.bin`), preserved for round-trip
    /// and used for VBA module extraction. `None` for non-macro workbooks.
    vba_blob: Option<Vec<u8>>,
    /// Table parts: (zip path like "xl/tables/table1.xml", TableXml data, sheet_index).
    tables: Vec<(String, sheetkit_xml::table::TableXml, usize)>,
    /// Raw XML bytes for sheets that were not parsed during open.
    /// Parallel to `worksheets`. `Some(bytes)` means the sheet XML has not
    /// been deserialized: either filtered out by the `sheets` option, or
    /// deferred in Lazy/Stream mode. The bytes are written directly on save
    /// if the corresponding `OnceLock` in `worksheets` was never initialized.
    /// `ensure_hydrated` resets the entry to `None` once it has parsed the
    /// bytes for mutable access.
    raw_sheet_xml: Vec<Option<Vec<u8>>>,
    /// Per-sheet dirty flag, parallel to `worksheets`. A sheet is marked
    /// dirty when it is mutated (via `worksheet_mut`, `set_cell_value`, etc.).
    /// Clean sheets with available raw bytes are written via passthrough on
    /// save, avoiding serialization overhead.
    sheet_dirty: Vec<bool>,
    /// Slicer definition parts: (zip path, SlicerDefinitions data).
    slicer_defs: Vec<(String, sheetkit_xml::slicer::SlicerDefinitions)>,
    /// Slicer cache definition parts: (zip path, SlicerCacheDefinition data).
    slicer_caches: Vec<(String, sheetkit_xml::slicer::SlicerCacheDefinition)>,
    /// Per-sheet threaded comments (Excel 2019+), parallel to the `worksheets` vector.
    sheet_threaded_comments: Vec<Option<sheetkit_xml::threaded_comment::ThreadedComments>>,
    /// Person list shared across all sheets (for threaded comment authors).
    person_list: sheetkit_xml::threaded_comment::PersonList,
    /// Per-sheet form control configurations, parallel to `worksheets`.
    sheet_form_controls: Vec<Vec<crate::control::FormControlConfig>>,
    /// O(1) sheet name -> index lookup cache. Must be kept in sync with
    /// `worksheets` via [`Workbook::rebuild_sheet_index`].
    sheet_name_index: HashMap<String, usize>,
    /// Streamed sheet data keyed by sheet index. During save, these sheets
    /// are written by streaming from their temp files instead of serializing
    /// the (empty placeholder) WorksheetXml.
    streamed_sheets: HashMap<usize, crate::stream::StreamedSheetData>,
    /// Backing storage for the xlsx package, retained for lazy part access.
    #[allow(dead_code)]
    package_source: Option<PackageSource>,
    /// Read mode used when this workbook was opened.
    read_mode: ReadMode,
    /// Optional row limit from `OpenOptions::sheet_rows`, applied during
    /// on-demand hydration of deferred sheets and passed to the streaming
    /// sheet readers.
    sheet_rows_limit: Option<u32>,
    /// Controls whether number cells carrying a date-like number format
    /// should be surfaced as [`CellValue::Date`](crate::cell::CellValue::Date).
    /// Sourced from [`OpenOptions::date_interpretation`].
    date_interpretation: DateInterpretation,
}
235
236impl Workbook {
    /// Return the detected or assigned workbook format.
    ///
    /// The value is returned by copy; use [`Workbook::set_format`] to change
    /// it.
    pub fn format(&self) -> WorkbookFormat {
        self.format
    }
241
    /// Set the workbook format. This determines the content type written for
    /// `xl/workbook.xml` on save.
    ///
    /// Only the stored format value is updated here; no other workbook state
    /// is touched by this call.
    pub fn set_format(&mut self, format: WorkbookFormat) {
        self.format = format;
    }
247
248    /// Get the 0-based index of a sheet by name. O(1) via HashMap.
249    pub(crate) fn sheet_index(&self, sheet: &str) -> Result<usize> {
250        self.sheet_name_index
251            .get(sheet)
252            .copied()
253            .ok_or_else(|| Error::SheetNotFound {
254                name: sheet.to_string(),
255            })
256    }
257
    /// Invalidate streamed data for a sheet by index. Must be called before
    /// any mutation to a sheet that may have been created via StreamWriter,
    /// so that the normal WorksheetXml serialization path is used on save.
    ///
    /// Removes the `streamed_sheets` entry keyed by `idx`; calling this for a
    /// sheet without streamed data is a no-op.
    pub(crate) fn invalidate_streamed(&mut self, idx: usize) {
        self.streamed_sheets.remove(&idx);
    }
264
265    /// Mark a sheet as dirty (modified). Dirty sheets are always serialized
266    /// on save, even if raw bytes exist. Clean sheets can use raw-byte
267    /// passthrough for zero-cost round-trip.
268    pub(crate) fn mark_sheet_dirty(&mut self, idx: usize) {
269        if idx < self.sheet_dirty.len() {
270            self.sheet_dirty[idx] = true;
271        }
272    }
273
274    /// Check whether a sheet has been marked dirty since opening.
275    #[cfg(test)]
276    pub(crate) fn is_sheet_dirty(&self, idx: usize) -> bool {
277        self.sheet_dirty.get(idx).copied().unwrap_or(false)
278    }
279
280    /// Get a mutable reference to the worksheet XML for the named sheet.
281    ///
282    /// If the sheet has streamed data (from [`apply_stream_writer`]), the
283    /// streamed entry is removed so that subsequent edits are not silently
284    /// ignored on save. Deferred sheets are hydrated on demand.
285    pub(crate) fn worksheet_mut(&mut self, sheet: &str) -> Result<&mut WorksheetXml> {
286        let idx = self.sheet_index(sheet)?;
287        self.invalidate_streamed(idx);
288        self.ensure_hydrated(idx)?;
289        self.mark_sheet_dirty(idx);
290        Ok(self.worksheets[idx].1.get_mut().unwrap())
291    }
292
293    /// Get an immutable reference to the worksheet XML for the named sheet.
294    /// Deferred sheets are hydrated lazily via `OnceLock`.
295    pub(crate) fn worksheet_ref(&self, sheet: &str) -> Result<&WorksheetXml> {
296        let idx = self.sheet_index(sheet)?;
297        self.worksheet_ref_by_index(idx)
298    }
299
300    /// Get an immutable reference to the worksheet XML by index.
301    /// Deferred sheets are hydrated lazily via `OnceLock`.
302    pub(crate) fn worksheet_ref_by_index(&self, idx: usize) -> Result<&WorksheetXml> {
303        if let Some(ws) = self.worksheets[idx].1.get() {
304            return Ok(ws);
305        }
306        // Hydrate from raw_sheet_xml on first access.
307        if let Some(Some(bytes)) = self.raw_sheet_xml.get(idx) {
308            let mut ws = io::deserialize_worksheet_xml(bytes)?;
309            if let Some(max_rows) = self.sheet_rows_limit {
310                ws.sheet_data.rows.truncate(max_rows as usize);
311            }
312            Ok(self.worksheets[idx].1.get_or_init(|| ws))
313        } else {
314            Err(Error::Internal(format!(
315                "sheet at index {} has no materialized or deferred data",
316                idx
317            )))
318        }
319    }
320
    /// Public immutable reference to a worksheet's XML by sheet name.
    /// Deferred sheets are hydrated lazily on first access.
    ///
    /// This is the public entry point over the crate-internal
    /// `worksheet_ref` and returns whatever that call returns, including its
    /// errors (for example an unknown sheet name).
    pub fn worksheet_xml_ref(&self, sheet: &str) -> Result<&WorksheetXml> {
        self.worksheet_ref(sheet)
    }
326
    /// Public immutable reference to the shared string table.
    ///
    /// The reference borrows from the workbook; the table is not copied.
    pub fn sst_ref(&self) -> &SharedStringTable {
        &self.sst_runtime
    }
331
332    /// Rebuild the sheet name -> index lookup after any structural change
333    /// to the worksheets vector.
334    pub(crate) fn rebuild_sheet_index(&mut self) {
335        self.sheet_name_index.clear();
336        for (i, (name, _ws_lock)) in self.worksheets.iter().enumerate() {
337            self.sheet_name_index.insert(name.clone(), i);
338        }
339    }
340
341    /// Ensure the sheet at the given index is hydrated (parsed from raw XML).
342    /// This is used by `&mut self` methods that need a mutable `OnceLock`
343    /// reference via `get_mut()`, which requires the lock to be initialized.
344    fn ensure_hydrated(&mut self, idx: usize) -> Result<()> {
345        if self.worksheets[idx].1.get().is_some() {
346            // OnceLock is set. If raw bytes are still present, this is a
347            // placeholder (filtered-out sheet with WorksheetXml::default()).
348            // Replace the placeholder with properly parsed data.
349            if let Some(Some(bytes)) = self.raw_sheet_xml.get(idx) {
350                let mut ws = io::deserialize_worksheet_xml(bytes)?;
351                if let Some(max_rows) = self.sheet_rows_limit {
352                    ws.sheet_data.rows.truncate(max_rows as usize);
353                }
354                *self.worksheets[idx].1.get_mut().unwrap() = ws;
355                self.raw_sheet_xml[idx] = None;
356            }
357            return Ok(());
358        }
359        if let Some(Some(bytes)) = self.raw_sheet_xml.get(idx) {
360            let mut ws = io::deserialize_worksheet_xml(bytes)?;
361            if let Some(max_rows) = self.sheet_rows_limit {
362                ws.sheet_data.rows.truncate(max_rows as usize);
363            }
364            let _ = self.worksheets[idx].1.set(ws);
365            self.raw_sheet_xml[idx] = None;
366            Ok(())
367        } else {
368            Err(Error::Internal(format!(
369                "sheet at index {} has no materialized or deferred data",
370                idx
371            )))
372        }
373    }
374
375    /// Hydrate if needed and return a mutable reference to the worksheet
376    /// at the given index. Callers must hold `&mut self`.
377    pub(crate) fn worksheet_mut_by_index(&mut self, idx: usize) -> Result<&mut WorksheetXml> {
378        self.ensure_hydrated(idx)?;
379        self.mark_sheet_dirty(idx);
380        Ok(self.worksheets[idx].1.get_mut().unwrap())
381    }
382
383    /// Resolve the part path for a sheet index from workbook relationships.
384    /// Falls back to the default `xl/worksheets/sheet{N}.xml` naming.
385    pub(crate) fn sheet_part_path(&self, sheet_idx: usize) -> String {
386        if let Some(sheet_entry) = self.workbook_xml.sheets.sheets.get(sheet_idx) {
387            if let Some(rel) = self
388                .workbook_rels
389                .relationships
390                .iter()
391                .find(|r| r.id == sheet_entry.r_id && r.rel_type == rel_types::WORKSHEET)
392            {
393                return resolve_relationship_target("xl/workbook.xml", &rel.target);
394            }
395        }
396        format!("xl/worksheets/sheet{}.xml", sheet_idx + 1)
397    }
398
399    /// Create a forward-only streaming reader for the named sheet.
400    ///
401    /// The reader processes worksheet XML row-by-row without materializing the
402    /// full DOM, enabling bounded-memory processing of large worksheets. The
403    /// workbook's shared string table and optional `sheet_rows` limit are
404    /// passed through to the reader.
405    ///
406    /// The XML bytes come from `raw_sheet_xml` (deferred sheets in Lazy/Stream
407    /// mode) or from a freshly hydrated worksheet serialized back to bytes.
408    pub fn open_sheet_reader(
409        &self,
410        sheet: &str,
411    ) -> Result<
412        crate::stream_reader::SheetStreamReader<'_, std::io::BufReader<std::io::Cursor<Vec<u8>>>>,
413    > {
414        let idx = self.sheet_index(sheet)?;
415        let xml_bytes = self.sheet_xml_bytes(idx)?;
416        let cursor = std::io::Cursor::new(xml_bytes);
417        let buf_reader = std::io::BufReader::new(cursor);
418        Ok(
419            crate::stream_reader::SheetStreamReader::new(buf_reader, &self.sst_runtime)
420                .row_limit(self.sheet_rows_limit)
421                .date_promotion(self.date_interpretation, self.computed_style_is_date()),
422        )
423    }
424
425    /// Create an owned forward-only streaming reader for the named sheet.
426    ///
427    /// Unlike [`open_sheet_reader`], the returned reader owns its shared
428    /// string table snapshot and XML bytes, so it has no lifetime tied to
429    /// the workbook. This is suitable for FFI contexts (e.g., napi classes)
430    /// where lifetime parameters are not supported.
431    pub fn open_sheet_reader_owned(
432        &self,
433        sheet: &str,
434    ) -> Result<crate::stream_reader::OwnedSheetStreamReader> {
435        let idx = self.sheet_index(sheet)?;
436        let xml_bytes = self.sheet_xml_bytes(idx)?;
437        let sst_snapshot = self.sst_runtime.clone_for_read();
438        Ok(
439            crate::stream_reader::OwnedSheetStreamReader::new(xml_bytes, sst_snapshot)
440                .row_limit(self.sheet_rows_limit)
441                .date_promotion(self.date_interpretation, self.computed_style_is_date()),
442        )
443    }
444
445    /// Precompute, for each cellXf, whether its number format is a date
446    /// format. Used by streaming readers to decide whether to promote
447    /// `t="n"` cells to `CellValue::Date` under
448    /// [`DateInterpretation::NumFmt`].
449    ///
450    /// Returns an empty vector when the [`DateInterpretation::CellType`]
451    /// mode is active; stream readers short-circuit on interpretation
452    /// alone before consulting the lookup, so the empty allocation is a
453    /// cheap default and the lookup work is skipped entirely.
454    fn computed_style_is_date(&self) -> Vec<bool> {
455        if matches!(self.date_interpretation, DateInterpretation::CellType) {
456            return Vec::new();
457        }
458        crate::style::compute_style_is_date(&self.stylesheet)
459    }
460
461    /// Get the raw XML bytes for a sheet by index.
462    ///
463    /// When the OnceLock is uninitialised (Lazy/Stream deferred), raw bytes
464    /// from `raw_sheet_xml` are used so the DOM is never materialised. When
465    /// the OnceLock IS initialised (Eager parse or filtered-out sheet), the
466    /// parsed worksheet is serialised back so that `sheets(...)` filtering is
467    /// respected (filtered sheets have an empty worksheet placeholder).
468    ///
469    /// The returned bytes are cloned because the `SheetStreamReader` takes
470    /// ownership of its `BufRead` source.
471    fn sheet_xml_bytes(&self, idx: usize) -> Result<Vec<u8>> {
472        // If the OnceLock is already initialised (eager parse OR filtered-out
473        // placeholder), serialise whatever is stored there. This ensures
474        // filtered-out sheets yield an empty worksheet.
475        if let Some(ws) = self.worksheets[idx].1.get() {
476            let xml = quick_xml::se::to_string(ws)
477                .map_err(|e| Error::Internal(format!("failed to serialize worksheet: {e}")))?;
478            return Ok(xml.into_bytes());
479        }
480        // Lazy/Stream deferred: OnceLock not yet initialised, use raw bytes.
481        if let Some(Some(bytes)) = self.raw_sheet_xml.get(idx) {
482            return Ok(bytes.clone());
483        }
484        Err(Error::Internal(format!(
485            "sheet at index {} has no materialized or deferred data",
486            idx
487        )))
488    }
489}