sheetkit_core/workbook/mod.rs
1//! Workbook file I/O: reading and writing `.xlsx` files.
2//!
3//! An `.xlsx` file is a ZIP archive containing XML parts. This module provides
4//! [`Workbook`] which holds the parsed XML structures in memory and can
5//! serialize them back to a valid `.xlsx` file.
6
7use std::collections::{HashMap, HashSet};
8use std::io::{Read as _, Write as _};
9use std::path::Path;
10use std::sync::OnceLock;
11
12use serde::Serialize;
13use sheetkit_xml::chart::ChartSpace;
14use sheetkit_xml::comments::Comments;
15use sheetkit_xml::content_types::{
16 mime_types, ContentTypeDefault, ContentTypeOverride, ContentTypes,
17};
18
/// The OOXML package format, determined by the workbook content type in
/// `[Content_Types].xml`. Controls which content type string is emitted for
/// `xl/workbook.xml` on save.
///
/// The enum is a plain value type: it is `Copy`, comparable, and hashable, so
/// it can be used directly as a `HashMap`/`HashSet` key. [`Default`] yields
/// [`WorkbookFormat::Xlsx`].
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, Hash)]
pub enum WorkbookFormat {
    /// Standard spreadsheet (.xlsx).
    #[default]
    Xlsx,
    /// Macro-enabled spreadsheet (.xlsm).
    Xlsm,
    /// Template (.xltx).
    Xltx,
    /// Macro-enabled template (.xltm).
    Xltm,
    /// Macro-enabled add-in (.xlam).
    Xlam,
}
36
37impl WorkbookFormat {
38 /// Infer the format from a workbook content type string found in
39 /// `[Content_Types].xml`.
40 pub fn from_content_type(ct: &str) -> Option<Self> {
41 match ct {
42 mime_types::WORKBOOK => Some(Self::Xlsx),
43 mime_types::WORKBOOK_MACRO => Some(Self::Xlsm),
44 mime_types::WORKBOOK_TEMPLATE => Some(Self::Xltx),
45 mime_types::WORKBOOK_TEMPLATE_MACRO => Some(Self::Xltm),
46 mime_types::WORKBOOK_ADDIN_MACRO => Some(Self::Xlam),
47 _ => None,
48 }
49 }
50
51 /// Infer the format from a file extension (case-insensitive, without the
52 /// leading dot). Returns `None` for unrecognized extensions.
53 pub fn from_extension(ext: &str) -> Option<Self> {
54 match ext.to_ascii_lowercase().as_str() {
55 "xlsx" => Some(Self::Xlsx),
56 "xlsm" => Some(Self::Xlsm),
57 "xltx" => Some(Self::Xltx),
58 "xltm" => Some(Self::Xltm),
59 "xlam" => Some(Self::Xlam),
60 _ => None,
61 }
62 }
63
64 /// Return the OOXML content type string for this format.
65 pub fn content_type(self) -> &'static str {
66 match self {
67 Self::Xlsx => mime_types::WORKBOOK,
68 Self::Xlsm => mime_types::WORKBOOK_MACRO,
69 Self::Xltx => mime_types::WORKBOOK_TEMPLATE,
70 Self::Xltm => mime_types::WORKBOOK_TEMPLATE_MACRO,
71 Self::Xlam => mime_types::WORKBOOK_ADDIN_MACRO,
72 }
73 }
74}
75
76use sheetkit_xml::drawing::{MarkerType, WsDr};
77use sheetkit_xml::relationships::{self, rel_types, Relationship, Relationships};
78use sheetkit_xml::shared_strings::Sst;
79use sheetkit_xml::styles::StyleSheet;
80use sheetkit_xml::workbook::{WorkbookProtection, WorkbookXml};
81use sheetkit_xml::worksheet::{Cell, CellFormula, CellTypeTag, DrawingRef, Row, WorksheetXml};
82use zip::write::SimpleFileOptions;
83use zip::CompressionMethod;
84
85use crate::cell::CellValue;
86use crate::cell_ref_shift::shift_cell_references_in_text;
87use crate::chart::ChartConfig;
88use crate::comment::CommentConfig;
89use crate::conditional::ConditionalFormatRule;
90use crate::error::{Error, Result};
91use crate::image::ImageConfig;
92use crate::pivot::{PivotTableConfig, PivotTableInfo};
93use crate::protection::WorkbookProtectionConfig;
94use crate::sst::SharedStringTable;
95use crate::threaded_comment::{PersonData, PersonInput, ThreadedCommentData, ThreadedCommentInput};
96use crate::utils::cell_ref::{cell_name_to_coordinates, column_name_to_number};
97use crate::utils::constants::MAX_CELL_CHARS;
98use crate::validation::DataValidationConfig;
99use crate::workbook_paths::{
100 default_relationships, relationship_part_path, relative_relationship_target,
101 resolve_relationship_target,
102};
103
// The `aux` module is loaded from `aux_parts.rs` instead of the default
// `aux.rs`.
// NOTE(review): presumably because `AUX` is a reserved device name on
// Windows, which makes a file called `aux.rs` problematic — confirm.
#[path = "aux_parts.rs"]
pub(crate) mod aux;
mod cell_ops;
mod data;
mod drawing;
mod features;
mod io;
pub mod open_options;
mod sheet_ops;
mod source;

// Re-exports: the open-option types are public API; `PackageSource` stays
// crate-internal.
pub use open_options::{AuxParts, DateInterpretation, OpenOptions, ReadMode};
pub(crate) use source::PackageSource;
117
118/// Helper to initialize an `OnceLock<WorksheetXml>` with a value at
119/// construction time. Avoids repeating the `set`+`unwrap` pattern.
120pub(crate) fn initialized_lock(ws: WorksheetXml) -> OnceLock<WorksheetXml> {
121 let lock = OnceLock::new();
122 let _ = lock.set(ws);
123 lock
124}
125
/// XML declaration prepended to every XML part in the package.
///
/// Declares XML 1.0, UTF-8 encoding, and `standalone="yes"`. Written as a
/// raw string so the embedded double quotes need no escaping.
const XML_DECLARATION: &str = r#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>"#;
128
/// In-memory representation of an `.xlsx` workbook.
///
/// Several per-sheet collections (`sheet_comments`, `sheet_sparklines`,
/// `sheet_vml`, `raw_sheet_xml`, `sheet_dirty`, `sheet_threaded_comments`,
/// `sheet_form_controls`) are documented as parallel to `worksheets`: entry
/// `i` of each describes the sheet stored at `worksheets[i]`.
pub struct Workbook {
    /// Package format; determines the content type written for
    /// `xl/workbook.xml` on save.
    format: WorkbookFormat,
    /// Parsed `[Content_Types].xml`.
    content_types: ContentTypes,
    /// Package-level relationships.
    package_rels: Relationships,
    /// Parsed `xl/workbook.xml`.
    workbook_xml: WorkbookXml,
    /// Relationships of the workbook part; used to resolve sheet part paths.
    workbook_rels: Relationships,
    /// Per-sheet worksheet XML, stored as `(name, OnceLock<WorksheetXml>)`.
    /// When a sheet is eagerly parsed, the `OnceLock` is initialized at open
    /// time. When a sheet is deferred (lazy mode or filtered out), the lock
    /// is empty and `raw_sheet_xml[i]` holds the raw bytes; the first call
    /// to [`worksheet_ref`] or [`worksheet_mut`] hydrates the lock on demand.
    worksheets: Vec<(String, OnceLock<WorksheetXml>)>,
    /// Parsed stylesheet.
    stylesheet: StyleSheet,
    /// Runtime shared string table.
    sst_runtime: SharedStringTable,
    /// Per-sheet comments, parallel to the `worksheets` vector.
    sheet_comments: Vec<Option<Comments>>,
    /// Chart parts: (zip path like "xl/charts/chart1.xml", ChartSpace data).
    charts: Vec<(String, ChartSpace)>,
    /// Chart parts preserved as raw XML when typed parsing is not supported.
    raw_charts: Vec<(String, Vec<u8>)>,
    /// Drawing parts: (zip path like "xl/drawings/drawing1.xml", WsDr data).
    drawings: Vec<(String, WsDr)>,
    /// Image parts: (zip path like "xl/media/image1.png", raw bytes).
    images: Vec<(String, Vec<u8>)>,
    /// Maps sheet index -> drawing index in `drawings`.
    // NOTE(review): `dead_code` is allowed here, which suggests this map is
    // populated but not currently read — confirm before relying on it.
    #[allow(dead_code)]
    worksheet_drawings: HashMap<usize, usize>,
    /// Per-sheet worksheet relationship files.
    worksheet_rels: HashMap<usize, Relationships>,
    /// Per-drawing relationship files: drawing_index -> Relationships.
    drawing_rels: HashMap<usize, Relationships>,
    /// Core document properties (docProps/core.xml).
    core_properties: Option<sheetkit_xml::doc_props::CoreProperties>,
    /// Extended/application properties (docProps/app.xml).
    app_properties: Option<sheetkit_xml::doc_props::ExtendedProperties>,
    /// Custom properties (docProps/custom.xml).
    custom_properties: Option<sheetkit_xml::doc_props::CustomProperties>,
    /// Pivot table parts: (zip path, PivotTableDefinition data).
    pivot_tables: Vec<(String, sheetkit_xml::pivot_table::PivotTableDefinition)>,
    /// Pivot cache definition parts: (zip path, PivotCacheDefinition data).
    pivot_cache_defs: Vec<(String, sheetkit_xml::pivot_cache::PivotCacheDefinition)>,
    /// Pivot cache records parts: (zip path, PivotCacheRecords data).
    pivot_cache_records: Vec<(String, sheetkit_xml::pivot_cache::PivotCacheRecords)>,
    /// Raw theme XML bytes from xl/theme/theme1.xml (preserved for round-trip).
    theme_xml: Option<Vec<u8>>,
    /// Parsed theme colors from the theme XML.
    theme_colors: sheetkit_xml::theme::ThemeColors,
    /// Per-sheet sparkline configurations, parallel to the `worksheets` vector.
    sheet_sparklines: Vec<Vec<crate::sparkline::SparklineConfig>>,
    /// Per-sheet VML drawing bytes (for legacy comment rendering), parallel to `worksheets`.
    /// `None` means no VML part exists for that sheet.
    sheet_vml: Vec<Option<Vec<u8>>>,
    /// ZIP entries not recognized by the parser, preserved for round-trip fidelity.
    /// Each entry is (zip_path, raw_bytes).
    unknown_parts: Vec<(String, Vec<u8>)>,
    /// Typed index of auxiliary parts deferred during Lazy/Stream open.
    /// Stores raw bytes grouped by category (comments, charts, doc props, etc.)
    /// and supports on-demand hydration with dirty tracking.
    deferred_parts: aux::DeferredAuxParts,
    /// Raw VBA project binary blob (`xl/vbaProject.bin`), preserved for round-trip
    /// and used for VBA module extraction. `None` for non-macro workbooks.
    vba_blob: Option<Vec<u8>>,
    /// Table parts: (zip path like "xl/tables/table1.xml", TableXml data, sheet_index).
    tables: Vec<(String, sheetkit_xml::table::TableXml, usize)>,
    /// Raw XML bytes for sheets that were not parsed during open.
    /// Parallel to `worksheets`. `Some(bytes)` means the sheet XML has not
    /// been deserialized: either filtered out by the `sheets` option, or
    /// deferred in Lazy/Stream mode. The bytes are written directly on save
    /// if the corresponding `OnceLock` in `worksheets` was never initialized.
    raw_sheet_xml: Vec<Option<Vec<u8>>>,
    /// Per-sheet dirty flag, parallel to `worksheets`. A sheet is marked
    /// dirty when it is mutated (via `worksheet_mut`, `set_cell_value`, etc.).
    /// Clean sheets with available raw bytes are written via passthrough on
    /// save, avoiding serialization overhead.
    sheet_dirty: Vec<bool>,
    /// Slicer definition parts: (zip path, SlicerDefinitions data).
    slicer_defs: Vec<(String, sheetkit_xml::slicer::SlicerDefinitions)>,
    /// Slicer cache definition parts: (zip path, SlicerCacheDefinition data).
    slicer_caches: Vec<(String, sheetkit_xml::slicer::SlicerCacheDefinition)>,
    /// Per-sheet threaded comments (Excel 2019+), parallel to the `worksheets` vector.
    sheet_threaded_comments: Vec<Option<sheetkit_xml::threaded_comment::ThreadedComments>>,
    /// Person list shared across all sheets (for threaded comment authors).
    person_list: sheetkit_xml::threaded_comment::PersonList,
    /// Per-sheet form control configurations, parallel to `worksheets`.
    sheet_form_controls: Vec<Vec<crate::control::FormControlConfig>>,
    /// O(1) sheet name -> index lookup cache. Must be kept in sync with
    /// `worksheets` via [`rebuild_sheet_index`].
    sheet_name_index: HashMap<String, usize>,
    /// Streamed sheet data keyed by sheet index. During save, these sheets
    /// are written by streaming from their temp files instead of serializing
    /// the (empty placeholder) WorksheetXml.
    streamed_sheets: HashMap<usize, crate::stream::StreamedSheetData>,
    /// Backing storage for the xlsx package, retained for lazy part access.
    // NOTE(review): `dead_code` is allowed here, which suggests the source is
    // held but not yet read back from — confirm.
    #[allow(dead_code)]
    package_source: Option<PackageSource>,
    /// Read mode used when this workbook was opened.
    read_mode: ReadMode,
    /// Optional row limit from `OpenOptions::sheet_rows`, applied during
    /// on-demand hydration of deferred sheets.
    sheet_rows_limit: Option<u32>,
    /// Controls whether number cells carrying a date-like number format
    /// should be surfaced as [`CellValue::Date`](crate::cell::CellValue::Date).
    /// Sourced from [`OpenOptions::date_interpretation`].
    date_interpretation: DateInterpretation,
}
235
236impl Workbook {
237 /// Return the detected or assigned workbook format.
238 pub fn format(&self) -> WorkbookFormat {
239 self.format
240 }
241
242 /// Set the workbook format. This determines the content type written for
243 /// `xl/workbook.xml` on save.
244 pub fn set_format(&mut self, format: WorkbookFormat) {
245 self.format = format;
246 }
247
248 /// Get the 0-based index of a sheet by name. O(1) via HashMap.
249 pub(crate) fn sheet_index(&self, sheet: &str) -> Result<usize> {
250 self.sheet_name_index
251 .get(sheet)
252 .copied()
253 .ok_or_else(|| Error::SheetNotFound {
254 name: sheet.to_string(),
255 })
256 }
257
258 /// Invalidate streamed data for a sheet by index. Must be called before
259 /// any mutation to a sheet that may have been created via StreamWriter,
260 /// so that the normal WorksheetXml serialization path is used on save.
261 pub(crate) fn invalidate_streamed(&mut self, idx: usize) {
262 self.streamed_sheets.remove(&idx);
263 }
264
265 /// Mark a sheet as dirty (modified). Dirty sheets are always serialized
266 /// on save, even if raw bytes exist. Clean sheets can use raw-byte
267 /// passthrough for zero-cost round-trip.
268 pub(crate) fn mark_sheet_dirty(&mut self, idx: usize) {
269 if idx < self.sheet_dirty.len() {
270 self.sheet_dirty[idx] = true;
271 }
272 }
273
274 /// Check whether a sheet has been marked dirty since opening.
275 #[cfg(test)]
276 pub(crate) fn is_sheet_dirty(&self, idx: usize) -> bool {
277 self.sheet_dirty.get(idx).copied().unwrap_or(false)
278 }
279
280 /// Get a mutable reference to the worksheet XML for the named sheet.
281 ///
282 /// If the sheet has streamed data (from [`apply_stream_writer`]), the
283 /// streamed entry is removed so that subsequent edits are not silently
284 /// ignored on save. Deferred sheets are hydrated on demand.
285 pub(crate) fn worksheet_mut(&mut self, sheet: &str) -> Result<&mut WorksheetXml> {
286 let idx = self.sheet_index(sheet)?;
287 self.invalidate_streamed(idx);
288 self.ensure_hydrated(idx)?;
289 self.mark_sheet_dirty(idx);
290 Ok(self.worksheets[idx].1.get_mut().unwrap())
291 }
292
293 /// Get an immutable reference to the worksheet XML for the named sheet.
294 /// Deferred sheets are hydrated lazily via `OnceLock`.
295 pub(crate) fn worksheet_ref(&self, sheet: &str) -> Result<&WorksheetXml> {
296 let idx = self.sheet_index(sheet)?;
297 self.worksheet_ref_by_index(idx)
298 }
299
300 /// Get an immutable reference to the worksheet XML by index.
301 /// Deferred sheets are hydrated lazily via `OnceLock`.
302 pub(crate) fn worksheet_ref_by_index(&self, idx: usize) -> Result<&WorksheetXml> {
303 if let Some(ws) = self.worksheets[idx].1.get() {
304 return Ok(ws);
305 }
306 // Hydrate from raw_sheet_xml on first access.
307 if let Some(Some(bytes)) = self.raw_sheet_xml.get(idx) {
308 let mut ws = io::deserialize_worksheet_xml(bytes)?;
309 if let Some(max_rows) = self.sheet_rows_limit {
310 ws.sheet_data.rows.truncate(max_rows as usize);
311 }
312 Ok(self.worksheets[idx].1.get_or_init(|| ws))
313 } else {
314 Err(Error::Internal(format!(
315 "sheet at index {} has no materialized or deferred data",
316 idx
317 )))
318 }
319 }
320
321 /// Public immutable reference to a worksheet's XML by sheet name.
322 /// Deferred sheets are hydrated lazily on first access.
323 pub fn worksheet_xml_ref(&self, sheet: &str) -> Result<&WorksheetXml> {
324 self.worksheet_ref(sheet)
325 }
326
327 /// Public immutable reference to the shared string table.
328 pub fn sst_ref(&self) -> &SharedStringTable {
329 &self.sst_runtime
330 }
331
332 /// Rebuild the sheet name -> index lookup after any structural change
333 /// to the worksheets vector.
334 pub(crate) fn rebuild_sheet_index(&mut self) {
335 self.sheet_name_index.clear();
336 for (i, (name, _ws_lock)) in self.worksheets.iter().enumerate() {
337 self.sheet_name_index.insert(name.clone(), i);
338 }
339 }
340
341 /// Ensure the sheet at the given index is hydrated (parsed from raw XML).
342 /// This is used by `&mut self` methods that need a mutable `OnceLock`
343 /// reference via `get_mut()`, which requires the lock to be initialized.
344 fn ensure_hydrated(&mut self, idx: usize) -> Result<()> {
345 if self.worksheets[idx].1.get().is_some() {
346 // OnceLock is set. If raw bytes are still present, this is a
347 // placeholder (filtered-out sheet with WorksheetXml::default()).
348 // Replace the placeholder with properly parsed data.
349 if let Some(Some(bytes)) = self.raw_sheet_xml.get(idx) {
350 let mut ws = io::deserialize_worksheet_xml(bytes)?;
351 if let Some(max_rows) = self.sheet_rows_limit {
352 ws.sheet_data.rows.truncate(max_rows as usize);
353 }
354 *self.worksheets[idx].1.get_mut().unwrap() = ws;
355 self.raw_sheet_xml[idx] = None;
356 }
357 return Ok(());
358 }
359 if let Some(Some(bytes)) = self.raw_sheet_xml.get(idx) {
360 let mut ws = io::deserialize_worksheet_xml(bytes)?;
361 if let Some(max_rows) = self.sheet_rows_limit {
362 ws.sheet_data.rows.truncate(max_rows as usize);
363 }
364 let _ = self.worksheets[idx].1.set(ws);
365 self.raw_sheet_xml[idx] = None;
366 Ok(())
367 } else {
368 Err(Error::Internal(format!(
369 "sheet at index {} has no materialized or deferred data",
370 idx
371 )))
372 }
373 }
374
375 /// Hydrate if needed and return a mutable reference to the worksheet
376 /// at the given index. Callers must hold `&mut self`.
377 pub(crate) fn worksheet_mut_by_index(&mut self, idx: usize) -> Result<&mut WorksheetXml> {
378 self.ensure_hydrated(idx)?;
379 self.mark_sheet_dirty(idx);
380 Ok(self.worksheets[idx].1.get_mut().unwrap())
381 }
382
383 /// Resolve the part path for a sheet index from workbook relationships.
384 /// Falls back to the default `xl/worksheets/sheet{N}.xml` naming.
385 pub(crate) fn sheet_part_path(&self, sheet_idx: usize) -> String {
386 if let Some(sheet_entry) = self.workbook_xml.sheets.sheets.get(sheet_idx) {
387 if let Some(rel) = self
388 .workbook_rels
389 .relationships
390 .iter()
391 .find(|r| r.id == sheet_entry.r_id && r.rel_type == rel_types::WORKSHEET)
392 {
393 return resolve_relationship_target("xl/workbook.xml", &rel.target);
394 }
395 }
396 format!("xl/worksheets/sheet{}.xml", sheet_idx + 1)
397 }
398
399 /// Create a forward-only streaming reader for the named sheet.
400 ///
401 /// The reader processes worksheet XML row-by-row without materializing the
402 /// full DOM, enabling bounded-memory processing of large worksheets. The
403 /// workbook's shared string table and optional `sheet_rows` limit are
404 /// passed through to the reader.
405 ///
406 /// The XML bytes come from `raw_sheet_xml` (deferred sheets in Lazy/Stream
407 /// mode) or from a freshly hydrated worksheet serialized back to bytes.
408 pub fn open_sheet_reader(
409 &self,
410 sheet: &str,
411 ) -> Result<
412 crate::stream_reader::SheetStreamReader<'_, std::io::BufReader<std::io::Cursor<Vec<u8>>>>,
413 > {
414 let idx = self.sheet_index(sheet)?;
415 let xml_bytes = self.sheet_xml_bytes(idx)?;
416 let cursor = std::io::Cursor::new(xml_bytes);
417 let buf_reader = std::io::BufReader::new(cursor);
418 Ok(
419 crate::stream_reader::SheetStreamReader::new(buf_reader, &self.sst_runtime)
420 .row_limit(self.sheet_rows_limit)
421 .date_promotion(self.date_interpretation, self.computed_style_is_date()),
422 )
423 }
424
425 /// Create an owned forward-only streaming reader for the named sheet.
426 ///
427 /// Unlike [`open_sheet_reader`], the returned reader owns its shared
428 /// string table snapshot and XML bytes, so it has no lifetime tied to
429 /// the workbook. This is suitable for FFI contexts (e.g., napi classes)
430 /// where lifetime parameters are not supported.
431 pub fn open_sheet_reader_owned(
432 &self,
433 sheet: &str,
434 ) -> Result<crate::stream_reader::OwnedSheetStreamReader> {
435 let idx = self.sheet_index(sheet)?;
436 let xml_bytes = self.sheet_xml_bytes(idx)?;
437 let sst_snapshot = self.sst_runtime.clone_for_read();
438 Ok(
439 crate::stream_reader::OwnedSheetStreamReader::new(xml_bytes, sst_snapshot)
440 .row_limit(self.sheet_rows_limit)
441 .date_promotion(self.date_interpretation, self.computed_style_is_date()),
442 )
443 }
444
445 /// Precompute, for each cellXf, whether its number format is a date
446 /// format. Used by streaming readers to decide whether to promote
447 /// `t="n"` cells to `CellValue::Date` under
448 /// [`DateInterpretation::NumFmt`].
449 ///
450 /// Returns an empty vector when the [`DateInterpretation::CellType`]
451 /// mode is active; stream readers short-circuit on interpretation
452 /// alone before consulting the lookup, so the empty allocation is a
453 /// cheap default and the lookup work is skipped entirely.
454 fn computed_style_is_date(&self) -> Vec<bool> {
455 if matches!(self.date_interpretation, DateInterpretation::CellType) {
456 return Vec::new();
457 }
458 crate::style::compute_style_is_date(&self.stylesheet)
459 }
460
461 /// Get the raw XML bytes for a sheet by index.
462 ///
463 /// When the OnceLock is uninitialised (Lazy/Stream deferred), raw bytes
464 /// from `raw_sheet_xml` are used so the DOM is never materialised. When
465 /// the OnceLock IS initialised (Eager parse or filtered-out sheet), the
466 /// parsed worksheet is serialised back so that `sheets(...)` filtering is
467 /// respected (filtered sheets have an empty worksheet placeholder).
468 ///
469 /// The returned bytes are cloned because the `SheetStreamReader` takes
470 /// ownership of its `BufRead` source.
471 fn sheet_xml_bytes(&self, idx: usize) -> Result<Vec<u8>> {
472 // If the OnceLock is already initialised (eager parse OR filtered-out
473 // placeholder), serialise whatever is stored there. This ensures
474 // filtered-out sheets yield an empty worksheet.
475 if let Some(ws) = self.worksheets[idx].1.get() {
476 let xml = quick_xml::se::to_string(ws)
477 .map_err(|e| Error::Internal(format!("failed to serialize worksheet: {e}")))?;
478 return Ok(xml.into_bytes());
479 }
480 // Lazy/Stream deferred: OnceLock not yet initialised, use raw bytes.
481 if let Some(Some(bytes)) = self.raw_sheet_xml.get(idx) {
482 return Ok(bytes.clone());
483 }
484 Err(Error::Internal(format!(
485 "sheet at index {} has no materialized or deferred data",
486 idx
487 )))
488 }
489}