Skip to main content

sheetkit_core/
stream_reader.rs

1//! Forward-only streaming worksheet reader.
2//!
3//! [`SheetStreamReader`] reads worksheet XML row-by-row using event-driven
4//! parsing (`quick_xml::Reader`) without materializing the full DOM. This
5//! enables processing large worksheets with bounded memory by reading rows
6//! in batches.
7//!
8//! Shared string indices are resolved through a reference to the workbook's
9//! [`SharedStringTable`]. Cell types (string, number, boolean, date, formula,
10//! error, inline string) are handled according to the OOXML specification.
11
12use std::io::BufRead;
13
14use quick_xml::events::Event;
15use quick_xml::name::QName;
16
17use crate::cell::CellValue;
18use crate::error::{Error, Result};
19use crate::sst::SharedStringTable;
20use crate::utils::cell_ref::cell_name_to_coordinates;
21use crate::workbook::open_options::DateInterpretation;
22
23/// A single row produced by the streaming reader.
24#[derive(Debug, Clone)]
25pub struct StreamRow {
26    /// 1-based row number.
27    pub row_number: u32,
28    /// Cells in this row as (1-based column index, value) pairs.
29    pub cells: Vec<(u32, CellValue)>,
30}
31
32/// Forward-only streaming reader for worksheet XML.
33///
34/// Reads rows in batches without deserializing the entire worksheet into
35/// memory. The reader borrows the shared string table for resolving string
36/// cell references.
37pub struct SheetStreamReader<'a, R: BufRead> {
38    reader: quick_xml::Reader<R>,
39    sst: &'a SharedStringTable,
40    done: bool,
41    row_limit: Option<u32>,
42    rows_emitted: u32,
43    date_interpretation: DateInterpretation,
44    style_is_date: Vec<bool>,
45}
46
47impl<'a, R: BufRead> SheetStreamReader<'a, R> {
48    /// Create a new streaming reader over the given `BufRead` source.
49    ///
50    /// `sst` is a reference to the shared string table for resolving
51    /// shared string cell values. Optional configuration (row limit, date
52    /// interpretation) is applied via the builder methods.
53    pub fn new(source: R, sst: &'a SharedStringTable) -> Self {
54        let mut reader = quick_xml::Reader::from_reader(source);
55        reader.config_mut().trim_text(false);
56        Self {
57            reader,
58            sst,
59            done: false,
60            row_limit: None,
61            rows_emitted: 0,
62            date_interpretation: DateInterpretation::default(),
63            style_is_date: Vec::new(),
64        }
65    }
66
67    /// Cap the number of rows produced by this reader. `None` means no
68    /// limit (the default).
69    pub fn row_limit(mut self, limit: Option<u32>) -> Self {
70        self.row_limit = limit;
71        self
72    }
73
74    /// Configure how date-formatted number cells are interpreted.
75    ///
76    /// `style_is_date` is a per-style boolean produced by
77    /// [`crate::style::compute_style_is_date`]; it is indexed by each
78    /// cell's `s` attribute. Empty vectors are equivalent to "no style
79    /// was a date format", which is the safe default.
80    pub fn date_promotion(
81        mut self,
82        interpretation: DateInterpretation,
83        style_is_date: Vec<bool>,
84    ) -> Self {
85        self.date_interpretation = interpretation;
86        self.style_is_date = style_is_date;
87        self
88    }
89
90    /// Returns `true` when `style_idx` refers to a style whose number
91    /// format is a date format and the reader is configured with
92    /// [`DateInterpretation::NumFmt`].
93    fn should_promote_to_date(&self, style_idx: Option<u32>) -> bool {
94        if !matches!(self.date_interpretation, DateInterpretation::NumFmt) {
95            return false;
96        }
97        let idx = match style_idx {
98            Some(i) => i as usize,
99            None => return false,
100        };
101        self.style_is_date.get(idx).copied().unwrap_or(false)
102    }
103
104    /// Read the next batch of rows. Returns an empty `Vec` when there are no
105    /// more rows to read.
106    pub fn next_batch(&mut self, batch_size: usize) -> Result<Vec<StreamRow>> {
107        if self.done {
108            return Ok(Vec::new());
109        }
110
111        let mut rows = Vec::with_capacity(batch_size);
112        let mut buf = Vec::with_capacity(4096);
113
114        loop {
115            if rows.len() >= batch_size {
116                break;
117            }
118            if let Some(limit) = self.row_limit {
119                if self.rows_emitted >= limit {
120                    self.done = true;
121                    break;
122                }
123            }
124
125            buf.clear();
126            match self
127                .reader
128                .read_event_into(&mut buf)
129                .map_err(|e| Error::XmlParse(e.to_string()))?
130            {
131                Event::Start(ref e) if e.name() == QName(b"row") => {
132                    let row_number = extract_row_number(e)?;
133                    let row = self.parse_row_body(row_number)?;
134                    self.rows_emitted += 1;
135                    if !row.cells.is_empty() {
136                        rows.push(row);
137                    }
138                }
139                Event::Eof => {
140                    self.done = true;
141                    break;
142                }
143                _ => {}
144            }
145        }
146
147        Ok(rows)
148    }
149
150    /// Returns `true` if there are potentially more rows to read.
151    pub fn has_more(&self) -> bool {
152        !self.done
153    }
154
155    /// Close the reader and release resources.
156    pub fn close(self) {
157        drop(self);
158    }
159
160    /// Parse the body of a `<row>` element (its child `<c>` elements) after
161    /// the row number has been extracted from the opening tag.
162    fn parse_row_body(&mut self, row_number: u32) -> Result<StreamRow> {
163        let mut cells = Vec::new();
164        let mut buf = Vec::with_capacity(1024);
165
166        loop {
167            buf.clear();
168            match self
169                .reader
170                .read_event_into(&mut buf)
171                .map_err(|e| Error::XmlParse(e.to_string()))?
172            {
173                Event::Start(ref e) if e.name() == QName(b"c") => {
174                    let (col, cell_type, style_idx) = extract_cell_attrs(e)?;
175                    if let Some(col) = col {
176                        let promote = self.should_promote_to_date(style_idx);
177                        let cv = self.parse_cell_body(cell_type.as_deref(), promote)?;
178                        cells.push((col, cv));
179                    } else {
180                        self.skip_to_end_of(b"c")?;
181                    }
182                }
183                Event::Empty(ref e) if e.name() == QName(b"c") => {
184                    let (col, cell_type, style_idx) = extract_cell_attrs(e)?;
185                    if let Some(col) = col {
186                        let promote = self.should_promote_to_date(style_idx);
187                        let cv = resolve_cell_value(
188                            self.sst,
189                            cell_type.as_deref(),
190                            None,
191                            None,
192                            None,
193                            promote,
194                        )?;
195                        cells.push((col, cv));
196                    }
197                }
198                Event::End(ref e) if e.name() == QName(b"row") => break,
199                Event::Eof => {
200                    self.done = true;
201                    break;
202                }
203                _ => {}
204            }
205        }
206
207        Ok(StreamRow { row_number, cells })
208    }
209
210    /// Parse the body of a `<c>` element (its child `<v>`, `<f>`, `<is>`
211    /// elements) after the cell attributes have been extracted.
212    fn parse_cell_body(
213        &mut self,
214        cell_type: Option<&str>,
215        promote_to_date: bool,
216    ) -> Result<CellValue> {
217        let mut value_text: Option<String> = None;
218        let mut formula_text: Option<String> = None;
219        let mut inline_string: Option<String> = None;
220        let mut buf = Vec::with_capacity(512);
221        let mut in_is = false;
222
223        loop {
224            buf.clear();
225            match self
226                .reader
227                .read_event_into(&mut buf)
228                .map_err(|e| Error::XmlParse(e.to_string()))?
229            {
230                Event::Start(ref e) => {
231                    let local = e.local_name();
232                    if local.as_ref() == b"v" {
233                        value_text = Some(self.read_text_content(b"v")?);
234                    } else if local.as_ref() == b"f" {
235                        formula_text = Some(self.read_text_content(b"f")?);
236                    } else if local.as_ref() == b"is" {
237                        in_is = true;
238                        inline_string = Some(String::new());
239                    } else if local.as_ref() == b"t" && in_is {
240                        let t = self.read_text_content(b"t")?;
241                        if let Some(ref mut is) = inline_string {
242                            is.push_str(&t);
243                        }
244                    }
245                }
246                Event::End(ref e) => {
247                    let local = e.local_name();
248                    if local.as_ref() == b"c" {
249                        break;
250                    }
251                    if local.as_ref() == b"is" {
252                        in_is = false;
253                    }
254                }
255                Event::Eof => {
256                    self.done = true;
257                    break;
258                }
259                _ => {}
260            }
261        }
262
263        resolve_cell_value(
264            self.sst,
265            cell_type,
266            value_text.as_deref(),
267            formula_text,
268            inline_string,
269            promote_to_date,
270        )
271    }
272
273    /// Read text content between an opening tag and its matching closing tag.
274    fn read_text_content(&mut self, end_tag: &[u8]) -> Result<String> {
275        let mut text = String::new();
276        let mut buf = Vec::with_capacity(256);
277        loop {
278            buf.clear();
279            match self
280                .reader
281                .read_event_into(&mut buf)
282                .map_err(|e| Error::XmlParse(e.to_string()))?
283            {
284                Event::Text(ref e) => {
285                    let decoded = e.unescape().map_err(|e| Error::XmlParse(e.to_string()))?;
286                    text.push_str(&decoded);
287                }
288                Event::End(ref e) if e.local_name().as_ref() == end_tag => break,
289                Event::Eof => {
290                    self.done = true;
291                    break;
292                }
293                _ => {}
294            }
295        }
296        Ok(text)
297    }
298
299    /// Skip all events until the matching end tag for the given element.
300    fn skip_to_end_of(&mut self, tag: &[u8]) -> Result<()> {
301        let mut buf = Vec::with_capacity(256);
302        let mut depth: u32 = 1;
303        loop {
304            buf.clear();
305            match self
306                .reader
307                .read_event_into(&mut buf)
308                .map_err(|e| Error::XmlParse(e.to_string()))?
309            {
310                Event::Start(ref e) if e.local_name().as_ref() == tag => {
311                    depth += 1;
312                }
313                Event::End(ref e) if e.local_name().as_ref() == tag => {
314                    depth -= 1;
315                    if depth == 0 {
316                        break;
317                    }
318                }
319                Event::Eof => {
320                    self.done = true;
321                    break;
322                }
323                _ => {}
324            }
325        }
326        Ok(())
327    }
328}
329
330/// Owning variant of [`SheetStreamReader`] for use in FFI contexts where
331/// lifetime parameters are not supported (e.g., napi classes).
332///
333/// Stores its own copy of the shared string table and the XML byte source,
334/// avoiding any borrowed references. The parsing logic delegates to the same
335/// free functions used by the borrowed reader.
336pub struct OwnedSheetStreamReader {
337    reader: quick_xml::Reader<std::io::BufReader<std::io::Cursor<Vec<u8>>>>,
338    sst: SharedStringTable,
339    done: bool,
340    row_limit: Option<u32>,
341    rows_emitted: u32,
342    date_interpretation: DateInterpretation,
343    style_is_date: Vec<bool>,
344}
345
346impl OwnedSheetStreamReader {
347    /// Create a new owned streaming reader.
348    ///
349    /// `xml_bytes` is the raw worksheet XML. `sst` is a read-only clone of
350    /// the shared string table. Configuration is applied via builder
351    /// methods ([`Self::row_limit`], [`Self::date_promotion`]).
352    pub fn new(xml_bytes: Vec<u8>, sst: SharedStringTable) -> Self {
353        let cursor = std::io::Cursor::new(xml_bytes);
354        let buf_reader = std::io::BufReader::new(cursor);
355        let mut reader = quick_xml::Reader::from_reader(buf_reader);
356        reader.config_mut().trim_text(false);
357        Self {
358            reader,
359            sst,
360            done: false,
361            row_limit: None,
362            rows_emitted: 0,
363            date_interpretation: DateInterpretation::default(),
364            style_is_date: Vec::new(),
365        }
366    }
367
368    /// Cap the number of rows produced by this reader. `None` means no
369    /// limit (the default).
370    pub fn row_limit(mut self, limit: Option<u32>) -> Self {
371        self.row_limit = limit;
372        self
373    }
374
375    /// Configure how date-formatted number cells are interpreted.
376    /// See [`SheetStreamReader::date_promotion`] for details.
377    pub fn date_promotion(
378        mut self,
379        interpretation: DateInterpretation,
380        style_is_date: Vec<bool>,
381    ) -> Self {
382        self.date_interpretation = interpretation;
383        self.style_is_date = style_is_date;
384        self
385    }
386
387    fn should_promote_to_date(&self, style_idx: Option<u32>) -> bool {
388        if !matches!(self.date_interpretation, DateInterpretation::NumFmt) {
389            return false;
390        }
391        let idx = match style_idx {
392            Some(i) => i as usize,
393            None => return false,
394        };
395        self.style_is_date.get(idx).copied().unwrap_or(false)
396    }
397
398    /// Read the next batch of rows. Returns an empty `Vec` when there are no
399    /// more rows to read.
400    pub fn next_batch(&mut self, batch_size: usize) -> Result<Vec<StreamRow>> {
401        if self.done {
402            return Ok(Vec::new());
403        }
404
405        let mut rows = Vec::with_capacity(batch_size);
406        let mut buf = Vec::with_capacity(4096);
407
408        loop {
409            if rows.len() >= batch_size {
410                break;
411            }
412            if let Some(limit) = self.row_limit {
413                if self.rows_emitted >= limit {
414                    self.done = true;
415                    break;
416                }
417            }
418
419            buf.clear();
420            match self
421                .reader
422                .read_event_into(&mut buf)
423                .map_err(|e| Error::XmlParse(e.to_string()))?
424            {
425                Event::Start(ref e) if e.name() == QName(b"row") => {
426                    let row_number = extract_row_number(e)?;
427                    let row = self.parse_row_body(row_number)?;
428                    self.rows_emitted += 1;
429                    if !row.cells.is_empty() {
430                        rows.push(row);
431                    }
432                }
433                Event::Eof => {
434                    self.done = true;
435                    break;
436                }
437                _ => {}
438            }
439        }
440
441        Ok(rows)
442    }
443
444    /// Returns `true` if there are potentially more rows to read.
445    pub fn has_more(&self) -> bool {
446        !self.done
447    }
448
449    /// Close the reader and release resources.
450    pub fn close(self) {
451        drop(self);
452    }
453
454    fn parse_row_body(&mut self, row_number: u32) -> Result<StreamRow> {
455        let mut cells = Vec::new();
456        let mut buf = Vec::with_capacity(1024);
457
458        loop {
459            buf.clear();
460            match self
461                .reader
462                .read_event_into(&mut buf)
463                .map_err(|e| Error::XmlParse(e.to_string()))?
464            {
465                Event::Start(ref e) if e.name() == QName(b"c") => {
466                    let (col, cell_type, style_idx) = extract_cell_attrs(e)?;
467                    if let Some(col) = col {
468                        let promote = self.should_promote_to_date(style_idx);
469                        let cv = self.parse_cell_body(cell_type.as_deref(), promote)?;
470                        cells.push((col, cv));
471                    } else {
472                        self.skip_to_end_of(b"c")?;
473                    }
474                }
475                Event::Empty(ref e) if e.name() == QName(b"c") => {
476                    let (col, cell_type, style_idx) = extract_cell_attrs(e)?;
477                    if let Some(col) = col {
478                        let promote = self.should_promote_to_date(style_idx);
479                        let cv = resolve_cell_value(
480                            &self.sst,
481                            cell_type.as_deref(),
482                            None,
483                            None,
484                            None,
485                            promote,
486                        )?;
487                        cells.push((col, cv));
488                    }
489                }
490                Event::End(ref e) if e.name() == QName(b"row") => break,
491                Event::Eof => {
492                    self.done = true;
493                    break;
494                }
495                _ => {}
496            }
497        }
498
499        Ok(StreamRow { row_number, cells })
500    }
501
502    fn parse_cell_body(
503        &mut self,
504        cell_type: Option<&str>,
505        promote_to_date: bool,
506    ) -> Result<CellValue> {
507        let mut value_text: Option<String> = None;
508        let mut formula_text: Option<String> = None;
509        let mut inline_string: Option<String> = None;
510        let mut buf = Vec::with_capacity(512);
511        let mut in_is = false;
512
513        loop {
514            buf.clear();
515            match self
516                .reader
517                .read_event_into(&mut buf)
518                .map_err(|e| Error::XmlParse(e.to_string()))?
519            {
520                Event::Start(ref e) => {
521                    let local = e.local_name();
522                    if local.as_ref() == b"v" {
523                        value_text = Some(self.read_text_content(b"v")?);
524                    } else if local.as_ref() == b"f" {
525                        formula_text = Some(self.read_text_content(b"f")?);
526                    } else if local.as_ref() == b"is" {
527                        in_is = true;
528                        inline_string = Some(String::new());
529                    } else if local.as_ref() == b"t" && in_is {
530                        let t = self.read_text_content(b"t")?;
531                        if let Some(ref mut is) = inline_string {
532                            is.push_str(&t);
533                        }
534                    }
535                }
536                Event::End(ref e) => {
537                    let local = e.local_name();
538                    if local.as_ref() == b"c" {
539                        break;
540                    }
541                    if local.as_ref() == b"is" {
542                        in_is = false;
543                    }
544                }
545                Event::Eof => {
546                    self.done = true;
547                    break;
548                }
549                _ => {}
550            }
551        }
552
553        resolve_cell_value(
554            &self.sst,
555            cell_type,
556            value_text.as_deref(),
557            formula_text,
558            inline_string,
559            promote_to_date,
560        )
561    }
562
563    fn read_text_content(&mut self, end_tag: &[u8]) -> Result<String> {
564        let mut text = String::new();
565        let mut buf = Vec::with_capacity(256);
566        loop {
567            buf.clear();
568            match self
569                .reader
570                .read_event_into(&mut buf)
571                .map_err(|e| Error::XmlParse(e.to_string()))?
572            {
573                Event::Text(ref e) => {
574                    let decoded = e.unescape().map_err(|e| Error::XmlParse(e.to_string()))?;
575                    text.push_str(&decoded);
576                }
577                Event::End(ref e) if e.local_name().as_ref() == end_tag => break,
578                Event::Eof => {
579                    self.done = true;
580                    break;
581                }
582                _ => {}
583            }
584        }
585        Ok(text)
586    }
587
588    fn skip_to_end_of(&mut self, tag: &[u8]) -> Result<()> {
589        let mut buf = Vec::with_capacity(256);
590        let mut depth: u32 = 1;
591        loop {
592            buf.clear();
593            match self
594                .reader
595                .read_event_into(&mut buf)
596                .map_err(|e| Error::XmlParse(e.to_string()))?
597            {
598                Event::Start(ref e) if e.local_name().as_ref() == tag => {
599                    depth += 1;
600                }
601                Event::End(ref e) if e.local_name().as_ref() == tag => {
602                    depth -= 1;
603                    if depth == 0 {
604                        break;
605                    }
606                }
607                Event::Eof => {
608                    self.done = true;
609                    break;
610                }
611                _ => {}
612            }
613        }
614        Ok(())
615    }
616}
617
618/// Extract the `r` (row number) attribute from a `<row>` element.
619fn extract_row_number(start: &quick_xml::events::BytesStart<'_>) -> Result<u32> {
620    for attr in start.attributes().flatten() {
621        if attr.key == QName(b"r") {
622            let val =
623                std::str::from_utf8(&attr.value).map_err(|e| Error::XmlParse(e.to_string()))?;
624            return val
625                .parse::<u32>()
626                .map_err(|e| Error::XmlParse(format!("invalid row number: {e}")));
627        }
628    }
629    Err(Error::XmlParse(
630        "row element missing r attribute".to_string(),
631    ))
632}
633
634/// Extract the cell reference (column index), type attribute, and style
635/// index from a `<c>` element.
636fn extract_cell_attrs(
637    start: &quick_xml::events::BytesStart<'_>,
638) -> Result<(Option<u32>, Option<String>, Option<u32>)> {
639    let mut cell_ref: Option<String> = None;
640    let mut cell_type: Option<String> = None;
641    let mut style_idx: Option<u32> = None;
642
643    for attr in start.attributes().flatten() {
644        match attr.key {
645            QName(b"r") => {
646                cell_ref = Some(
647                    std::str::from_utf8(&attr.value)
648                        .map_err(|e| Error::XmlParse(e.to_string()))?
649                        .to_string(),
650                );
651            }
652            QName(b"t") => {
653                cell_type = Some(
654                    std::str::from_utf8(&attr.value)
655                        .map_err(|e| Error::XmlParse(e.to_string()))?
656                        .to_string(),
657                );
658            }
659            QName(b"s") => {
660                let raw =
661                    std::str::from_utf8(&attr.value).map_err(|e| Error::XmlParse(e.to_string()))?;
662                style_idx = raw.parse::<u32>().ok();
663            }
664            _ => {}
665        }
666    }
667
668    let col = match &cell_ref {
669        Some(r) => Some(cell_name_to_coordinates(r)?.0),
670        None => None,
671    };
672
673    Ok((col, cell_type, style_idx))
674}
675
676/// Resolve cell type, value text, formula, and inline string into a `CellValue`.
677///
678/// `promote_to_date` is honored only for non-formula numeric cells (`t="n"`
679/// or untyped). When true, such cells are returned as [`CellValue::Date`]
680/// instead of [`CellValue::Number`], used by the
681/// [`DateInterpretation::NumFmt`] path of the streaming reader.
682fn resolve_cell_value(
683    sst: &SharedStringTable,
684    cell_type: Option<&str>,
685    value_text: Option<&str>,
686    formula_text: Option<String>,
687    inline_string: Option<String>,
688    promote_to_date: bool,
689) -> Result<CellValue> {
690    if let Some(formula) = formula_text {
691        let cached = match (cell_type, value_text) {
692            (Some("b"), Some(v)) => Some(Box::new(CellValue::Bool(v == "1"))),
693            (Some("e"), Some(v)) => Some(Box::new(CellValue::Error(v.to_string()))),
694            (Some("str"), Some(v)) => Some(Box::new(CellValue::String(v.to_string()))),
695            (_, Some(v)) => v
696                .parse::<f64>()
697                .ok()
698                .map(|n| Box::new(CellValue::Number(n))),
699            _ => None,
700        };
701        return Ok(CellValue::Formula {
702            expr: formula,
703            result: cached,
704        });
705    }
706
707    match (cell_type, value_text) {
708        (Some("s"), Some(v)) => {
709            let idx: usize = v
710                .parse()
711                .map_err(|_| Error::Internal(format!("invalid SST index: {v}")))?;
712            let s = sst
713                .get(idx)
714                .ok_or_else(|| Error::Internal(format!("SST index {idx} out of bounds")))?;
715            Ok(CellValue::String(s.to_string()))
716        }
717        (Some("b"), Some(v)) => Ok(CellValue::Bool(v == "1")),
718        (Some("e"), Some(v)) => Ok(CellValue::Error(v.to_string())),
719        (Some("inlineStr"), _) => Ok(CellValue::String(inline_string.unwrap_or_default())),
720        (Some("str"), Some(v)) => Ok(CellValue::String(v.to_string())),
721        (Some("d"), Some(v)) => {
722            let n: f64 = v
723                .parse()
724                .map_err(|_| Error::Internal(format!("invalid date value: {v}")))?;
725            Ok(CellValue::Date(n))
726        }
727        (Some("n") | None, Some(v)) => {
728            let n: f64 = v
729                .parse()
730                .map_err(|_| Error::Internal(format!("invalid number: {v}")))?;
731            if promote_to_date {
732                Ok(CellValue::Date(n))
733            } else {
734                Ok(CellValue::Number(n))
735            }
736        }
737        _ => Ok(CellValue::Empty),
738    }
739}
740
741#[cfg(test)]
742mod tests {
743    use super::*;
744    use std::io::Cursor;
745
746    fn make_sst(strings: &[&str]) -> SharedStringTable {
747        let mut sst = SharedStringTable::new();
748        for s in strings {
749            sst.add(s);
750        }
751        sst
752    }
753
754    fn worksheet_xml(sheet_data: &str) -> String {
755        format!(
756            r#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
757<worksheet xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main">
758<sheetData>
759{sheet_data}
760</sheetData>
761</worksheet>"#
762        )
763    }
764
765    fn read_all(xml: &str, sst: &SharedStringTable, row_limit: Option<u32>) -> Vec<StreamRow> {
766        let cursor = Cursor::new(xml.as_bytes().to_vec());
767        let mut reader = SheetStreamReader::new(cursor, sst).row_limit(row_limit);
768        let mut all = Vec::new();
769        loop {
770            let batch = reader.next_batch(100).unwrap();
771            if batch.is_empty() {
772                break;
773            }
774            all.extend(batch);
775        }
776        all
777    }
778
779    #[test]
780    fn test_basic_batch_reading() {
781        let sst = make_sst(&["Name", "Age"]);
782        let xml = worksheet_xml(
783            r#"
784<row r="1"><c r="A1" t="s"><v>0</v></c><c r="B1" t="s"><v>1</v></c></row>
785<row r="2"><c r="A2" t="s"><v>0</v></c><c r="B2"><v>30</v></c></row>
786<row r="3"><c r="A3" t="s"><v>0</v></c><c r="B3"><v>25</v></c></row>
787"#,
788        );
789
790        let cursor = Cursor::new(xml.as_bytes().to_vec());
791        let mut reader = SheetStreamReader::new(cursor, &sst);
792
793        let batch1 = reader.next_batch(2).unwrap();
794        assert_eq!(batch1.len(), 2);
795        assert!(reader.has_more());
796
797        let batch2 = reader.next_batch(2).unwrap();
798        assert_eq!(batch2.len(), 1);
799
800        let batch3 = reader.next_batch(2).unwrap();
801        assert!(batch3.is_empty());
802        assert!(!reader.has_more());
803    }
804
805    #[test]
806    fn test_sparse_rows() {
807        let sst = SharedStringTable::new();
808        let xml = worksheet_xml(
809            r#"
810<row r="1"><c r="A1"><v>1</v></c></row>
811<row r="5"><c r="C5"><v>5</v></c></row>
812<row r="100"><c r="A100"><v>100</v></c></row>
813"#,
814        );
815
816        let rows = read_all(&xml, &sst, None);
817        assert_eq!(rows.len(), 3);
818        assert_eq!(rows[0].row_number, 1);
819        assert_eq!(rows[1].row_number, 5);
820        assert_eq!(rows[1].cells[0].0, 3);
821        assert_eq!(rows[2].row_number, 100);
822    }
823
824    #[test]
825    fn test_all_cell_types() {
826        let sst = make_sst(&["Hello"]);
827        let xml = worksheet_xml(
828            r#"
829<row r="1">
830  <c r="A1" t="s"><v>0</v></c>
831  <c r="B1"><v>42.5</v></c>
832  <c r="C1" t="b"><v>1</v></c>
833  <c r="D1" t="e"><v>#DIV/0!</v></c>
834  <c r="E1" t="inlineStr"><is><t>Inline</t></is></c>
835  <c r="F1" t="n"><v>99</v></c>
836  <c r="G1" t="d"><v>45000</v></c>
837</row>
838"#,
839        );
840
841        let rows = read_all(&xml, &sst, None);
842        assert_eq!(rows.len(), 1);
843        let cells = &rows[0].cells;
844
845        assert_eq!(cells[0], (1, CellValue::String("Hello".to_string())));
846        assert_eq!(cells[1], (2, CellValue::Number(42.5)));
847        assert_eq!(cells[2], (3, CellValue::Bool(true)));
848        assert_eq!(cells[3], (4, CellValue::Error("#DIV/0!".to_string())));
849        assert_eq!(cells[4], (5, CellValue::String("Inline".to_string())));
850        assert_eq!(cells[5], (6, CellValue::Number(99.0)));
851        assert_eq!(cells[6], (7, CellValue::Date(45000.0)));
852    }
853
854    #[test]
855    fn test_boolean_false() {
856        let sst = SharedStringTable::new();
857        let xml = worksheet_xml(r#"<row r="1"><c r="A1" t="b"><v>0</v></c></row>"#);
858        let rows = read_all(&xml, &sst, None);
859        assert_eq!(rows[0].cells[0].1, CellValue::Bool(false));
860    }
861
862    #[test]
863    fn test_shared_string_resolution() {
864        let sst = make_sst(&["First", "Second", "Third"]);
865        let xml = worksheet_xml(
866            r#"
867<row r="1">
868  <c r="A1" t="s"><v>0</v></c>
869  <c r="B1" t="s"><v>1</v></c>
870  <c r="C1" t="s"><v>2</v></c>
871</row>
872"#,
873        );
874
875        let rows = read_all(&xml, &sst, None);
876        assert_eq!(rows[0].cells[0].1, CellValue::String("First".to_string()));
877        assert_eq!(rows[0].cells[1].1, CellValue::String("Second".to_string()));
878        assert_eq!(rows[0].cells[2].1, CellValue::String("Third".to_string()));
879    }
880
881    #[test]
882    fn test_shared_string_out_of_bounds() {
883        let sst = make_sst(&["Only"]);
884        let xml = worksheet_xml(r#"<row r="1"><c r="A1" t="s"><v>999</v></c></row>"#);
885
886        let cursor = Cursor::new(xml.as_bytes().to_vec());
887        let mut reader = SheetStreamReader::new(cursor, &sst);
888        let result = reader.next_batch(10);
889        assert!(result.is_err());
890    }
891
892    #[test]
893    fn test_row_limit() {
894        let sst = SharedStringTable::new();
895        let xml = worksheet_xml(
896            r#"
897<row r="1"><c r="A1"><v>1</v></c></row>
898<row r="2"><c r="A2"><v>2</v></c></row>
899<row r="3"><c r="A3"><v>3</v></c></row>
900<row r="4"><c r="A4"><v>4</v></c></row>
901<row r="5"><c r="A5"><v>5</v></c></row>
902"#,
903        );
904
905        let rows = read_all(&xml, &sst, Some(3));
906        assert_eq!(rows.len(), 3);
907        assert_eq!(rows[0].row_number, 1);
908        assert_eq!(rows[2].row_number, 3);
909    }
910
911    #[test]
912    fn test_row_limit_zero() {
913        let sst = SharedStringTable::new();
914        let xml = worksheet_xml(r#"<row r="1"><c r="A1"><v>1</v></c></row>"#);
915
916        let rows = read_all(&xml, &sst, Some(0));
917        assert!(rows.is_empty());
918    }
919
920    #[test]
921    fn test_empty_sheet() {
922        let sst = SharedStringTable::new();
923        let xml = worksheet_xml("");
924
925        let rows = read_all(&xml, &sst, None);
926        assert!(rows.is_empty());
927    }
928
929    #[test]
930    fn test_empty_rows_are_skipped() {
931        let sst = SharedStringTable::new();
932        let xml = worksheet_xml(
933            r#"
934<row r="1"></row>
935<row r="2"><c r="A2"><v>42</v></c></row>
936<row r="3"></row>
937"#,
938        );
939
940        let rows = read_all(&xml, &sst, None);
941        assert_eq!(rows.len(), 1);
942        assert_eq!(rows[0].row_number, 2);
943    }
944
945    #[test]
946    fn test_empty_rows_count_against_limit() {
947        let sst = SharedStringTable::new();
948        let xml = worksheet_xml(
949            r#"
950<row r="1"></row>
951<row r="2"></row>
952<row r="3"><c r="A3"><v>3</v></c></row>
953<row r="4"><c r="A4"><v>4</v></c></row>
954"#,
955        );
956
957        let rows = read_all(&xml, &sst, Some(2));
958        assert!(
959            rows.is_empty(),
960            "with limit=2 and 2 empty rows, no data rows should be returned"
961        );
962
963        let rows2 = read_all(&xml, &sst, Some(3));
964        assert_eq!(rows2.len(), 1);
965        assert_eq!(rows2[0].row_number, 3);
966    }
967
968    #[test]
969    fn test_formula_with_cached_number() {
970        let sst = SharedStringTable::new();
971        let xml = worksheet_xml(r#"<row r="1"><c r="A1"><f>SUM(B1:B10)</f><v>42</v></c></row>"#);
972
973        let rows = read_all(&xml, &sst, None);
974        match &rows[0].cells[0].1 {
975            CellValue::Formula { expr, result } => {
976                assert_eq!(expr, "SUM(B1:B10)");
977                assert_eq!(result.as_deref(), Some(&CellValue::Number(42.0)));
978            }
979            other => panic!("expected Formula, got {:?}", other),
980        }
981    }
982
983    #[test]
984    fn test_formula_with_cached_string() {
985        let sst = SharedStringTable::new();
986        let xml = worksheet_xml(
987            r#"<row r="1"><c r="A1" t="str"><f>CONCAT("a","b")</f><v>ab</v></c></row>"#,
988        );
989
990        let rows = read_all(&xml, &sst, None);
991        match &rows[0].cells[0].1 {
992            CellValue::Formula { expr, result } => {
993                assert_eq!(expr, r#"CONCAT("a","b")"#);
994                assert_eq!(
995                    result.as_deref(),
996                    Some(&CellValue::String("ab".to_string()))
997                );
998            }
999            other => panic!("expected Formula, got {:?}", other),
1000        }
1001    }
1002
1003    #[test]
1004    fn test_formula_with_cached_boolean() {
1005        let sst = SharedStringTable::new();
1006        let xml = worksheet_xml(r#"<row r="1"><c r="A1" t="b"><f>TRUE()</f><v>1</v></c></row>"#);
1007
1008        let rows = read_all(&xml, &sst, None);
1009        match &rows[0].cells[0].1 {
1010            CellValue::Formula { expr, result } => {
1011                assert_eq!(expr, "TRUE()");
1012                assert_eq!(result.as_deref(), Some(&CellValue::Bool(true)));
1013            }
1014            other => panic!("expected Formula, got {:?}", other),
1015        }
1016    }
1017
1018    #[test]
1019    fn test_formula_with_cached_error() {
1020        let sst = SharedStringTable::new();
1021        let xml = worksheet_xml(r#"<row r="1"><c r="A1" t="e"><f>1/0</f><v>#DIV/0!</v></c></row>"#);
1022
1023        let rows = read_all(&xml, &sst, None);
1024        match &rows[0].cells[0].1 {
1025            CellValue::Formula { expr, result } => {
1026                assert_eq!(expr, "1/0");
1027                assert_eq!(
1028                    result.as_deref(),
1029                    Some(&CellValue::Error("#DIV/0!".to_string()))
1030                );
1031            }
1032            other => panic!("expected Formula, got {:?}", other),
1033        }
1034    }
1035
1036    #[test]
1037    fn test_formula_without_cached_value() {
1038        let sst = SharedStringTable::new();
1039        let xml = worksheet_xml(r#"<row r="1"><c r="A1"><f>A2+A3</f></c></row>"#);
1040
1041        let rows = read_all(&xml, &sst, None);
1042        match &rows[0].cells[0].1 {
1043            CellValue::Formula { expr, result } => {
1044                assert_eq!(expr, "A2+A3");
1045                assert!(result.is_none());
1046            }
1047            other => panic!("expected Formula, got {:?}", other),
1048        }
1049    }
1050
1051    #[test]
1052    fn test_inline_string_with_rich_text_runs() {
1053        let sst = SharedStringTable::new();
1054        let xml = worksheet_xml(
1055            r#"<row r="1"><c r="A1" t="inlineStr"><is><r><t>Bold</t></r><r><t> Normal</t></r></is></c></row>"#,
1056        );
1057
1058        let rows = read_all(&xml, &sst, None);
1059        assert_eq!(
1060            rows[0].cells[0].1,
1061            CellValue::String("Bold Normal".to_string())
1062        );
1063    }
1064
1065    #[test]
1066    fn test_reader_close() {
1067        let sst = SharedStringTable::new();
1068        let xml = worksheet_xml(r#"<row r="1"><c r="A1"><v>1</v></c></row>"#);
1069        let cursor = Cursor::new(xml.as_bytes().to_vec());
1070        let reader = SheetStreamReader::new(cursor, &sst);
1071        reader.close();
1072    }
1073
1074    #[test]
1075    fn test_reader_drop_without_reading_all() {
1076        let sst = SharedStringTable::new();
1077        let xml = worksheet_xml(
1078            r#"
1079<row r="1"><c r="A1"><v>1</v></c></row>
1080<row r="2"><c r="A2"><v>2</v></c></row>
1081"#,
1082        );
1083        let cursor = Cursor::new(xml.as_bytes().to_vec());
1084        let mut reader = SheetStreamReader::new(cursor, &sst);
1085        let batch = reader.next_batch(1).unwrap();
1086        assert_eq!(batch.len(), 1);
1087        drop(reader);
1088    }
1089
1090    #[test]
1091    fn test_has_more_transitions() {
1092        let sst = SharedStringTable::new();
1093        let xml = worksheet_xml(r#"<row r="1"><c r="A1"><v>1</v></c></row>"#);
1094
1095        let cursor = Cursor::new(xml.as_bytes().to_vec());
1096        let mut reader = SheetStreamReader::new(cursor, &sst);
1097        assert!(reader.has_more());
1098
1099        let batch = reader.next_batch(100).unwrap();
1100        assert_eq!(batch.len(), 1);
1101
1102        let batch2 = reader.next_batch(100).unwrap();
1103        assert!(batch2.is_empty());
1104        assert!(!reader.has_more());
1105    }
1106
1107    #[test]
1108    fn test_batch_size_one() {
1109        let sst = SharedStringTable::new();
1110        let xml = worksheet_xml(
1111            r#"
1112<row r="1"><c r="A1"><v>1</v></c></row>
1113<row r="2"><c r="A2"><v>2</v></c></row>
1114<row r="3"><c r="A3"><v>3</v></c></row>
1115"#,
1116        );
1117
1118        let cursor = Cursor::new(xml.as_bytes().to_vec());
1119        let mut reader = SheetStreamReader::new(cursor, &sst);
1120
1121        for expected_row in 1..=3 {
1122            let batch = reader.next_batch(1).unwrap();
1123            assert_eq!(batch.len(), 1);
1124            assert_eq!(batch[0].row_number, expected_row);
1125        }
1126
1127        let batch = reader.next_batch(1).unwrap();
1128        assert!(batch.is_empty());
1129    }
1130
1131    #[test]
1132    fn test_cell_with_no_value() {
1133        let sst = SharedStringTable::new();
1134        let xml = worksheet_xml(r#"<row r="1"><c r="A1"></c><c r="B1"><v>42</v></c></row>"#);
1135
1136        let rows = read_all(&xml, &sst, None);
1137        assert_eq!(rows[0].cells.len(), 2);
1138        assert_eq!(rows[0].cells[0].1, CellValue::Empty);
1139        assert_eq!(rows[0].cells[1].1, CellValue::Number(42.0));
1140    }
1141
1142    #[test]
1143    fn test_self_closing_cell_element() {
1144        let sst = SharedStringTable::new();
1145        let xml = worksheet_xml(
1146            r#"<row r="1"><c r="A1"/><c r="B1"><v>42</v></c><c r="C1" t="b"/></row>"#,
1147        );
1148
1149        let rows = read_all(&xml, &sst, None);
1150        assert_eq!(rows[0].cells.len(), 3);
1151        assert_eq!(rows[0].cells[0], (1, CellValue::Empty));
1152        assert_eq!(rows[0].cells[1], (2, CellValue::Number(42.0)));
1153        assert_eq!(rows[0].cells[2], (3, CellValue::Empty));
1154    }
1155
1156    #[test]
1157    fn test_integration_with_saved_workbook() {
1158        let mut wb = crate::workbook::Workbook::new();
1159        wb.set_cell_value("Sheet1", "A1", "Name").unwrap();
1160        wb.set_cell_value("Sheet1", "B1", "Score").unwrap();
1161        wb.set_cell_value("Sheet1", "A2", "Alice").unwrap();
1162        wb.set_cell_value("Sheet1", "B2", 95.5f64).unwrap();
1163        wb.set_cell_value("Sheet1", "A3", "Bob").unwrap();
1164        wb.set_cell_value("Sheet1", "B3", 87.0f64).unwrap();
1165
1166        let dir = tempfile::TempDir::new().unwrap();
1167        let path = dir.path().join("stream_reader_test.xlsx");
1168        wb.save(&path).unwrap();
1169
1170        let wb2 = crate::workbook::Workbook::open_with_options(
1171            &path,
1172            &crate::workbook::OpenOptions::new().read_mode(crate::workbook::ReadMode::Lazy),
1173        )
1174        .unwrap();
1175
1176        let mut reader = wb2.open_sheet_reader("Sheet1").unwrap();
1177        let rows = reader.next_batch(100).unwrap();
1178
1179        assert_eq!(rows.len(), 3);
1180        assert_eq!(rows[0].row_number, 1);
1181        assert_eq!(rows[0].cells[0].1, CellValue::String("Name".to_string()));
1182        assert_eq!(rows[0].cells[1].1, CellValue::String("Score".to_string()));
1183        assert_eq!(rows[1].cells[0].1, CellValue::String("Alice".to_string()));
1184        assert_eq!(rows[1].cells[1].1, CellValue::Number(95.5));
1185        assert_eq!(rows[2].cells[0].1, CellValue::String("Bob".to_string()));
1186        assert_eq!(rows[2].cells[1].1, CellValue::Number(87.0));
1187    }
1188
1189    #[test]
1190    fn test_integration_with_row_limit() {
1191        let mut wb = crate::workbook::Workbook::new();
1192        for i in 1..=10 {
1193            let cell = format!("A{i}");
1194            wb.set_cell_value("Sheet1", &cell, i as f64).unwrap();
1195        }
1196
1197        let dir = tempfile::TempDir::new().unwrap();
1198        let path = dir.path().join("stream_limit_test.xlsx");
1199        wb.save(&path).unwrap();
1200
1201        let wb2 = crate::workbook::Workbook::open_with_options(
1202            &path,
1203            &crate::workbook::OpenOptions::new()
1204                .read_mode(crate::workbook::ReadMode::Lazy)
1205                .sheet_rows(5),
1206        )
1207        .unwrap();
1208
1209        let mut reader = wb2.open_sheet_reader("Sheet1").unwrap();
1210        let mut all_rows = Vec::new();
1211        loop {
1212            let batch = reader.next_batch(3).unwrap();
1213            if batch.is_empty() {
1214                break;
1215            }
1216            all_rows.extend(batch);
1217        }
1218
1219        assert_eq!(all_rows.len(), 5);
1220        assert_eq!(all_rows[4].row_number, 5);
1221    }
1222
1223    #[test]
1224    fn test_integration_sheet_not_found() {
1225        let wb = crate::workbook::Workbook::new();
1226        let result = wb.open_sheet_reader("NonExistent");
1227        assert!(result.is_err());
1228    }
1229
1230    /// End-to-end date interpretation test harness. Builds a workbook with
1231    /// mixed styled cells, saves it, reopens under both policies, and lets
1232    /// the caller assert on the returned cell values.
1233    fn write_mixed_style_workbook(path: &std::path::Path) {
1234        use crate::style::{builtin_num_fmts, NumFmtStyle, Style};
1235        let mut wb = crate::workbook::Workbook::new();
1236        let builtin_date_style = wb
1237            .add_style(&Style {
1238                num_fmt: Some(NumFmtStyle::Builtin(builtin_num_fmts::DATE_MDY)),
1239                ..Style::default()
1240            })
1241            .unwrap();
1242        let custom_date_style = wb
1243            .add_style(&Style {
1244                num_fmt: Some(NumFmtStyle::Custom("yyyy-mm-dd hh:mm".to_string())),
1245                ..Style::default()
1246            })
1247            .unwrap();
1248        let decimal_style = wb
1249            .add_style(&Style {
1250                num_fmt: Some(NumFmtStyle::Builtin(builtin_num_fmts::DECIMAL_2)),
1251                ..Style::default()
1252            })
1253            .unwrap();
1254
1255        // A1: number + built-in date format.
1256        wb.set_cell_value("Sheet1", "A1", 46127.0_f64).unwrap();
1257        wb.set_cell_style("Sheet1", "A1", builtin_date_style)
1258            .unwrap();
1259        // B1: number + custom date-like format.
1260        wb.set_cell_value("Sheet1", "B1", 46127.9993_f64).unwrap();
1261        wb.set_cell_style("Sheet1", "B1", custom_date_style)
1262            .unwrap();
1263        // C1: number + non-date format (decimal 2).
1264        wb.set_cell_value("Sheet1", "C1", 2.5_f64).unwrap();
1265        wb.set_cell_style("Sheet1", "C1", decimal_style).unwrap();
1266        // D1: number with no explicit style.
1267        wb.set_cell_value("Sheet1", "D1", 42.0_f64).unwrap();
1268
1269        wb.save(path).unwrap();
1270    }
1271
1272    #[test]
1273    fn test_integration_date_interpretation_cell_type_opt_in() {
1274        let dir = tempfile::TempDir::new().unwrap();
1275        let path = dir.path().join("dates_cell_type.xlsx");
1276        write_mixed_style_workbook(&path);
1277
1278        // Explicitly opt into spec-literal interpretation; the default is
1279        // `NumFmt` so this has to be requested by the caller.
1280        let wb = crate::workbook::Workbook::open_with_options(
1281            &path,
1282            &crate::workbook::OpenOptions::new()
1283                .read_mode(crate::workbook::ReadMode::Lazy)
1284                .date_interpretation(crate::workbook::DateInterpretation::CellType),
1285        )
1286        .unwrap();
1287        let mut reader = wb.open_sheet_reader("Sheet1").unwrap();
1288        let rows = reader.next_batch(10).unwrap();
1289
1290        // All number cells remain Number when CellType is requested, regardless
1291        // of whether their style references a date format.
1292        assert_eq!(rows[0].cells[0].1, CellValue::Number(46127.0));
1293        match &rows[0].cells[1].1 {
1294            CellValue::Number(v) => assert!((*v - 46127.9993).abs() < 1e-9),
1295            other => panic!("expected Number, got {:?}", other),
1296        }
1297        assert_eq!(rows[0].cells[2].1, CellValue::Number(2.5));
1298        assert_eq!(rows[0].cells[3].1, CellValue::Number(42.0));
1299    }
1300
1301    #[test]
1302    fn test_integration_date_interpretation_num_fmt_promotes_date_styles() {
1303        let dir = tempfile::TempDir::new().unwrap();
1304        let path = dir.path().join("dates_num_fmt.xlsx");
1305        write_mixed_style_workbook(&path);
1306
1307        let wb = crate::workbook::Workbook::open_with_options(
1308            &path,
1309            &crate::workbook::OpenOptions::new()
1310                .read_mode(crate::workbook::ReadMode::Lazy)
1311                .date_interpretation(crate::workbook::DateInterpretation::NumFmt),
1312        )
1313        .unwrap();
1314        let mut reader = wb.open_sheet_reader("Sheet1").unwrap();
1315        let rows = reader.next_batch(10).unwrap();
1316
1317        // A1: built-in date format → promoted to Date.
1318        assert_eq!(rows[0].cells[0].1, CellValue::Date(46127.0));
1319        // B1: custom date format code → promoted to Date.
1320        match &rows[0].cells[1].1 {
1321            CellValue::Date(v) => assert!((*v - 46127.9993).abs() < 1e-9),
1322            other => panic!("expected Date, got {:?}", other),
1323        }
1324        // C1: non-date format → stays Number.
1325        assert_eq!(rows[0].cells[2].1, CellValue::Number(2.5));
1326        // D1: no style → stays Number.
1327        assert_eq!(rows[0].cells[3].1, CellValue::Number(42.0));
1328    }
1329}