excelstream/
streaming_reader.rs

1//! Streaming reader for XLSX files with optimized memory usage
2//!
3//! This module provides a reader that processes data row-by-row with an iterator interface.
4//!
5//! **Memory Usage:**
6//! - Shared Strings Table (SST): Loaded fully (~3-5 MB for typical files)
7//! - Worksheet XML: Loaded fully from ZIP (uncompressed size)
8//! - Total memory ≈ SST + Uncompressed XML size
9//!
10//! **Important Notes:**
11//! - XLSX files are compressed. A 86 MB file may contain 1.2 GB uncompressed XML
12//! - For small-medium files (< 100 MB): Memory usage is reasonable
13//! - For large files with huge XML: Memory = uncompressed XML size
14//! - Still faster than calamine (no style parsing) and uses optimized SST
15//!
16//! **Trade-offs:**
17//! - Only supports simple XLSX files (no complex formatting)
18//! - Sequential read only (can't jump to random rows)
19//! - Best for: Fast iteration, simple data extraction, no formatting needs
20
21use crate::error::{ExcelError, Result};
22use crate::fast_writer::StreamingZipReader;
23use crate::types::{CellValue, Row};
24use std::io::{BufReader, Read};
25use std::path::Path;
26
27/// Streaming reader for XLSX files
28///
29/// **Memory Usage:**
30/// - SST (Shared Strings): Loaded fully (typically 3-5 MB)
31/// - Worksheet XML: Loaded from ZIP (uncompressed size)
32/// - Total ≈ SST + Uncompressed XML size
33///
34/// **Performance:**
35/// - 60K-85K rows/sec depending on file size
36/// - Faster than calamine (no style/format parsing)
37/// - Optimized hybrid SST
38///
39/// **Best for:**
40/// - Small to medium files (< 100 MB compressed)
41/// - Files with small SST but many rows
42/// - Simple data extraction without formatting
43pub struct StreamingReader {
44    archive: StreamingZipReader,
45    sst: Vec<String>,
46    sheet_names: Vec<String>,
47    sheet_paths: Vec<String>,
48}
49
50impl StreamingReader {
51    /// Open XLSX file for streaming read
52    ///
53    /// # Memory Usage
54    ///
55    /// - Loads SST (Shared Strings Table) fully into memory
56    /// - Worksheet data loaded as single XML string (uncompressed size)
57    /// - For 86 MB file: May use ~1.2 GB if XML is large
58    /// - For smaller files (< 50 MB): Usually reasonable memory
59    ///
60    /// # Performance
61    ///
62    /// - Fast: 60K-85K rows/sec
63    /// - No style/format parsing overhead
64    /// - Optimized for simple data extraction
65    ///
66    /// # Example
67    ///
68    /// ```no_run
69    /// use excelstream::streaming_reader::StreamingReader;
70    ///
71    /// let reader = StreamingReader::open("large.xlsx")?;
72    /// // SST loaded, ready to stream rows
73    /// # Ok::<(), Box<dyn std::error::Error>>(())
74    /// ```
75    pub fn open<P: AsRef<Path>>(path: P) -> Result<Self> {
76        let mut archive = StreamingZipReader::open(path)
77            .map_err(|e| ExcelError::ReadError(format!("Failed to open ZIP: {}", e)))?;
78
79        // Load Shared Strings Table (can't avoid this)
80        let sst = Self::load_shared_strings(&mut archive)?;
81
82        println!(
83            "📊 Loaded {} shared strings (~{:.2} MB in memory)",
84            sst.len(),
85            Self::estimate_sst_size(&sst) as f64 / (1024.0 * 1024.0)
86        );
87
88        // Load sheet names and paths from workbook.xml
89        let (sheet_names, sheet_paths) = Self::load_sheet_info(&mut archive)?;
90
91        println!("📋 Found {} sheets: {:?}", sheet_names.len(), sheet_names);
92
93        Ok(StreamingReader {
94            archive,
95            sst,
96            sheet_names,
97            sheet_paths,
98        })
99    }
100
101    /// Get list of sheet names
102    ///
103    /// Returns the names of all worksheets in the workbook.
104    ///
105    /// # Example
106    ///
107    /// ```no_run
108    /// use excelstream::ExcelReader;
109    ///
110    /// let reader = ExcelReader::open("workbook.xlsx")?;
111    /// for sheet_name in reader.sheet_names() {
112    ///     println!("Sheet: {}", sheet_name);
113    /// }
114    /// # Ok::<(), Box<dyn std::error::Error>>(())
115    /// ```
116    pub fn sheet_names(&self) -> Vec<String> {
117        self.sheet_names.clone()
118    }
119
120    /// Read rows by sheet index (for backward compatibility)
121    ///
122    /// # Arguments
123    /// * `sheet_index` - Zero-based sheet index (0 = first sheet)
124    ///
125    /// # Returns
126    /// Iterator of Row structs
127    pub fn rows_by_index(&mut self, sheet_index: usize) -> Result<RowStructIterator<'_>> {
128        let sheet_name = self
129            .sheet_names
130            .get(sheet_index)
131            .ok_or_else(|| {
132                ExcelError::ReadError(format!(
133                    "Sheet index {} out of bounds. Available: {} sheets",
134                    sheet_index,
135                    self.sheet_names.len()
136                ))
137            })?
138            .clone();
139
140        self.rows(&sheet_name)
141    }
142
143    /// Get worksheet dimensions (rows, columns) - for backward compatibility
144    ///
145    /// # Note
146    /// This is a simplified implementation that reads all rows to count them.
147    /// Returns (row_count, max_column_count).
148    /// For large files, this can be slow as it needs to iterate through all rows.
149    pub fn dimensions(&mut self, sheet_name: &str) -> Result<(usize, usize)> {
150        let mut row_count = 0;
151        let mut max_cols = 0;
152
153        for row_result in self.rows(sheet_name)? {
154            let row = row_result?;
155            row_count += 1;
156            max_cols = max_cols.max(row.cells.len());
157        }
158
159        Ok((row_count, max_cols))
160    }
161
162    /// Stream rows from a worksheet
163    ///
164    /// # Memory Usage
165    ///
166    /// - Loads worksheet XML fully from ZIP (uncompressed)
167    /// - Processes rows with iterator (appears as streaming)
168    /// - Memory = SST + Full worksheet XML
169    ///
170    /// # Performance
171    ///
172    /// - Returns iterator for row-by-row processing
173    /// - Fast iteration: 60K-85K rows/sec
174    /// - No style/format overhead
175    ///
176    /// # Example
177    /// - Does NOT load entire worksheet into memory
178    /// - SST already loaded in `open()`
179    ///
180    /// # Example
181    ///
182    /// ```no_run
183    /// use excelstream::streaming_reader::StreamingReader;
184    ///
185    /// let mut reader = StreamingReader::open("large.xlsx")?;
186    /// for row in reader.stream_rows("Sheet1")? {
187    ///     let row = row?;
188    ///     println!("Row: {:?}", row);
189    /// }
190    /// # Ok::<(), Box<dyn std::error::Error>>(())
191    /// ```
192    pub fn stream_rows(&mut self, sheet_name: &str) -> Result<RowIterator<'_>> {
193        // Find sheet path by name
194        let sheet_path = self
195            .sheet_names
196            .iter()
197            .position(|name| name == sheet_name)
198            .and_then(|idx| self.sheet_paths.get(idx))
199            .ok_or_else(|| {
200                ExcelError::ReadError(format!(
201                    "Sheet '{}' not found. Available sheets: {:?}",
202                    sheet_name, self.sheet_names
203                ))
204            })?
205            .clone();
206
207        // Get streaming reader for worksheet XML
208        let reader = self
209            .archive
210            .read_entry_streaming_by_name(&sheet_path)
211            .map_err(|e| ExcelError::ReadError(format!("Failed to open sheet: {}", e)))?;
212
213        Ok(RowIterator {
214            reader: BufReader::with_capacity(64 * 1024, reader), // 64KB buffer
215            sst: &self.sst,
216            buffer: String::with_capacity(128 * 1024), // 128KB for XML parsing
217            in_row: false,
218            row_content: String::with_capacity(8 * 1024), // 8KB per row
219        })
220    }
221
222    /// Alias for `stream_rows()` for backward compatibility
223    ///
224    /// This method provides the same functionality as `stream_rows()` but uses
225    /// the more familiar `rows()` name that matches the old calamine-based API.
226    /// Returns an iterator of `Row` structs for full API compatibility.
227    ///
228    /// # Example
229    ///
230    /// ```no_run
231    /// use excelstream::ExcelReader;
232    ///
233    /// let mut reader = ExcelReader::open("large.xlsx")?;
234    /// for row_result in reader.rows("Sheet1")? {
235    ///     let row = row_result?;
236    ///     println!("Row {}: {:?}", row.index, row.to_strings());
237    /// }
238    /// # Ok::<(), Box<dyn std::error::Error>>(())
239    /// ```
240    pub fn rows(&mut self, sheet_name: &str) -> Result<RowStructIterator<'_>> {
241        let inner = self.stream_rows(sheet_name)?;
242        Ok(RowStructIterator {
243            inner,
244            row_index: 0,
245        })
246    }
247}
248
249// Decode XML entities (&lt; &gt; &amp; &quot; &apos;)
250fn decode_xml_entities(text: &str) -> String {
251    text.replace("&lt;", "<")
252        .replace("&gt;", ">")
253        .replace("&amp;", "&")
254        .replace("&quot;", "\"")
255        .replace("&apos;", "'")
256}
257
258impl StreamingReader {
259    /// Load Shared Strings Table
260    ///
261    /// This MUST be loaded fully because cells reference strings by index.
262    /// For files with millions of unique strings, this can still be large.
263    fn load_shared_strings(archive: &mut StreamingZipReader) -> Result<Vec<String>> {
264        let mut sst = Vec::new();
265
266        // Try to find sharedStrings.xml
267        let xml_data = match archive.read_entry_by_name("xl/sharedStrings.xml") {
268            Ok(data) => String::from_utf8_lossy(&data).to_string(),
269            Err(_) => return Ok(sst), // No SST = all cells are inline
270        };
271
272        // Parse all <si> tags (multiple per line in compact XML)
273        let mut pos = 0;
274        while let Some(si_start) = xml_data[pos..].find("<si>") {
275            let si_start = pos + si_start;
276            if let Some(si_end) = xml_data[si_start..].find("</si>") {
277                let si_end = si_start + si_end + 5; // Include "</si>"
278                let si_block = &xml_data[si_start..si_end];
279
280                // Extract text from <t>text</t>
281                if let Some(t_start) = si_block.find("<t>") {
282                    if let Some(t_end) = si_block.find("</t>") {
283                        let text = &si_block[t_start + 3..t_end];
284                        // Decode XML entities in SST
285                        let decoded = decode_xml_entities(text);
286                        sst.push(decoded);
287                    }
288                }
289
290                pos = si_end;
291            } else {
292                break;
293            }
294        }
295
296        Ok(sst)
297    }
298
299    /// Load sheet names and paths from workbook.xml
300    ///
301    /// Parses workbook.xml to get sheet names and their corresponding worksheet paths.
302    /// Supports Unicode sheet names.
303    fn load_sheet_info(archive: &mut StreamingZipReader) -> Result<(Vec<String>, Vec<String>)> {
304        let mut sheet_names = Vec::new();
305        let mut sheet_ids = Vec::new();
306
307        // Load workbook.xml
308        let xml_data = archive
309            .read_entry_by_name("xl/workbook.xml")
310            .map_err(|e| ExcelError::ReadError(format!("Failed to open workbook.xml: {}", e)))?;
311        let xml_data = String::from_utf8_lossy(&xml_data).to_string();
312
313        // Parse <sheet> tags to get names and rIds
314        // Example: <sheet name="Sheet1" sheetId="1" r:id="rId1"/>
315        let mut pos = 0;
316        while let Some(sheet_start) = xml_data[pos..].find("<sheet ") {
317            let sheet_start = pos + sheet_start;
318            if let Some(sheet_end) = xml_data[sheet_start..].find("/>") {
319                let sheet_end = sheet_start + sheet_end + 2;
320                let sheet_tag = &xml_data[sheet_start..sheet_end];
321
322                // Extract name attribute
323                if let Some(name_start) = sheet_tag.find("name=\"") {
324                    let name_start = name_start + 6;
325                    if let Some(name_end) = sheet_tag[name_start..].find("\"") {
326                        let name = &sheet_tag[name_start..name_start + name_end];
327                        sheet_names.push(name.to_string());
328                    }
329                }
330
331                // Extract r:id attribute
332                if let Some(rid_start) = sheet_tag.find("r:id=\"") {
333                    let rid_start = rid_start + 6;
334                    if let Some(rid_end) = sheet_tag[rid_start..].find("\"") {
335                        let rid = &sheet_tag[rid_start..rid_start + rid_end];
336                        sheet_ids.push(rid.to_string());
337                    }
338                }
339
340                pos = sheet_end;
341            } else {
342                break;
343            }
344        }
345        // Now load workbook.xml.rels to map rIds to worksheet paths
346        let mut sheet_paths = Vec::new();
347
348        let rels_data = archive
349            .read_entry_by_name("xl/_rels/workbook.xml.rels")
350            .map_err(|e| {
351                ExcelError::ReadError(format!("Failed to open workbook.xml.rels: {}", e))
352            })?;
353        let rels_data = String::from_utf8_lossy(&rels_data).to_string();
354
355        // Map rIds to worksheet paths
356        for rid in &sheet_ids {
357            // Find <Relationship Id="rId1" Target="worksheets/sheet1.xml"/>
358            if let Some(rel_start) = rels_data.find(&format!("Id=\"{}\"", rid)) {
359                // Find the start of this Relationship tag
360                let tag_start = rels_data[..rel_start]
361                    .rfind("<Relationship")
362                    .unwrap_or(rel_start.saturating_sub(100));
363
364                // Find the end of this Relationship tag
365                let tag_end = if let Some(end_pos) = rels_data[rel_start..].find("/>") {
366                    rel_start + end_pos + 2
367                } else {
368                    rels_data.len()
369                };
370
371                let rel_tag = &rels_data[tag_start..tag_end];
372
373                // Extract Target from this specific tag
374                if let Some(target_start) = rel_tag.find("Target=\"") {
375                    let target_start = target_start + 8;
376                    if let Some(target_end) = rel_tag[target_start..].find("\"") {
377                        let target = &rel_tag[target_start..target_start + target_end];
378                        // Target is relative to xl/, e.g., "worksheets/sheet1.xml"
379                        let full_path = format!("xl/{}", target);
380                        sheet_paths.push(full_path);
381                    }
382                }
383            }
384        }
385
386        if sheet_names.len() != sheet_paths.len() {
387            return Err(ExcelError::ReadError(format!(
388                "Mismatch between sheet names ({}) and paths ({})",
389                sheet_names.len(),
390                sheet_paths.len()
391            )));
392        }
393
394        Ok((sheet_names, sheet_paths))
395    }
396
397    fn estimate_sst_size(sst: &[String]) -> usize {
398        sst.iter().map(|s| s.len() + 24).sum() // 24 bytes per String overhead
399    }
400}
401
402/// Iterator over rows in a worksheet
403/// Streams XML data from ZIP without loading entire worksheet into memory
404pub struct RowIterator<'a> {
405    reader: BufReader<Box<dyn Read + 'a>>,
406    sst: &'a [String],
407    buffer: String,      // Buffer for reading XML chunks
408    in_row: bool,        // Whether we're currently inside a <row> tag
409    row_content: String, // Buffer for accumulating current row XML
410}
411
412impl<'a> Iterator for RowIterator<'a> {
413    type Item = Result<Vec<String>>;
414
415    fn next(&mut self) -> Option<Self::Item> {
416        loop {
417            // Try to find complete <row>...</row> in current buffer
418            if let Some(row) = self.try_extract_row() {
419                return Some(Ok(row));
420            }
421
422            // Need more data - read next chunk
423            let mut chunk = vec![0u8; 32 * 1024]; // 32KB chunks
424            match self.reader.read(&mut chunk) {
425                Ok(0) => {
426                    // EOF reached
427                    if !self.row_content.is_empty() {
428                        // Parse any remaining incomplete row
429                        if let Ok(row) = Self::parse_row(&self.row_content, self.sst) {
430                            self.row_content.clear();
431                            return Some(Ok(row));
432                        }
433                    }
434                    return None;
435                }
436                Ok(n) => {
437                    // Append new data to buffer
438                    if let Ok(s) = std::str::from_utf8(&chunk[..n]) {
439                        self.buffer.push_str(s);
440                    }
441                }
442                Err(e) => {
443                    return Some(Err(ExcelError::ReadError(format!(
444                        "Failed to read XML: {}",
445                        e
446                    ))));
447                }
448            }
449        }
450    }
451}
452
453impl<'a> RowIterator<'a> {
454    /// Try to extract a complete row from the buffer
455    fn try_extract_row(&mut self) -> Option<Vec<String>> {
456        loop {
457            // Look for <row> start
458            if !self.in_row {
459                if let Some(row_start) = self.buffer.find("<row ") {
460                    self.in_row = true;
461                    // Move from <row onwards to row_content, keep rest in buffer
462                    self.row_content.push_str(&self.buffer[row_start..]);
463                    self.buffer.drain(..);
464                } else {
465                    // No <row found, discard old data but keep some for potential partial tag
466                    if self.buffer.len() > 1024 {
467                        self.buffer.drain(..self.buffer.len() - 100);
468                    }
469                    return None;
470                }
471            }
472
473            // If in row, look for </row> end
474            if self.in_row {
475                // Check row_content first
476                if let Some(row_end_pos) = self.row_content.find("</row>") {
477                    // Found complete row in row_content
478                    let row_end = row_end_pos + 6; // Include "</row>"
479                    let row_xml = self.row_content[..row_end].to_string();
480
481                    // Move remaining data back to buffer for next iteration
482                    if row_end < self.row_content.len() {
483                        self.buffer.insert_str(0, &self.row_content[row_end..]);
484                    }
485
486                    // Clear and reset
487                    self.row_content.clear();
488                    self.in_row = false;
489
490                    // Parse and return
491                    if let Ok(row) = Self::parse_row(&row_xml, self.sst) {
492                        return Some(row);
493                    }
494                    // If parse fails, continue to next row
495                    continue;
496                }
497
498                // Not in row_content, check buffer
499                if !self.buffer.is_empty() {
500                    // Append buffer to row_content
501                    self.row_content.push_str(&self.buffer);
502                    self.buffer.clear();
503                    continue; // Try again
504                }
505
506                // Need more data
507                return None;
508            }
509        }
510    }
511
512    fn parse_row(row_xml: &str, sst: &[String]) -> Result<Vec<String>> {
513        let mut row_data = Vec::new();
514        let mut pos = 0;
515
516        while let Some(cell_start) = row_xml[pos..]
517            .find("<c ")
518            .or_else(|| row_xml[pos..].find("<c>"))
519        {
520            let cell_start = pos + cell_start;
521
522            // Handle both self-closing <c ... /> and <c ...></c>
523            let (cell_end, cell_xml) =
524                if let Some(self_close_pos) = row_xml[cell_start..].find("/>") {
525                    let end = cell_start + self_close_pos + 2;
526                    let xml = &row_xml[cell_start..end];
527                    (end, xml)
528                } else if let Some(close_tag_pos) = row_xml[cell_start..].find("</c>") {
529                    let end = cell_start + close_tag_pos + 4;
530                    let xml = &row_xml[cell_start..end];
531                    (end, xml)
532                } else {
533                    break; // Incomplete cell tag
534                };
535
536            // Extract cell reference (e.g., "A1", "B1", "AA1")
537            let col_idx = if let Some(r_start) = cell_xml.find("r=\"") {
538                let r_start = r_start + 3;
539                if let Some(r_end) = cell_xml[r_start..].find("\"") {
540                    let cell_ref = &cell_xml[r_start..r_start + r_end];
541                    parse_column_index(cell_ref)
542                } else {
543                    row_data.len()
544                }
545            } else {
546                row_data.len()
547            };
548
549            // Fill empty cells between last column and current column
550            while row_data.len() < col_idx {
551                row_data.push(String::new());
552            }
553
554            // Determine cell type
555            let is_shared_string = cell_xml.contains("t=\"s\"");
556            let is_inline_str = cell_xml.contains("t=\"inlineStr\"");
557
558            // Extract value
559            let value = if is_inline_str {
560                // Inline string - look for <is><t>...</t></is>
561                if let Some(t_start) = cell_xml.find("<t>") {
562                    if let Some(t_end) = cell_xml[t_start..].find("</t>") {
563                        cell_xml[t_start + 3..t_start + t_end].to_string()
564                    } else {
565                        String::new()
566                    }
567                } else {
568                    String::new()
569                }
570            } else if let Some(v_start) = cell_xml.find("<v>") {
571                if let Some(v_end) = cell_xml[v_start..].find("</v>") {
572                    let val_str = &cell_xml[v_start + 3..v_start + v_end];
573
574                    if is_shared_string {
575                        // Lookup in SST
576                        if let Ok(idx) = val_str.parse::<usize>() {
577                            sst.get(idx).cloned().unwrap_or_default()
578                        } else {
579                            String::new()
580                        }
581                    } else {
582                        val_str.to_string()
583                    }
584                } else {
585                    String::new()
586                }
587            } else {
588                String::new()
589            };
590
591            // Decode XML entities
592            let value = decode_xml_entities(&value);
593
594            row_data.push(value);
595            pos = cell_end;
596        }
597
598        Ok(row_data)
599    }
600}
601
602// Parse column index from cell reference (e.g., "A1" -> 0, "B1" -> 1, "AA1" -> 26)
603fn parse_column_index(cell_ref: &str) -> usize {
604    let mut col_idx = 0usize;
605    for ch in cell_ref.chars() {
606        if ch.is_ascii_alphabetic() {
607            col_idx = col_idx * 26 + (ch.to_ascii_uppercase() as usize - 'A' as usize + 1);
608        } else {
609            break;
610        }
611    }
612    col_idx.saturating_sub(1) // Convert to 0-based index
613}
614
615/// Iterator wrapper that returns Row structs instead of Vec<String>
616/// for backward compatibility with the old calamine-based API
617pub struct RowStructIterator<'a> {
618    inner: RowIterator<'a>,
619    row_index: u32,
620}
621
622impl<'a> Iterator for RowStructIterator<'a> {
623    type Item = Result<Row>;
624
625    fn next(&mut self) -> Option<Self::Item> {
626        match self.inner.next()? {
627            Ok(strings) => {
628                let cells: Vec<CellValue> = strings.into_iter().map(CellValue::String).collect();
629
630                let row = Row::new(self.row_index, cells);
631                self.row_index += 1;
632                Some(Ok(row))
633            }
634            Err(e) => Some(Err(e)),
635        }
636    }
637}
638
639#[cfg(test)]
640mod tests {
641    use super::*;
642
643    #[test]
644    fn test_estimate_sst_size() {
645        let sst = vec!["hello".to_string(), "world".to_string()];
646        let size = StreamingReader::estimate_sst_size(&sst);
647        assert!(size > 10); // At least the string bytes
648    }
649}