excelstream/streaming_reader.rs
1//! Streaming reader for XLSX files with optimized memory usage
2//!
3//! This module provides a reader that processes data row-by-row with an iterator interface.
4//!
5//! **Memory Usage:**
6//! - Shared Strings Table (SST): Loaded fully (~3-5 MB for typical files)
7//! - Worksheet XML: Loaded fully from ZIP (uncompressed size)
8//! - Total memory ≈ SST + Uncompressed XML size
9//!
10//! **Important Notes:**
11//! - XLSX files are compressed. A 86 MB file may contain 1.2 GB uncompressed XML
12//! - For small-medium files (< 100 MB): Memory usage is reasonable
13//! - For large files with huge XML: Memory = uncompressed XML size
14//! - Still faster than calamine (no style parsing) and uses optimized SST
15//!
16//! **Trade-offs:**
17//! - Only supports simple XLSX files (no complex formatting)
18//! - Sequential read only (can't jump to random rows)
19//! - Best for: Fast iteration, simple data extraction, no formatting needs
20
21use crate::error::{ExcelError, Result};
22use crate::fast_writer::StreamingZipReader;
23use crate::types::{CellValue, Row};
24use std::io::{BufReader, Read};
25use std::path::Path;
26
27/// Streaming reader for XLSX files
28///
29/// **Memory Usage:**
30/// - SST (Shared Strings): Loaded fully (typically 3-5 MB)
31/// - Worksheet XML: Loaded from ZIP (uncompressed size)
32/// - Total ≈ SST + Uncompressed XML size
33///
34/// **Performance:**
35/// - 60K-85K rows/sec depending on file size
36/// - Faster than calamine (no style/format parsing)
37/// - Optimized hybrid SST
38///
39/// **Best for:**
40/// - Small to medium files (< 100 MB compressed)
41/// - Files with small SST but many rows
42/// - Simple data extraction without formatting
43pub struct StreamingReader {
44 archive: StreamingZipReader,
45 sst: Vec<String>,
46 sheet_names: Vec<String>,
47 sheet_paths: Vec<String>,
48}
49
50impl StreamingReader {
51 /// Open XLSX file for streaming read
52 ///
53 /// # Memory Usage
54 ///
55 /// - Loads SST (Shared Strings Table) fully into memory
56 /// - Worksheet data loaded as single XML string (uncompressed size)
57 /// - For 86 MB file: May use ~1.2 GB if XML is large
58 /// - For smaller files (< 50 MB): Usually reasonable memory
59 ///
60 /// # Performance
61 ///
62 /// - Fast: 60K-85K rows/sec
63 /// - No style/format parsing overhead
64 /// - Optimized for simple data extraction
65 ///
66 /// # Example
67 ///
68 /// ```no_run
69 /// use excelstream::streaming_reader::StreamingReader;
70 ///
71 /// let reader = StreamingReader::open("large.xlsx")?;
72 /// // SST loaded, ready to stream rows
73 /// # Ok::<(), Box<dyn std::error::Error>>(())
74 /// ```
75 pub fn open<P: AsRef<Path>>(path: P) -> Result<Self> {
76 let mut archive = StreamingZipReader::open(path)
77 .map_err(|e| ExcelError::ReadError(format!("Failed to open ZIP: {}", e)))?;
78
79 // Load Shared Strings Table (can't avoid this)
80 let sst = Self::load_shared_strings(&mut archive)?;
81
82 println!(
83 "📊 Loaded {} shared strings (~{:.2} MB in memory)",
84 sst.len(),
85 Self::estimate_sst_size(&sst) as f64 / (1024.0 * 1024.0)
86 );
87
88 // Load sheet names and paths from workbook.xml
89 let (sheet_names, sheet_paths) = Self::load_sheet_info(&mut archive)?;
90
91 println!("📋 Found {} sheets: {:?}", sheet_names.len(), sheet_names);
92
93 Ok(StreamingReader {
94 archive,
95 sst,
96 sheet_names,
97 sheet_paths,
98 })
99 }
100
101 /// Get list of sheet names
102 ///
103 /// Returns the names of all worksheets in the workbook.
104 ///
105 /// # Example
106 ///
107 /// ```no_run
108 /// use excelstream::ExcelReader;
109 ///
110 /// let reader = ExcelReader::open("workbook.xlsx")?;
111 /// for sheet_name in reader.sheet_names() {
112 /// println!("Sheet: {}", sheet_name);
113 /// }
114 /// # Ok::<(), Box<dyn std::error::Error>>(())
115 /// ```
116 pub fn sheet_names(&self) -> Vec<String> {
117 self.sheet_names.clone()
118 }
119
120 /// Read rows by sheet index (for backward compatibility)
121 ///
122 /// # Arguments
123 /// * `sheet_index` - Zero-based sheet index (0 = first sheet)
124 ///
125 /// # Returns
126 /// Iterator of Row structs
127 pub fn rows_by_index(&mut self, sheet_index: usize) -> Result<RowStructIterator<'_>> {
128 let sheet_name = self
129 .sheet_names
130 .get(sheet_index)
131 .ok_or_else(|| {
132 ExcelError::ReadError(format!(
133 "Sheet index {} out of bounds. Available: {} sheets",
134 sheet_index,
135 self.sheet_names.len()
136 ))
137 })?
138 .clone();
139
140 self.rows(&sheet_name)
141 }
142
143 /// Get worksheet dimensions (rows, columns) - for backward compatibility
144 ///
145 /// # Note
146 /// This is a simplified implementation that reads all rows to count them.
147 /// Returns (row_count, max_column_count).
148 /// For large files, this can be slow as it needs to iterate through all rows.
149 pub fn dimensions(&mut self, sheet_name: &str) -> Result<(usize, usize)> {
150 let mut row_count = 0;
151 let mut max_cols = 0;
152
153 for row_result in self.rows(sheet_name)? {
154 let row = row_result?;
155 row_count += 1;
156 max_cols = max_cols.max(row.cells.len());
157 }
158
159 Ok((row_count, max_cols))
160 }
161
162 /// Stream rows from a worksheet
163 ///
164 /// # Memory Usage
165 ///
166 /// - Loads worksheet XML fully from ZIP (uncompressed)
167 /// - Processes rows with iterator (appears as streaming)
168 /// - Memory = SST + Full worksheet XML
169 ///
170 /// # Performance
171 ///
172 /// - Returns iterator for row-by-row processing
173 /// - Fast iteration: 60K-85K rows/sec
174 /// - No style/format overhead
175 ///
176 /// # Example
177 /// - Does NOT load entire worksheet into memory
178 /// - SST already loaded in `open()`
179 ///
180 /// # Example
181 ///
182 /// ```no_run
183 /// use excelstream::streaming_reader::StreamingReader;
184 ///
185 /// let mut reader = StreamingReader::open("large.xlsx")?;
186 /// for row in reader.stream_rows("Sheet1")? {
187 /// let row = row?;
188 /// println!("Row: {:?}", row);
189 /// }
190 /// # Ok::<(), Box<dyn std::error::Error>>(())
191 /// ```
192 pub fn stream_rows(&mut self, sheet_name: &str) -> Result<RowIterator<'_>> {
193 // Find sheet path by name
194 let sheet_path = self
195 .sheet_names
196 .iter()
197 .position(|name| name == sheet_name)
198 .and_then(|idx| self.sheet_paths.get(idx))
199 .ok_or_else(|| {
200 ExcelError::ReadError(format!(
201 "Sheet '{}' not found. Available sheets: {:?}",
202 sheet_name, self.sheet_names
203 ))
204 })?
205 .clone();
206
207 // Get streaming reader for worksheet XML
208 let reader = self
209 .archive
210 .read_entry_streaming_by_name(&sheet_path)
211 .map_err(|e| ExcelError::ReadError(format!("Failed to open sheet: {}", e)))?;
212
213 Ok(RowIterator {
214 reader: BufReader::with_capacity(64 * 1024, reader), // 64KB buffer
215 sst: &self.sst,
216 buffer: String::with_capacity(128 * 1024), // 128KB for XML parsing
217 in_row: false,
218 row_content: String::with_capacity(8 * 1024), // 8KB per row
219 })
220 }
221
222 /// Alias for `stream_rows()` for backward compatibility
223 ///
224 /// This method provides the same functionality as `stream_rows()` but uses
225 /// the more familiar `rows()` name that matches the old calamine-based API.
226 /// Returns an iterator of `Row` structs for full API compatibility.
227 ///
228 /// # Example
229 ///
230 /// ```no_run
231 /// use excelstream::ExcelReader;
232 ///
233 /// let mut reader = ExcelReader::open("large.xlsx")?;
234 /// for row_result in reader.rows("Sheet1")? {
235 /// let row = row_result?;
236 /// println!("Row {}: {:?}", row.index, row.to_strings());
237 /// }
238 /// # Ok::<(), Box<dyn std::error::Error>>(())
239 /// ```
240 pub fn rows(&mut self, sheet_name: &str) -> Result<RowStructIterator<'_>> {
241 let inner = self.stream_rows(sheet_name)?;
242 Ok(RowStructIterator {
243 inner,
244 row_index: 0,
245 })
246 }
247}
248
249// Decode XML entities (< > & " ')
250fn decode_xml_entities(text: &str) -> String {
251 text.replace("<", "<")
252 .replace(">", ">")
253 .replace("&", "&")
254 .replace(""", "\"")
255 .replace("'", "'")
256}
257
258impl StreamingReader {
259 /// Load Shared Strings Table
260 ///
261 /// This MUST be loaded fully because cells reference strings by index.
262 /// For files with millions of unique strings, this can still be large.
263 fn load_shared_strings(archive: &mut StreamingZipReader) -> Result<Vec<String>> {
264 let mut sst = Vec::new();
265
266 // Try to find sharedStrings.xml
267 let xml_data = match archive.read_entry_by_name("xl/sharedStrings.xml") {
268 Ok(data) => String::from_utf8_lossy(&data).to_string(),
269 Err(_) => return Ok(sst), // No SST = all cells are inline
270 };
271
272 // Parse all <si> tags (multiple per line in compact XML)
273 let mut pos = 0;
274 while let Some(si_start) = xml_data[pos..].find("<si>") {
275 let si_start = pos + si_start;
276 if let Some(si_end) = xml_data[si_start..].find("</si>") {
277 let si_end = si_start + si_end + 5; // Include "</si>"
278 let si_block = &xml_data[si_start..si_end];
279
280 // Extract text from <t>text</t>
281 if let Some(t_start) = si_block.find("<t>") {
282 if let Some(t_end) = si_block.find("</t>") {
283 let text = &si_block[t_start + 3..t_end];
284 // Decode XML entities in SST
285 let decoded = decode_xml_entities(text);
286 sst.push(decoded);
287 }
288 }
289
290 pos = si_end;
291 } else {
292 break;
293 }
294 }
295
296 Ok(sst)
297 }
298
299 /// Load sheet names and paths from workbook.xml
300 ///
301 /// Parses workbook.xml to get sheet names and their corresponding worksheet paths.
302 /// Supports Unicode sheet names.
303 fn load_sheet_info(archive: &mut StreamingZipReader) -> Result<(Vec<String>, Vec<String>)> {
304 let mut sheet_names = Vec::new();
305 let mut sheet_ids = Vec::new();
306
307 // Load workbook.xml
308 let xml_data = archive
309 .read_entry_by_name("xl/workbook.xml")
310 .map_err(|e| ExcelError::ReadError(format!("Failed to open workbook.xml: {}", e)))?;
311 let xml_data = String::from_utf8_lossy(&xml_data).to_string();
312
313 // Parse <sheet> tags to get names and rIds
314 // Example: <sheet name="Sheet1" sheetId="1" r:id="rId1"/>
315 let mut pos = 0;
316 while let Some(sheet_start) = xml_data[pos..].find("<sheet ") {
317 let sheet_start = pos + sheet_start;
318 if let Some(sheet_end) = xml_data[sheet_start..].find("/>") {
319 let sheet_end = sheet_start + sheet_end + 2;
320 let sheet_tag = &xml_data[sheet_start..sheet_end];
321
322 // Extract name attribute
323 if let Some(name_start) = sheet_tag.find("name=\"") {
324 let name_start = name_start + 6;
325 if let Some(name_end) = sheet_tag[name_start..].find("\"") {
326 let name = &sheet_tag[name_start..name_start + name_end];
327 sheet_names.push(name.to_string());
328 }
329 }
330
331 // Extract r:id attribute
332 if let Some(rid_start) = sheet_tag.find("r:id=\"") {
333 let rid_start = rid_start + 6;
334 if let Some(rid_end) = sheet_tag[rid_start..].find("\"") {
335 let rid = &sheet_tag[rid_start..rid_start + rid_end];
336 sheet_ids.push(rid.to_string());
337 }
338 }
339
340 pos = sheet_end;
341 } else {
342 break;
343 }
344 }
345 // Now load workbook.xml.rels to map rIds to worksheet paths
346 let mut sheet_paths = Vec::new();
347
348 let rels_data = archive
349 .read_entry_by_name("xl/_rels/workbook.xml.rels")
350 .map_err(|e| {
351 ExcelError::ReadError(format!("Failed to open workbook.xml.rels: {}", e))
352 })?;
353 let rels_data = String::from_utf8_lossy(&rels_data).to_string();
354
355 // Map rIds to worksheet paths
356 for rid in &sheet_ids {
357 // Find <Relationship Id="rId1" Target="worksheets/sheet1.xml"/>
358 if let Some(rel_start) = rels_data.find(&format!("Id=\"{}\"", rid)) {
359 // Find the start of this Relationship tag
360 let tag_start = rels_data[..rel_start]
361 .rfind("<Relationship")
362 .unwrap_or(rel_start.saturating_sub(100));
363
364 // Find the end of this Relationship tag
365 let tag_end = if let Some(end_pos) = rels_data[rel_start..].find("/>") {
366 rel_start + end_pos + 2
367 } else {
368 rels_data.len()
369 };
370
371 let rel_tag = &rels_data[tag_start..tag_end];
372
373 // Extract Target from this specific tag
374 if let Some(target_start) = rel_tag.find("Target=\"") {
375 let target_start = target_start + 8;
376 if let Some(target_end) = rel_tag[target_start..].find("\"") {
377 let target = &rel_tag[target_start..target_start + target_end];
378 // Target is relative to xl/, e.g., "worksheets/sheet1.xml"
379 let full_path = format!("xl/{}", target);
380 sheet_paths.push(full_path);
381 }
382 }
383 }
384 }
385
386 if sheet_names.len() != sheet_paths.len() {
387 return Err(ExcelError::ReadError(format!(
388 "Mismatch between sheet names ({}) and paths ({})",
389 sheet_names.len(),
390 sheet_paths.len()
391 )));
392 }
393
394 Ok((sheet_names, sheet_paths))
395 }
396
397 fn estimate_sst_size(sst: &[String]) -> usize {
398 sst.iter().map(|s| s.len() + 24).sum() // 24 bytes per String overhead
399 }
400}
401
402/// Iterator over rows in a worksheet
403/// Streams XML data from ZIP without loading entire worksheet into memory
404pub struct RowIterator<'a> {
405 reader: BufReader<Box<dyn Read + 'a>>,
406 sst: &'a [String],
407 buffer: String, // Buffer for reading XML chunks
408 in_row: bool, // Whether we're currently inside a <row> tag
409 row_content: String, // Buffer for accumulating current row XML
410}
411
412impl<'a> Iterator for RowIterator<'a> {
413 type Item = Result<Vec<String>>;
414
415 fn next(&mut self) -> Option<Self::Item> {
416 loop {
417 // Try to find complete <row>...</row> in current buffer
418 if let Some(row) = self.try_extract_row() {
419 return Some(Ok(row));
420 }
421
422 // Need more data - read next chunk
423 let mut chunk = vec![0u8; 32 * 1024]; // 32KB chunks
424 match self.reader.read(&mut chunk) {
425 Ok(0) => {
426 // EOF reached
427 if !self.row_content.is_empty() {
428 // Parse any remaining incomplete row
429 if let Ok(row) = Self::parse_row(&self.row_content, self.sst) {
430 self.row_content.clear();
431 return Some(Ok(row));
432 }
433 }
434 return None;
435 }
436 Ok(n) => {
437 // Append new data to buffer
438 if let Ok(s) = std::str::from_utf8(&chunk[..n]) {
439 self.buffer.push_str(s);
440 }
441 }
442 Err(e) => {
443 return Some(Err(ExcelError::ReadError(format!(
444 "Failed to read XML: {}",
445 e
446 ))));
447 }
448 }
449 }
450 }
451}
452
453impl<'a> RowIterator<'a> {
454 /// Try to extract a complete row from the buffer
455 fn try_extract_row(&mut self) -> Option<Vec<String>> {
456 loop {
457 // Look for <row> start
458 if !self.in_row {
459 if let Some(row_start) = self.buffer.find("<row ") {
460 self.in_row = true;
461 // Move from <row onwards to row_content, keep rest in buffer
462 self.row_content.push_str(&self.buffer[row_start..]);
463 self.buffer.drain(..);
464 } else {
465 // No <row found, discard old data but keep some for potential partial tag
466 if self.buffer.len() > 1024 {
467 self.buffer.drain(..self.buffer.len() - 100);
468 }
469 return None;
470 }
471 }
472
473 // If in row, look for </row> end
474 if self.in_row {
475 // Check row_content first
476 if let Some(row_end_pos) = self.row_content.find("</row>") {
477 // Found complete row in row_content
478 let row_end = row_end_pos + 6; // Include "</row>"
479 let row_xml = self.row_content[..row_end].to_string();
480
481 // Move remaining data back to buffer for next iteration
482 if row_end < self.row_content.len() {
483 self.buffer.insert_str(0, &self.row_content[row_end..]);
484 }
485
486 // Clear and reset
487 self.row_content.clear();
488 self.in_row = false;
489
490 // Parse and return
491 if let Ok(row) = Self::parse_row(&row_xml, self.sst) {
492 return Some(row);
493 }
494 // If parse fails, continue to next row
495 continue;
496 }
497
498 // Not in row_content, check buffer
499 if !self.buffer.is_empty() {
500 // Append buffer to row_content
501 self.row_content.push_str(&self.buffer);
502 self.buffer.clear();
503 continue; // Try again
504 }
505
506 // Need more data
507 return None;
508 }
509 }
510 }
511
512 fn parse_row(row_xml: &str, sst: &[String]) -> Result<Vec<String>> {
513 let mut row_data = Vec::new();
514 let mut pos = 0;
515
516 while let Some(cell_start) = row_xml[pos..]
517 .find("<c ")
518 .or_else(|| row_xml[pos..].find("<c>"))
519 {
520 let cell_start = pos + cell_start;
521
522 // Handle both self-closing <c ... /> and <c ...></c>
523 let (cell_end, cell_xml) =
524 if let Some(self_close_pos) = row_xml[cell_start..].find("/>") {
525 let end = cell_start + self_close_pos + 2;
526 let xml = &row_xml[cell_start..end];
527 (end, xml)
528 } else if let Some(close_tag_pos) = row_xml[cell_start..].find("</c>") {
529 let end = cell_start + close_tag_pos + 4;
530 let xml = &row_xml[cell_start..end];
531 (end, xml)
532 } else {
533 break; // Incomplete cell tag
534 };
535
536 // Extract cell reference (e.g., "A1", "B1", "AA1")
537 let col_idx = if let Some(r_start) = cell_xml.find("r=\"") {
538 let r_start = r_start + 3;
539 if let Some(r_end) = cell_xml[r_start..].find("\"") {
540 let cell_ref = &cell_xml[r_start..r_start + r_end];
541 parse_column_index(cell_ref)
542 } else {
543 row_data.len()
544 }
545 } else {
546 row_data.len()
547 };
548
549 // Fill empty cells between last column and current column
550 while row_data.len() < col_idx {
551 row_data.push(String::new());
552 }
553
554 // Determine cell type
555 let is_shared_string = cell_xml.contains("t=\"s\"");
556 let is_inline_str = cell_xml.contains("t=\"inlineStr\"");
557
558 // Extract value
559 let value = if is_inline_str {
560 // Inline string - look for <is><t>...</t></is>
561 if let Some(t_start) = cell_xml.find("<t>") {
562 if let Some(t_end) = cell_xml[t_start..].find("</t>") {
563 cell_xml[t_start + 3..t_start + t_end].to_string()
564 } else {
565 String::new()
566 }
567 } else {
568 String::new()
569 }
570 } else if let Some(v_start) = cell_xml.find("<v>") {
571 if let Some(v_end) = cell_xml[v_start..].find("</v>") {
572 let val_str = &cell_xml[v_start + 3..v_start + v_end];
573
574 if is_shared_string {
575 // Lookup in SST
576 if let Ok(idx) = val_str.parse::<usize>() {
577 sst.get(idx).cloned().unwrap_or_default()
578 } else {
579 String::new()
580 }
581 } else {
582 val_str.to_string()
583 }
584 } else {
585 String::new()
586 }
587 } else {
588 String::new()
589 };
590
591 // Decode XML entities
592 let value = decode_xml_entities(&value);
593
594 row_data.push(value);
595 pos = cell_end;
596 }
597
598 Ok(row_data)
599 }
600}
601
602// Parse column index from cell reference (e.g., "A1" -> 0, "B1" -> 1, "AA1" -> 26)
603fn parse_column_index(cell_ref: &str) -> usize {
604 let mut col_idx = 0usize;
605 for ch in cell_ref.chars() {
606 if ch.is_ascii_alphabetic() {
607 col_idx = col_idx * 26 + (ch.to_ascii_uppercase() as usize - 'A' as usize + 1);
608 } else {
609 break;
610 }
611 }
612 col_idx.saturating_sub(1) // Convert to 0-based index
613}
614
615/// Iterator wrapper that returns Row structs instead of Vec<String>
616/// for backward compatibility with the old calamine-based API
617pub struct RowStructIterator<'a> {
618 inner: RowIterator<'a>,
619 row_index: u32,
620}
621
622impl<'a> Iterator for RowStructIterator<'a> {
623 type Item = Result<Row>;
624
625 fn next(&mut self) -> Option<Self::Item> {
626 match self.inner.next()? {
627 Ok(strings) => {
628 let cells: Vec<CellValue> = strings.into_iter().map(CellValue::String).collect();
629
630 let row = Row::new(self.row_index, cells);
631 self.row_index += 1;
632 Some(Ok(row))
633 }
634 Err(e) => Some(Err(e)),
635 }
636 }
637}
638
639#[cfg(test)]
640mod tests {
641 use super::*;
642
643 #[test]
644 fn test_estimate_sst_size() {
645 let sst = vec!["hello".to_string(), "world".to_string()];
646 let size = StreamingReader::estimate_sst_size(&sst);
647 assert!(size > 10); // At least the string bytes
648 }
649}