use crate::error::{ExcelError, Result};
use crate::fast_writer::StreamingZipReader;
use crate::types::{CellValue, Row};
use std::io::{BufReader, Read};
use std::path::Path;
fn parse_excel_date(serial: f64) -> String {
if !(1.0..=2958465.999).contains(&serial) {
return serial.to_string();
}
let date_part = serial.floor();
let time_part = serial.fract();
let days_since_1900 = if date_part >= 60.0 {
(date_part - 2.0) as i64 } else {
(date_part - 1.0) as i64
};
let mut year = 1900i64;
let mut remaining_days = days_since_1900;
let est_years = (remaining_days / 365).min(500); if est_years > 0 {
year += est_years;
let mut days_counted = 0i64;
for y in 1900..(1900 + est_years) {
days_counted += if is_leap_year(y) { 366 } else { 365 };
}
remaining_days -= days_counted;
while remaining_days < 0 {
year -= 1;
remaining_days += if is_leap_year(year) { 366 } else { 365 };
}
while remaining_days >= 365 {
let days_in_year = if is_leap_year(year) { 366 } else { 365 };
if remaining_days < days_in_year {
break;
}
remaining_days -= days_in_year;
year += 1;
}
}
const DAYS_IN_MONTHS_LEAP: [i32; 12] = [31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31];
const DAYS_IN_MONTHS_COMMON: [i32; 12] = [31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31];
let days_in_months = if is_leap_year(year) {
&DAYS_IN_MONTHS_LEAP
} else {
&DAYS_IN_MONTHS_COMMON
};
let mut month = 1;
let mut day = remaining_days as i32 + 1;
for (m, &days) in days_in_months.iter().enumerate() {
if day <= days {
month = m + 1;
break;
}
day -= days;
}
if time_part > 0.0001 {
let total_seconds = (time_part * 86400.0).round() as i64;
let hours = total_seconds / 3600;
let minutes = (total_seconds % 3600) / 60;
let seconds = total_seconds % 60;
format!(
"{:04}-{:02}-{:02} {:02}:{:02}:{:02}",
year, month, day, hours, minutes, seconds
)
} else {
format!("{:04}-{:02}-{:02}", year, month, day)
}
}
fn is_leap_year(year: i64) -> bool {
(year % 4 == 0 && year % 100 != 0) || (year % 400 == 0)
}
pub struct StreamingReader {
archive: StreamingZipReader,
sst: Vec<String>,
sheet_names: Vec<String>,
sheet_paths: Vec<String>,
}
impl StreamingReader {
pub fn open<P: AsRef<Path>>(path: P) -> Result<Self> {
let mut archive = StreamingZipReader::open(path)
.map_err(|e| ExcelError::ReadError(format!("Failed to open ZIP: {}", e)))?;
let sst = Self::load_shared_strings(&mut archive)?;
println!(
"📊 Loaded {} shared strings (~{:.2} MB in memory)",
sst.len(),
Self::estimate_sst_size(&sst) as f64 / (1024.0 * 1024.0)
);
let (sheet_names, sheet_paths) = Self::load_sheet_info(&mut archive)?;
println!("📋 Found {} sheets: {:?}", sheet_names.len(), sheet_names);
Ok(StreamingReader {
archive,
sst,
sheet_names,
sheet_paths,
})
}
pub fn sheet_names(&self) -> Vec<String> {
self.sheet_names.clone()
}
pub fn rows_by_index(&mut self, sheet_index: usize) -> Result<RowStructIterator<'_>> {
let sheet_name = self
.sheet_names
.get(sheet_index)
.ok_or_else(|| {
ExcelError::ReadError(format!(
"Sheet index {} out of bounds. Available: {} sheets",
sheet_index,
self.sheet_names.len()
))
})?
.clone();
self.rows(&sheet_name)
}
pub fn dimensions(&mut self, sheet_name: &str) -> Result<(usize, usize)> {
let mut row_count = 0;
let mut max_cols = 0;
for row_result in self.rows(sheet_name)? {
let row = row_result?;
row_count += 1;
max_cols = max_cols.max(row.cells.len());
}
Ok((row_count, max_cols))
}
pub fn stream_rows(&mut self, sheet_name: &str) -> Result<RowIterator<'_>> {
let sheet_path = self
.sheet_names
.iter()
.position(|name| name == sheet_name)
.and_then(|idx| self.sheet_paths.get(idx))
.ok_or_else(|| {
ExcelError::ReadError(format!(
"Sheet '{}' not found. Available sheets: {:?}",
sheet_name, self.sheet_names
))
})?
.clone();
let reader = self
.archive
.read_entry_streaming_by_name(&sheet_path)
.map_err(|e| ExcelError::ReadError(format!("Failed to open sheet: {}", e)))?;
Ok(RowIterator {
reader: BufReader::with_capacity(64 * 1024, reader), sst: &self.sst,
buffer: String::with_capacity(128 * 1024), pos: 0,
})
}
pub fn rows(&mut self, sheet_name: &str) -> Result<RowStructIterator<'_>> {
let inner = self.stream_rows(sheet_name)?;
Ok(RowStructIterator {
inner,
row_index: 0,
})
}
}
fn decode_xml_entities(text: &str) -> String {
text.replace("<", "<")
.replace(">", ">")
.replace("&", "&")
.replace(""", "\"")
.replace("'", "'")
}
fn parse_shared_string_item(si_block: &str) -> String {
let mut text = String::new();
let mut pos = 0;
while let Some(t_start) = si_block[pos..].find("<t") {
let t_start = pos + t_start;
let Some(t_open_end) = si_block[t_start..].find('>') else {
break;
};
let value_start = t_start + t_open_end + 1;
let Some(t_close) = si_block[value_start..].find("</t>") else {
break;
};
let value_end = value_start + t_close;
text.push_str(&decode_xml_entities(&si_block[value_start..value_end]));
pos = value_end + 4;
}
text
}
impl StreamingReader {
fn load_shared_strings(archive: &mut StreamingZipReader) -> Result<Vec<String>> {
let mut sst = Vec::new();
let xml_data = match archive.read_entry_by_name("xl/sharedStrings.xml") {
Ok(data) => String::from_utf8_lossy(&data).to_string(),
Err(_) => return Ok(sst), };
let mut pos = 0;
while let Some(si_start) = xml_data[pos..].find("<si") {
let si_start = pos + si_start;
if let Some(si_end) = xml_data[si_start..].find("</si>") {
let si_end = si_start + si_end + 5; let si_block = &xml_data[si_start..si_end];
sst.push(parse_shared_string_item(si_block));
pos = si_end;
} else {
break;
}
}
Ok(sst)
}
fn load_sheet_info(archive: &mut StreamingZipReader) -> Result<(Vec<String>, Vec<String>)> {
let mut sheet_names = Vec::new();
let mut sheet_ids = Vec::new();
let xml_data = archive
.read_entry_by_name("xl/workbook.xml")
.map_err(|e| ExcelError::ReadError(format!("Failed to open workbook.xml: {}", e)))?;
let xml_data = String::from_utf8_lossy(&xml_data).to_string();
let mut pos = 0;
while let Some(sheet_start) = xml_data[pos..].find("<sheet ") {
let sheet_start = pos + sheet_start;
if let Some(sheet_end) = xml_data[sheet_start..].find("/>") {
let sheet_end = sheet_start + sheet_end + 2;
let sheet_tag = &xml_data[sheet_start..sheet_end];
if let Some(name_start) = sheet_tag.find("name=\"") {
let name_start = name_start + 6;
if let Some(name_end) = sheet_tag[name_start..].find("\"") {
let name = &sheet_tag[name_start..name_start + name_end];
sheet_names.push(name.to_string());
}
}
if let Some(rid_start) = sheet_tag.find("r:id=\"") {
let rid_start = rid_start + 6;
if let Some(rid_end) = sheet_tag[rid_start..].find("\"") {
let rid = &sheet_tag[rid_start..rid_start + rid_end];
sheet_ids.push(rid.to_string());
}
}
pos = sheet_end;
} else {
break;
}
}
let mut sheet_paths = Vec::new();
let rels_data = archive
.read_entry_by_name("xl/_rels/workbook.xml.rels")
.map_err(|e| {
ExcelError::ReadError(format!("Failed to open workbook.xml.rels: {}", e))
})?;
let rels_data = String::from_utf8_lossy(&rels_data).to_string();
for rid in &sheet_ids {
if let Some(rel_start) = rels_data.find(&format!("Id=\"{}\"", rid)) {
let tag_start = rels_data[..rel_start]
.rfind("<Relationship")
.unwrap_or(rel_start.saturating_sub(100));
let tag_end = if let Some(end_pos) = rels_data[rel_start..].find("/>") {
rel_start + end_pos + 2
} else {
rels_data.len()
};
let rel_tag = &rels_data[tag_start..tag_end];
if let Some(target_start) = rel_tag.find("Target=\"") {
let target_start = target_start + 8;
if let Some(target_end) = rel_tag[target_start..].find("\"") {
let target = &rel_tag[target_start..target_start + target_end];
let full_path = format!("xl/{}", target);
sheet_paths.push(full_path);
}
}
}
}
if sheet_names.len() != sheet_paths.len() {
return Err(ExcelError::ReadError(format!(
"Mismatch between sheet names ({}) and paths ({})",
sheet_names.len(),
sheet_paths.len()
)));
}
Ok((sheet_names, sheet_paths))
}
fn estimate_sst_size(sst: &[String]) -> usize {
sst.iter().map(|s| s.len() + 24).sum() }
}
pub struct RowIterator<'a> {
reader: BufReader<Box<dyn Read + 'a>>,
sst: &'a [String],
buffer: String, pos: usize, }
impl<'a> Iterator for RowIterator<'a> {
type Item = Result<Vec<CellValue>>;
fn next(&mut self) -> Option<Self::Item> {
loop {
let search_slice = &self.buffer[self.pos..];
if let Some(start_idx) = search_slice.find("<row") {
let row_start = self.pos + start_idx;
if let Some(end_idx) = self.buffer[row_start..].find("</row>") {
let row_end = row_start + end_idx + 6;
let row_xml = &self.buffer[row_start..row_end];
let result = Self::parse_row(row_xml, self.sst);
self.pos = row_end;
return Some(result);
}
}
if self.pos > 0 {
if self.pos >= self.buffer.len() {
self.buffer.clear();
} else {
self.buffer.drain(..self.pos);
}
self.pos = 0;
}
let mut chunk = vec![0u8; 32 * 1024];
match self.reader.read(&mut chunk) {
Ok(0) => {
if !self.buffer.is_empty() {
self.buffer.clear();
}
return None;
}
Ok(n) => {
let s = String::from_utf8_lossy(&chunk[..n]);
self.buffer.push_str(&s);
}
Err(e) => {
return Some(Err(ExcelError::ReadError(format!(
"Failed to read XML: {}",
e
))))
}
}
}
}
}
impl<'a> RowIterator<'a> {
fn parse_row(row_xml: &str, sst: &[String]) -> Result<Vec<CellValue>> {
let mut row_data = Vec::new();
let mut pos = 0;
while let Some(cell_start) = row_xml[pos..]
.find("<c ")
.or_else(|| row_xml[pos..].find("<c>"))
{
let cell_start = pos + cell_start;
let (cell_end, cell_xml) =
if let Some(self_close_pos) = row_xml[cell_start..].find("/>") {
let end = cell_start + self_close_pos + 2;
let xml = &row_xml[cell_start..end];
(end, xml)
} else if let Some(close_tag_pos) = row_xml[cell_start..].find("</c>") {
let end = cell_start + close_tag_pos + 4;
let xml = &row_xml[cell_start..end];
(end, xml)
} else {
break; };
let col_idx = if let Some(r_start) = cell_xml.find("r=\"") {
let r_start = r_start + 3;
if let Some(r_end) = cell_xml[r_start..].find("\"") {
let cell_ref = &cell_xml[r_start..r_start + r_end];
parse_column_index(cell_ref)
} else {
row_data.len()
}
} else {
row_data.len()
};
while row_data.len() < col_idx {
row_data.push(CellValue::Empty);
}
let cell_type = if let Some(t_start) = cell_xml.find("t=\"") {
let t_start = t_start + 3;
if let Some(t_end) = cell_xml[t_start..].find("\"") {
&cell_xml[t_start..t_start + t_end]
} else {
""
}
} else {
"" };
let is_shared_string = cell_type == "s";
let is_inline_str = cell_type == "inlineStr";
let is_boolean = cell_type == "b";
let is_error = cell_type == "e";
let cell_value = if is_inline_str {
if let Some(t_start) = cell_xml.find("<t>") {
if let Some(t_end) = cell_xml[t_start..].find("</t>") {
let value = cell_xml[t_start + 3..t_start + t_end].to_string();
CellValue::String(decode_xml_entities(&value))
} else {
CellValue::Empty
}
} else {
CellValue::Empty
}
} else if let Some(v_start) = cell_xml.find("<v>") {
if let Some(v_end) = cell_xml[v_start..].find("</v>") {
let val_str = &cell_xml[v_start + 3..v_start + v_end];
if is_shared_string {
if let Ok(idx) = val_str.parse::<usize>() {
let value = sst.get(idx).cloned().unwrap_or_default();
CellValue::String(decode_xml_entities(&value))
} else {
CellValue::Empty
}
} else if is_boolean {
CellValue::Bool(val_str == "1")
} else if is_error {
CellValue::Error(val_str.to_string())
} else {
if let Ok(num) = val_str.parse::<f64>() {
let has_style = cell_xml.contains("s=\"");
if has_style && (1.0..=2958465.0).contains(&num) && num.fract() < 0.0001
{
CellValue::String(parse_excel_date(num))
} else if num.fract() == 0.0
&& (i64::MIN as f64..=i64::MAX as f64).contains(&num)
{
CellValue::Int(num as i64)
} else {
CellValue::Float(num)
}
} else {
CellValue::String(decode_xml_entities(val_str))
}
}
} else {
CellValue::Empty
}
} else {
CellValue::Empty
};
row_data.push(cell_value);
pos = cell_end;
}
Ok(row_data)
}
}
fn parse_column_index(cell_ref: &str) -> usize {
let mut col_idx = 0usize;
for ch in cell_ref.chars() {
if ch.is_ascii_alphabetic() {
col_idx = col_idx * 26 + (ch.to_ascii_uppercase() as usize - 'A' as usize + 1);
} else {
break;
}
}
col_idx.saturating_sub(1) }
pub struct RowStructIterator<'a> {
inner: RowIterator<'a>,
row_index: u32,
}
impl<'a> Iterator for RowStructIterator<'a> {
type Item = Result<Row>;
fn next(&mut self) -> Option<Self::Item> {
match self.inner.next()? {
Ok(cells) => {
let row = Row::new(self.row_index, cells);
self.row_index += 1;
Some(Ok(row))
}
Err(e) => Some(Err(e)),
}
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_estimate_sst_size() {
let sst = vec!["hello".to_string(), "world".to_string()];
let size = StreamingReader::estimate_sst_size(&sst);
assert!(size > 10); }
#[test]
fn test_parse_shared_string_text_with_attributes() {
let xml = r#"<si><t xml:space="preserve">ID бизнес-аккаунта</t></si>"#;
assert_eq!(parse_shared_string_item(xml), "ID бизнес-аккаунта");
}
#[test]
fn test_parse_shared_string_rich_text_runs() {
let xml = r#"<si><r><t>ID </t></r><r><t>бизнес-аккаунта</t></r></si>"#;
assert_eq!(parse_shared_string_item(xml), "ID бизнес-аккаунта");
}
#[test]
fn test_parse_shared_string_preserves_empty_items() {
let xml = r#"<si></si>"#;
assert_eq!(parse_shared_string_item(xml), "");
}
#[test]
fn test_parse_shared_string_xml_entities() {
let xml = r#"<si><t>A&B <tag> "quoted" 'single'</t></si>"#;
assert_eq!(
parse_shared_string_item(xml),
"A&B <tag> \"quoted\" 'single'"
);
}
#[test]
fn test_parse_row_resolves_shared_string() {
let sst = vec!["ID бизнес-аккаунта".to_string()];
let row_xml = r#"<row r="1"><c r="A1" t="s"><v>0</v></c></row>"#;
let row = RowIterator::parse_row(row_xml, &sst).unwrap();
assert_eq!(
row,
vec![CellValue::String("ID бизнес-аккаунта".to_string())]
);
}
#[test]
fn test_parse_excel_date() {
let date = parse_excel_date(44562.0);
assert_eq!(date, "2022-01-01", "Serial 44562 should be 2022-01-01");
let date = parse_excel_date(25569.0);
assert_eq!(date, "1970-01-01", "Serial 25569 should be 1970-01-01");
let date = parse_excel_date(36526.0);
assert_eq!(date, "2000-01-01", "Serial 36526 should be 2000-01-01");
let date = parse_excel_date(44196.0);
assert_eq!(date, "2020-12-31", "Serial 44196 should be 2020-12-31");
let date = parse_excel_date(43890.0);
assert_eq!(date, "2020-02-29", "Serial 43890 should be 2020-02-29");
let date = parse_excel_date(45217.0);
assert_eq!(date, "2023-10-18", "Serial 45217 should be 2023-10-18");
}
#[test]
fn test_parse_excel_datetime() {
let datetime = parse_excel_date(44562.5);
assert_eq!(
datetime, "2022-01-01 12:00:00",
"Serial 44562.5 should be 2022-01-01 12:00:00"
);
let datetime = parse_excel_date(44562.25);
assert_eq!(
datetime, "2022-01-01 06:00:00",
"Serial 44562.25 should be 2022-01-01 06:00:00"
);
let datetime = parse_excel_date(44562.75);
assert_eq!(
datetime, "2022-01-01 18:00:00",
"Serial 44562.75 should be 2022-01-01 18:00:00"
);
let datetime = parse_excel_date(44562.0 + (14.5 / 24.0));
assert_eq!(
datetime, "2022-01-01 14:30:00",
"Serial with 14:30 should parse correctly"
);
let datetime = parse_excel_date(44562.0);
assert_eq!(
datetime, "2022-01-01",
"Serial 44562.0 should be date only (midnight)"
);
let datetime = parse_excel_date(44562.00005);
assert_eq!(
datetime, "2022-01-01",
"Serial with tiny fraction should be date only"
);
}
#[test]
fn test_is_leap_year() {
assert!(is_leap_year(2024)); assert!(!is_leap_year(2023)); assert!(!is_leap_year(1900)); assert!(is_leap_year(2000)); }
#[test]
fn test_parse_excel_date_edge_cases() {
let next_century = parse_excel_date(73051.0);
assert_eq!(next_century, "2100-01-01", "Should handle next century");
let y2k = parse_excel_date(36526.0);
assert_eq!(y2k, "2000-01-01", "Y2K transition");
let feb28_1900 = parse_excel_date(59.0); let mar1_1900 = parse_excel_date(61.0); assert_eq!(feb28_1900, "1900-02-28", "Feb 28, 1900");
assert_eq!(mar1_1900, "1900-03-01", "Mar 1, 1900");
}
}