#![allow(clippy::cast_possible_truncation, clippy::cast_sign_loss)]
use std::collections::HashMap;
use std::io::{Cursor, Read};
use quick_xml::Reader as XmlReader;
use quick_xml::events::Event;
use serde::{Deserialize, Serialize};
use zip::ZipArchive;
use crate::Result;
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)]
#[serde(rename_all = "snake_case")]
pub enum NumFmtKind {
#[default]
General,
Number,
Date,
Time,
DateTime,
Currency,
Percentage,
Scientific,
Text,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MergedRegion {
pub top_row: u32,
pub left_col: u32,
pub bottom_row: u32,
pub right_col: u32,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TableDefinition {
pub name: String,
pub sheet_name: String,
pub headers: Vec<String>,
pub first_row: u32,
pub last_row: u32,
pub first_col: u32,
pub last_col: u32,
}
#[derive(Debug, Clone, Default)]
pub struct OoxmlMetadata {
pub num_fmts: HashMap<u32, NumFmtKind>,
pub cell_xfs: Vec<u32>,
pub merged_regions: HashMap<String, Vec<MergedRegion>>,
pub table_defs: Vec<TableDefinition>,
}
impl OoxmlMetadata {
#[must_use]
pub fn num_fmt_for_xf(&self, xf_index: u32) -> NumFmtKind {
self.cell_xfs
.get(xf_index as usize)
.and_then(|fmt_id| self.num_fmts.get(fmt_id))
.copied()
.unwrap_or(NumFmtKind::General)
}
}
fn classify_builtin_fmt(id: u32) -> NumFmtKind {
match id {
0 => NumFmtKind::General,
1..=4 | 37..=40 => NumFmtKind::Number,
5..=8 | 41..=44 => NumFmtKind::Currency,
9 | 10 => NumFmtKind::Percentage,
11 => NumFmtKind::Scientific,
14..=17 => NumFmtKind::Date,
18..=21 => NumFmtKind::Time,
22 => NumFmtKind::DateTime,
45..=48 => NumFmtKind::Time,
49 => NumFmtKind::Text,
_ => NumFmtKind::General,
}
}
fn classify_format_code(code: &str) -> NumFmtKind {
let lower = code.to_ascii_lowercase();
let cleaned = remove_quoted_sections(&lower);
let has_date = cleaned.contains('y') || cleaned.contains('d');
let has_month = cleaned.contains('m');
let has_time = cleaned.contains('h') || cleaned.contains('s');
let has_ampm = cleaned.contains("am/pm") || cleaned.contains("a/p");
if has_date && has_time {
return NumFmtKind::DateTime;
}
if has_date {
return NumFmtKind::Date;
}
if has_time || has_ampm {
return NumFmtKind::Time;
}
if has_month && !cleaned.contains('#') && !cleaned.contains('0') {
return NumFmtKind::Date;
}
if cleaned.contains('%') {
return NumFmtKind::Percentage;
}
if cleaned.contains("e+") || cleaned.contains("e-") {
return NumFmtKind::Scientific;
}
if cleaned.contains('$')
|| cleaned.contains('\u{20ac}')
|| cleaned.contains('\u{00a3}')
|| cleaned.contains('\u{00a5}')
|| cleaned.contains("eur")
|| cleaned.contains("usd")
|| cleaned.contains("gbp")
{
return NumFmtKind::Currency;
}
if cleaned.contains('@') {
return NumFmtKind::Text;
}
if cleaned.contains('#') || cleaned.contains('0') {
return NumFmtKind::Number;
}
NumFmtKind::General
}
fn remove_quoted_sections(code: &str) -> String {
let mut result = String::with_capacity(code.len());
let mut chars = code.chars().peekable();
while let Some(ch) = chars.next() {
if ch == '"' {
for c in chars.by_ref() {
if c == '"' {
break;
}
}
} else if ch == '\\' {
let _ = chars.next();
} else {
result.push(ch);
}
}
result
}
#[must_use]
pub fn parse_cell_ref(cell_ref: &str) -> Option<(u32, u32)> {
let mut col_str = String::new();
let mut row_str = String::new();
for ch in cell_ref.chars() {
if ch.is_ascii_alphabetic() {
col_str.push(ch.to_ascii_uppercase());
} else if ch.is_ascii_digit() {
row_str.push(ch);
}
}
if col_str.is_empty() || row_str.is_empty() {
return None;
}
let col = col_str
.chars()
.fold(0u32, |acc, c| acc * 26 + (c as u32 - b'A' as u32 + 1))
.saturating_sub(1);
let row = row_str.parse::<u32>().ok()?.saturating_sub(1);
Some((row, col))
}
#[must_use]
pub fn parse_range_ref(range_ref: &str) -> Option<((u32, u32), (u32, u32))> {
let parts: Vec<&str> = range_ref.split(':').collect();
if parts.len() != 2 {
return None;
}
let start = parse_cell_ref(parts[0])?;
let end = parse_cell_ref(parts[1])?;
Some((start, end))
}
pub fn parse_ooxml_metadata(xlsx_bytes: &[u8]) -> Result<OoxmlMetadata> {
let cursor = Cursor::new(xlsx_bytes);
let mut archive =
ZipArchive::new(cursor).map_err(|err| crate::MemvidError::ExtractionFailed {
reason: format!("failed to open xlsx zip: {err}").into(),
})?;
let mut metadata = OoxmlMetadata::default();
for id in 0..=49 {
let kind = classify_builtin_fmt(id);
if kind != NumFmtKind::General {
metadata.num_fmts.insert(id, kind);
}
}
if let Ok(styles_xml) = read_zip_entry(&mut archive, "xl/styles.xml") {
parse_styles_xml(&styles_xml, &mut metadata);
}
let sheet_names = collect_sheet_filenames(&mut archive);
for (sheet_name, zip_path) in &sheet_names {
if let Ok(sheet_xml) = read_zip_entry(&mut archive, zip_path) {
let regions = parse_merge_cells_xml(&sheet_xml);
if !regions.is_empty() {
metadata.merged_regions.insert(sheet_name.clone(), regions);
}
}
}
Ok(metadata)
}
fn read_zip_entry(
archive: &mut ZipArchive<Cursor<&[u8]>>,
path: &str,
) -> std::result::Result<String, ()> {
let mut file = archive.by_name(path).map_err(|_| ())?;
let mut buf = String::new();
file.read_to_string(&mut buf).map_err(|_| ())?;
Ok(buf)
}
fn collect_sheet_filenames(archive: &mut ZipArchive<Cursor<&[u8]>>) -> Vec<(String, String)> {
let mut sheets = Vec::new();
let sheet_names_from_wb = if let Ok(wb_xml) = read_zip_entry(archive, "xl/workbook.xml") {
parse_workbook_sheet_names(&wb_xml)
} else {
Vec::new()
};
for i in 0..archive.len() {
if let Ok(file) = archive.by_index(i) {
let name = file.name().to_string();
if name.starts_with("xl/worksheets/sheet") && name.ends_with(".xml") {
let num_str = name
.trim_start_matches("xl/worksheets/sheet")
.trim_end_matches(".xml");
if let Ok(num) = num_str.parse::<usize>() {
let display_name = sheet_names_from_wb
.get(num.saturating_sub(1))
.cloned()
.unwrap_or_else(|| format!("Sheet{num}"));
sheets.push((display_name, name));
}
}
}
}
sheets
}
fn parse_workbook_sheet_names(xml: &str) -> Vec<String> {
let mut reader = XmlReader::from_str(xml);
reader.trim_text(true);
let mut names = Vec::new();
let mut buf = Vec::new();
let mut in_sheets = false;
loop {
match reader.read_event_into(&mut buf) {
Ok(Event::Start(ref e) | Event::Empty(ref e)) if e.name().as_ref() == b"sheets" => {
in_sheets = true;
}
Ok(Event::Start(ref e) | Event::Empty(ref e))
if in_sheets && e.name().as_ref() == b"sheet" =>
{
for attr in e.attributes().flatten() {
if attr.key.as_ref() == b"name" {
if let Ok(val) = attr.decode_and_unescape_value(&reader) {
names.push(val.to_string());
}
}
}
}
Ok(Event::End(ref e)) if e.name().as_ref() == b"sheets" => {
in_sheets = false;
}
Ok(Event::Eof) => break,
Err(_) => break,
_ => {}
}
buf.clear();
}
names
}
fn parse_styles_xml(xml: &str, metadata: &mut OoxmlMetadata) {
let mut reader = XmlReader::from_str(xml);
reader.trim_text(true);
let mut buf = Vec::new();
let mut in_num_fmts = false;
let mut in_cell_xfs = false;
loop {
match reader.read_event_into(&mut buf) {
Ok(Event::Start(ref e) | Event::Empty(ref e)) => {
let tag = e.name();
match tag.as_ref() {
b"numFmts" => in_num_fmts = true,
b"cellXfs" => in_cell_xfs = true,
b"numFmt" if in_num_fmts => {
let mut fmt_id = None;
let mut fmt_code = None;
for attr in e.attributes().flatten() {
match attr.key.as_ref() {
b"numFmtId" => {
if let Ok(v) = attr.decode_and_unescape_value(&reader) {
fmt_id = v.parse::<u32>().ok();
}
}
b"formatCode" => {
if let Ok(v) = attr.decode_and_unescape_value(&reader) {
fmt_code = Some(v.to_string());
}
}
_ => {}
}
}
if let (Some(id), Some(code)) = (fmt_id, fmt_code) {
metadata.num_fmts.insert(id, classify_format_code(&code));
}
}
b"xf" if in_cell_xfs => {
let mut num_fmt_id = 0u32;
for attr in e.attributes().flatten() {
if attr.key.as_ref() == b"numFmtId" {
if let Ok(v) = attr.decode_and_unescape_value(&reader) {
num_fmt_id = v.parse::<u32>().unwrap_or(0);
}
}
}
metadata.cell_xfs.push(num_fmt_id);
}
_ => {}
}
}
Ok(Event::End(ref e)) => match e.name().as_ref() {
b"numFmts" => in_num_fmts = false,
b"cellXfs" => in_cell_xfs = false,
_ => {}
},
Ok(Event::Eof) => break,
Err(_) => break,
_ => {}
}
buf.clear();
}
}
fn parse_merge_cells_xml(xml: &str) -> Vec<MergedRegion> {
let mut reader = XmlReader::from_str(xml);
reader.trim_text(true);
let mut buf = Vec::new();
let mut regions = Vec::new();
let mut in_merge_cells = false;
loop {
match reader.read_event_into(&mut buf) {
Ok(Event::Start(ref e)) if e.name().as_ref() == b"mergeCells" => {
in_merge_cells = true;
}
Ok(Event::Start(ref e) | Event::Empty(ref e))
if in_merge_cells && e.name().as_ref() == b"mergeCell" =>
{
for attr in e.attributes().flatten() {
if attr.key.as_ref() == b"ref" {
if let Ok(val) = attr.decode_and_unescape_value(&reader) {
if let Some(((tr, lc), (br, rc))) = parse_range_ref(&val) {
regions.push(MergedRegion {
top_row: tr,
left_col: lc,
bottom_row: br,
right_col: rc,
});
}
}
}
}
}
Ok(Event::End(ref e)) if e.name().as_ref() == b"mergeCells" => {
in_merge_cells = false;
}
Ok(Event::Eof) => break,
Err(_) => break,
_ => {}
}
buf.clear();
}
regions
}
#[must_use]
pub fn excel_serial_to_iso(serial: f64) -> Option<String> {
if serial < 0.0 {
return None;
}
let days_from_epoch = serial.floor() as i64;
let frac = serial - serial.floor();
let base = chrono::NaiveDate::from_ymd_opt(1899, 12, 31)?;
let adjusted_days = if days_from_epoch > 60 {
days_from_epoch - 1
} else {
days_from_epoch
};
let date = base.checked_add_signed(chrono::Duration::days(adjusted_days))?;
if frac > 0.0001 {
let total_seconds = (frac * 86400.0).round() as u32;
let hours = total_seconds / 3600;
let minutes = (total_seconds % 3600) / 60;
let seconds = total_seconds % 60;
let time = chrono::NaiveTime::from_hms_opt(hours, minutes, seconds)?;
Some(
chrono::NaiveDateTime::new(date, time)
.format("%Y-%m-%d %H:%M:%S")
.to_string(),
)
} else {
Some(date.format("%Y-%m-%d").to_string())
}
}
#[must_use]
pub fn format_percentage(val: f64) -> String {
let pct = val * 100.0;
if (pct - pct.round()).abs() < 0.001 {
format!("{}%", pct.round() as i64)
} else {
format!("{pct:.1}%")
}
}
#[must_use]
pub fn format_currency(val: f64, code: &str) -> String {
let lower = code.to_ascii_lowercase();
let symbol = if lower.contains('$') || lower.contains("usd") {
"$"
} else if lower.contains('\u{20ac}') || lower.contains("eur") {
"\u{20ac}"
} else if lower.contains('\u{00a3}') || lower.contains("gbp") {
"\u{00a3}"
} else if lower.contains('\u{00a5}') || lower.contains("jpy") || lower.contains("cny") {
"\u{00a5}"
} else {
"$" };
if val < 0.0 {
format!("-{symbol}{:.2}", val.abs())
} else {
format!("{symbol}{val:.2}")
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_classify_builtin_fmts() {
assert_eq!(classify_builtin_fmt(0), NumFmtKind::General);
assert_eq!(classify_builtin_fmt(1), NumFmtKind::Number);
assert_eq!(classify_builtin_fmt(5), NumFmtKind::Currency);
assert_eq!(classify_builtin_fmt(9), NumFmtKind::Percentage);
assert_eq!(classify_builtin_fmt(11), NumFmtKind::Scientific);
assert_eq!(classify_builtin_fmt(14), NumFmtKind::Date);
assert_eq!(classify_builtin_fmt(18), NumFmtKind::Time);
assert_eq!(classify_builtin_fmt(22), NumFmtKind::DateTime);
assert_eq!(classify_builtin_fmt(49), NumFmtKind::Text);
}
#[test]
fn test_classify_custom_formats() {
assert_eq!(classify_format_code("yyyy-mm-dd"), NumFmtKind::Date);
assert_eq!(classify_format_code("mm/dd/yyyy"), NumFmtKind::Date);
assert_eq!(classify_format_code("hh:mm:ss"), NumFmtKind::Time);
assert_eq!(
classify_format_code("yyyy-mm-dd hh:mm"),
NumFmtKind::DateTime
);
assert_eq!(classify_format_code("0.00%"), NumFmtKind::Percentage);
assert_eq!(classify_format_code("0.00E+00"), NumFmtKind::Scientific);
assert_eq!(classify_format_code("$#,##0.00"), NumFmtKind::Currency);
assert_eq!(
classify_format_code("\u{20ac}#,##0.00"),
NumFmtKind::Currency
);
assert_eq!(classify_format_code("#,##0.00"), NumFmtKind::Number);
assert_eq!(classify_format_code("@"), NumFmtKind::Text);
}
#[test]
fn test_parse_cell_ref() {
assert_eq!(parse_cell_ref("A1"), Some((0, 0)));
assert_eq!(parse_cell_ref("B5"), Some((4, 1)));
assert_eq!(parse_cell_ref("Z1"), Some((0, 25)));
assert_eq!(parse_cell_ref("AA1"), Some((0, 26)));
assert_eq!(parse_cell_ref("AZ100"), Some((99, 51)));
}
#[test]
fn test_parse_range_ref() {
let result = parse_range_ref("A1:D3");
assert_eq!(result, Some(((0, 0), (2, 3))));
}
#[test]
fn test_excel_serial_to_iso() {
assert_eq!(excel_serial_to_iso(1.0), Some("1900-01-01".to_string()));
assert_eq!(excel_serial_to_iso(44927.0), Some("2023-01-01".to_string()));
assert!(excel_serial_to_iso(-1.0).is_none());
}
#[test]
fn test_format_percentage() {
assert_eq!(format_percentage(0.153), "15.3%");
assert_eq!(format_percentage(0.5), "50%");
assert_eq!(format_percentage(1.0), "100%");
}
#[test]
fn test_format_currency() {
assert_eq!(format_currency(10.5, "$#,##0.00"), "$10.50");
assert_eq!(format_currency(-10.5, "$#,##0.00"), "-$10.50");
assert_eq!(format_currency(10.5, "\u{20ac}#,##0.00"), "\u{20ac}10.50");
}
#[test]
fn test_quoted_section_removal() {
assert_eq!(
remove_quoted_sections("yyyy\"year\"mm\"month\"dd\"day\""),
"yyyymmdd"
);
assert_eq!(remove_quoted_sections("#,##0.00\"$\""), "#,##0.00");
}
}