rulemorph 0.3.1

YAML-based declarative data transformation engine for CSV/JSON to JSON
Documentation
use std::collections::HashSet;
use std::io::Cursor;

use quick_xml::events::Event;
use quick_xml::reader::Reader as XmlReader;
use zip::ZipArchive;

use crate::error::{TransformError, TransformErrorKind};
use crate::model::ExcelInput;
use crate::normalization::NormalizationOptions;

use super::super::invalid;
use super::super::workbook::selected_worksheet_path;
use super::super::worksheet::inspect_worksheet_xml;
use super::super::xml::local_name;
use super::read_zip_text;

pub(in crate::normalization::excel) fn preflight_xlsx_package(
    bytes: &[u8],
    excel: &ExcelInput,
    options: &NormalizationOptions,
) -> Result<(), TransformError> {
    let mut archive = ZipArchive::new(Cursor::new(bytes)).map_err(|err| {
        TransformError::new(
            TransformErrorKind::InvalidInput,
            format!("invalid Excel ZIP package: {}", err),
        )
    })?;
    if archive.len() > options.max_excel_zip_entries {
        return Err(invalid("input exceeds max_excel_zip_entries"));
    }

    let mut total_uncompressed = 0usize;
    let mut content_types = None;
    let mut workbook_xml = None;
    let mut workbook_rels = None;
    let mut worksheet_count = 0usize;
    let mut seen_entry_names = HashSet::new();
    for index in 0..archive.len() {
        let mut entry = archive.by_index(index).map_err(|err| {
            TransformError::new(
                TransformErrorKind::InvalidInput,
                format!("failed to inspect Excel ZIP entry: {}", err),
            )
        })?;
        let name = entry.name().to_string();
        let lower_name = name.to_ascii_lowercase();
        if !seen_entry_names.insert(lower_name.clone()) {
            return Err(invalid("Excel ZIP entry names must be unique"));
        }
        let size =
            usize::try_from(entry.size()).map_err(|_| invalid("Excel ZIP entry too large"))?;
        if size > options.max_excel_entry_uncompressed_bytes {
            return Err(invalid("input exceeds max_excel_entry_uncompressed_bytes"));
        }
        total_uncompressed = total_uncompressed
            .checked_add(size)
            .ok_or_else(|| invalid("input exceeds max_excel_uncompressed_bytes"))?;
        if total_uncompressed > options.max_excel_uncompressed_bytes {
            return Err(invalid("input exceeds max_excel_uncompressed_bytes"));
        }
        if lower_name.ends_with("vbaproject.bin") {
            return Err(invalid("Excel macros are not supported"));
        }
        if lower_name == "[content_types].xml" {
            content_types = Some(read_zip_text(&mut entry)?);
        } else if lower_name == "xl/workbook.xml" {
            workbook_xml = Some(read_zip_text(&mut entry)?);
        } else if lower_name == "xl/_rels/workbook.xml.rels" {
            let rels = read_zip_text(&mut entry)?;
            reject_external_relationships(&rels)?;
            workbook_rels = Some(rels);
        } else if lower_name.starts_with("xl/worksheets/") && lower_name.ends_with(".xml") {
            worksheet_count = worksheet_count.saturating_add(1);
            if worksheet_count > options.max_excel_sheets {
                return Err(invalid("input exceeds max_excel_sheets"));
            }
        } else if lower_name.ends_with(".rels") {
            let rels = read_zip_text(&mut entry)?;
            reject_external_relationships(&rels)?;
        } else if lower_name == "xl/sharedstrings.xml" {
            if size > options.max_excel_shared_string_bytes {
                return Err(invalid("input exceeds max_excel_shared_string_bytes"));
            }
            let shared_strings = read_zip_text(&mut entry)?;
            if count_xml_elements(&shared_strings, b"si")? > options.max_excel_shared_strings {
                return Err(invalid("input exceeds max_excel_shared_strings"));
            }
        } else if lower_name == "xl/styles.xml" {
            let styles = read_zip_text(&mut entry)?;
            if count_xml_elements(&styles, b"xf")? > options.max_excel_styles {
                return Err(invalid("input exceeds max_excel_styles"));
            }
        }
    }

    let content_types =
        content_types.ok_or_else(|| invalid("Excel package is missing [Content_Types].xml"))?;
    let lower_content_types = content_types.to_ascii_lowercase();
    if lower_content_types.contains("macroenabled")
        || lower_content_types.contains("application/vnd.ms-excel.sheet.binary.macroenabled")
    {
        return Err(invalid("Excel macros are not supported"));
    }
    if !lower_content_types
        .contains("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet.main+xml")
    {
        return Err(invalid("only .xlsx workbooks are supported"));
    }
    let workbook_xml =
        workbook_xml.ok_or_else(|| invalid("Excel package is missing workbook.xml"))?;
    let workbook_rels =
        workbook_rels.ok_or_else(|| invalid("Excel package is missing workbook relationships"))?;
    let selected_worksheet_path = selected_worksheet_path(&workbook_xml, &workbook_rels, excel)?;
    let mut selected_sheet = archive.by_name(&selected_worksheet_path).map_err(|err| {
        TransformError::new(
            TransformErrorKind::InvalidInput,
            format!("failed to inspect selected Excel worksheet: {}", err),
        )
    })?;
    let selected_sheet = read_zip_text(&mut selected_sheet)?;
    let counts = inspect_worksheet_xml(&selected_sheet, excel.formula)?;
    if counts.rows > options.max_excel_rows || counts.max_row > options.max_excel_rows {
        return Err(invalid("input exceeds max_excel_rows"));
    }
    if counts.cells > options.max_excel_cells {
        return Err(invalid("input exceeds max_excel_cells"));
    }
    let dense_cells = counts
        .max_row
        .checked_mul(counts.max_col)
        .ok_or_else(|| invalid("input exceeds max_excel_cells"))?;
    if dense_cells > options.max_excel_cells {
        return Err(invalid("input exceeds max_excel_cells"));
    }
    Ok(())
}

/// Rejects relationship XML that appears to reference a target outside the package.
///
/// The text is normalised by dropping ASCII whitespace and lower-casing ASCII
/// letters, then searched for `TargetMode` set to `external` or a `Target`
/// beginning with `http:`, `https:` or `file:`, with either quote style.
///
/// # Errors
///
/// Returns the error built by `invalid` when any of those markers is found.
fn reject_external_relationships(rels: &str) -> Result<(), TransformError> {
    const EXTERNAL_MARKERS: [&str; 8] = [
        "targetmode=\"external\"",
        "targetmode='external'",
        "target=\"http:",
        "target=\"https:",
        "target=\"file:",
        "target='http:",
        "target='https:",
        "target='file:",
    ];

    // Whitespace is removed so that spacing around `=` cannot hide a marker.
    let normalized: String = rels
        .chars()
        .filter(|ch| !ch.is_ascii_whitespace())
        .map(|ch| ch.to_ascii_lowercase())
        .collect();

    let has_external_target = EXTERNAL_MARKERS
        .iter()
        .any(|marker| normalized.contains(*marker));
    if has_external_target {
        Err(invalid("Excel external relationships are not supported"))
    } else {
        Ok(())
    }
}

/// Counts the start and empty-element tags in `xml` whose local name equals `name`.
///
/// The count saturates at `usize::MAX` instead of overflowing.
///
/// # Errors
///
/// Returns an `InvalidInput` [`TransformError`] when the XML reader reports a
/// parse error.
fn count_xml_elements(xml: &str, name: &[u8]) -> Result<usize, TransformError> {
    let mut reader = XmlReader::from_str(xml);
    reader.trim_text(false);

    let mut matches = 0usize;
    loop {
        let event = reader.read_event().map_err(|err| {
            TransformError::new(
                TransformErrorKind::InvalidInput,
                format!("failed to parse Excel XML: {}", err),
            )
        })?;
        match event {
            Event::Eof => return Ok(matches),
            Event::Start(tag) | Event::Empty(tag) => {
                if local_name(tag.name().as_ref()) == name {
                    matches = matches.saturating_add(1);
                }
            }
            _ => {}
        }
    }
}