sds-converter-core 0.3.0

Convert chemical safety SDS documents (PDF/DOCX) ↔ MHLW/JIS Z 7253 standard JSON via LLM. Supports Claude, GPT, Gemini. Multilingual (ja/en/zh).
Documentation
use std::collections::HashMap;
use std::io::{Read, Write};
use std::path::Path;

use crate::error::SdsError;
use crate::schema::SdsRoot;

const MAX_TEMPLATE_BYTES: u64 = 50 * 1024 * 1024; // 50 MB
const MAX_ENTRY_BYTES: u64 = 100 * 1024 * 1024; // 100 MB per ZIP entry

/// Fill a Word (.docx) template by replacing `{{FieldName}}` placeholders with
/// values from an [`SdsRoot`]. Placeholders use the leaf field name (e.g.
/// `{{TradeNameJP}}`, `{{CompanyName}}`). Full dot-path keys are also supported
/// (e.g. `{{Identification.SupplierInformation.CompanyName}}`).
///
/// If the same leaf name appears in multiple sections the first non-empty value
/// is used; use the full path form to target a specific occurrence.
pub fn fill_template(
    sds: &SdsRoot,
    template_path: &Path,
    output_path: &Path,
) -> Result<(), SdsError> {
    let values = flatten_sds(sds)?;

    let meta = std::fs::metadata(template_path)
        .map_err(|e| SdsError::Extract(format!("template stat failed: {e}")))?;
    if meta.len() > MAX_TEMPLATE_BYTES {
        return Err(SdsError::Extract(format!(
            "template file too large ({} bytes, limit {} MB)",
            meta.len(),
            MAX_TEMPLATE_BYTES / 1024 / 1024
        )));
    }

    let template_bytes = std::fs::read(template_path)
        .map_err(|e| SdsError::Extract(format!("template open failed: {e}")))?;

    let filled = fill_docx_bytes(&template_bytes, &values)?;

    std::fs::write(output_path, &filled)
        .map_err(|e| SdsError::Extract(format!("output write failed: {e}")))?;

    Ok(())
}

// ── Internal helpers ─────────────────────────────────────────────────────────

/// Build a flat `key → value` map from SdsRoot.
/// Each leaf string/number/bool gets two entries:
///   - short key  (`TradeNameJP`)
///   - full path  (`Identification.TradeProductIdentity.TradeNameJP`)
///
/// Vec<String> values are joined with `\n`.
pub(crate) fn flatten_sds(sds: &SdsRoot) -> Result<HashMap<String, String>, SdsError> {
    let v = serde_json::to_value(sds)
        .map_err(|e| SdsError::Extract(format!("SDS serialize error: {e}")))?;
    let mut map = HashMap::new();
    flatten_value("", &v, &mut map);
    Ok(map)
}

fn flatten_value(prefix: &str, value: &serde_json::Value, map: &mut HashMap<String, String>) {
    if let serde_json::Value::Object(obj) = value {
        for (k, v) in obj {
            let full = if prefix.is_empty() {
                k.clone()
            } else {
                format!("{prefix}.{k}")
            };
            match v {
                serde_json::Value::String(s) if !s.is_empty() => {
                    map.entry(k.clone()).or_insert_with(|| s.clone());
                    map.insert(full, s.clone());
                }
                serde_json::Value::Number(n) => {
                    let s = n.to_string();
                    map.entry(k.clone()).or_insert_with(|| s.clone());
                    map.insert(full, s);
                }
                serde_json::Value::Bool(b) => {
                    let s = b.to_string();
                    map.entry(k.clone()).or_insert_with(|| s.clone());
                    map.insert(full, s);
                }
                serde_json::Value::Array(arr) => {
                    let s: Vec<&str> = arr.iter().filter_map(|v| v.as_str()).collect();
                    if !s.is_empty() {
                        let joined = s.join("\n");
                        map.entry(k.clone()).or_insert_with(|| joined.clone());
                        map.insert(full, joined);
                    }
                }
                _ => flatten_value(&full, v, map),
            }
        }
    }
}

/// Read template .docx bytes, fill placeholders, return filled .docx bytes.
fn fill_docx_bytes(
    template: &[u8],
    values: &HashMap<String, String>,
) -> Result<Vec<u8>, SdsError> {
    use std::io::Cursor;
    use zip::{ZipArchive, ZipWriter, write::SimpleFileOptions};

    let mut archive = ZipArchive::new(Cursor::new(template))
        .map_err(|e| SdsError::Extract(format!("template ZIP read failed: {e}")))?;

    // Collect all entry names first.
    let names: Vec<String> = (0..archive.len())
        .map(|i| archive.by_index(i).map(|e| e.name().to_string()))
        .collect::<Result<_, _>>()
        .map_err(|e| SdsError::Extract(format!("ZIP index failed: {e}")))?;

    let mut out = Vec::new();
    let mut writer = ZipWriter::new(Cursor::new(&mut out));

    for name in &names {
        let entry = archive
            .by_name(name)
            .map_err(|e| SdsError::Extract(format!("ZIP entry '{name}' read failed: {e}")))?;

        let compression = entry.compression();
        let options = SimpleFileOptions::default().compression_method(compression);

        writer
            .start_file(name, options)
            .map_err(|e| SdsError::Extract(format!("ZIP write '{name}' failed: {e}")))?;

        // Substitute placeholders in XML parts that may contain them.
        if is_content_xml(name) {
            let mut xml = String::new();
            entry
                .take(MAX_ENTRY_BYTES)
                .read_to_string(&mut xml)
                .map_err(|e| SdsError::Extract(format!("read '{name}' failed: {e}")))?;
            let filled = apply_substitutions(&xml, values);
            writer
                .write_all(filled.as_bytes())
                .map_err(|e| SdsError::Extract(format!("ZIP write '{name}' failed: {e}")))?;
        } else {
            let mut buf = Vec::new();
            entry
                .take(MAX_ENTRY_BYTES)
                .read_to_end(&mut buf)
                .map_err(|e| SdsError::Extract(format!("read '{name}' failed: {e}")))?;
            writer
                .write_all(&buf)
                .map_err(|e| SdsError::Extract(format!("ZIP write '{name}' failed: {e}")))?;
        }
    }

    writer
        .finish()
        .map_err(|e| SdsError::Extract(format!("ZIP finalize failed: {e}")))?;

    Ok(out)
}

/// Returns true for XML parts that may contain user-authored text with placeholders.
fn is_content_xml(name: &str) -> bool {
    name == "word/document.xml"
        || (name.starts_with("word/header") && name.ends_with(".xml"))
        || (name.starts_with("word/footer") && name.ends_with(".xml"))
}

/// Normalize split placeholders then replace `{{key}}` with values.
/// Uses a single O(doc_size) pass instead of O(keys × doc_size) repeated replacements.
fn apply_substitutions(xml: &str, values: &HashMap<String, String>) -> String {
    let normalized = normalize_split_runs(xml);
    let mut out = String::with_capacity(normalized.len());
    let mut rest = normalized.as_str();
    while let Some(start) = rest.find("{{") {
        out.push_str(&rest[..start]);
        rest = &rest[start + 2..];
        if let Some(end) = rest.find("}}") {
            let key = &rest[..end];
            if let Some(value) = values.get(key) {
                out.push_str(&escape_xml(value));
            } else {
                out.push_str("{{");
                out.push_str(key);
                out.push_str("}}");
            }
            rest = &rest[end + 2..];
        } else {
            out.push_str("{{");
        }
    }
    out.push_str(rest);
    out
}

/// Remove XML run-boundary tags that appear *inside* a `{{...}}` placeholder.
///
/// Word sometimes splits a typed word across multiple `<w:r>` runs (especially
/// after spell-check or autocorrect). This state-machine pass merges such splits
/// so that `{{Trade</w:t></w:r><w:r><w:t>NameJP}}` becomes `{{TradeNameJP}}`.
///
/// Uses slice-based output to correctly handle multi-byte UTF-8 characters
/// (e.g. Japanese text in fixed parts of the template).
fn normalize_split_runs(xml: &str) -> String {
    let mut out = String::with_capacity(xml.len());
    let bytes = xml.as_bytes();
    let n = bytes.len();
    let mut i = 0;
    let mut copy_start = 0;

    while i < n {
        if i + 1 < n && bytes[i] == b'{' && bytes[i + 1] == b'{' {
            // Flush verbatim bytes before this placeholder
            out.push_str(&xml[copy_start..i]);
            out.push_str("{{");
            i += 2;
            copy_start = i;

            loop {
                if i >= n {
                    break;
                }
                if i + 1 < n && bytes[i] == b'}' && bytes[i + 1] == b'}' {
                    // Flush placeholder text and close
                    out.push_str(&xml[copy_start..i]);
                    out.push_str("}}");
                    i += 2;
                    copy_start = i;
                    break;
                }
                if bytes[i] == b'<' {
                    // Flush placeholder text before this tag, then skip the tag
                    out.push_str(&xml[copy_start..i]);
                    while i < n && bytes[i] != b'>' {
                        i += 1;
                    }
                    if i < n {
                        i += 1; // consume '>'
                    }
                    copy_start = i;
                } else {
                    i += 1;
                }
            }
        } else {
            i += 1;
        }
    }

    // Flush remaining bytes
    out.push_str(&xml[copy_start..]);
    out
}

fn escape_xml(s: &str) -> String {
    s.replace('&', "&amp;")
        .replace('<', "&lt;")
        .replace('>', "&gt;")
        .replace('"', "&quot;")
        .replace('\'', "&apos;")
}

// ── Tests ─────────────────────────────────────────────────────────────────────

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn normalize_no_split() {
        let xml = "<w:t>{{TradeNameJP}}</w:t>";
        assert_eq!(normalize_split_runs(xml), xml);
    }

    #[test]
    fn normalize_preserves_cjk_in_fixed_text() {
        let xml = "<w:t>製品名:{{TradeName}}</w:t>";
        assert_eq!(normalize_split_runs(xml), xml);
    }

    #[test]
    fn normalize_split_with_cjk_around_placeholder() {
        let xml = "<w:t>会社:{{Co</w:t></w:r><w:r><w:t>mp}}様</w:t>";
        assert_eq!(normalize_split_runs(xml), "<w:t>会社:{{Comp}}様</w:t>");
    }

    #[test]
    fn normalize_split_across_runs() {
        let xml = "<w:t>{{Trade</w:t></w:r><w:r><w:t>NameJP}}</w:t>";
        assert_eq!(normalize_split_runs(xml), "<w:t>{{TradeNameJP}}</w:t>");
    }

    #[test]
    fn normalize_split_with_rpr() {
        let xml = "<w:t>{{Co</w:t></w:r><w:r><w:rPr><w:b/></w:rPr><w:t>mpanyName}}</w:t>";
        assert_eq!(normalize_split_runs(xml), "<w:t>{{CompanyName}}</w:t>");
    }

    #[test]
    fn apply_substitution_basic() {
        let xml = "<w:t>{{CompanyName}}</w:t>";
        let mut values = HashMap::new();
        values.insert("CompanyName".to_string(), "ACME Corp".to_string());
        assert_eq!(apply_substitutions(xml, &values), "<w:t>ACME Corp</w:t>");
    }

    #[test]
    fn apply_substitution_xml_escape() {
        let xml = "<w:t>{{CompanyName}}</w:t>";
        let mut values = HashMap::new();
        values.insert("CompanyName".to_string(), "A & B <Ltd>".to_string());
        assert_eq!(
            apply_substitutions(xml, &values),
            "<w:t>A &amp; B &lt;Ltd&gt;</w:t>"
        );
    }

    #[test]
    fn flatten_sds_partial() {
        use crate::schema::{SdsRoot, Datasheet};
        let sds = SdsRoot {
            datasheet: Some(Datasheet {
                issue_date: Some("2024-01-01".to_string()),
                ..Default::default()
            }),
            ..Default::default()
        };
        let map = flatten_sds(&sds).unwrap();
        assert_eq!(map.get("IssueDate").map(String::as_str), Some("2024-01-01"));
        assert!(map.contains_key("Datasheet.IssueDate"));
    }
}