Skip to main content

libreoffice_pure/
lib.rs

//! High-level pure-Rust convenience helpers that mirror Clark's current
//! `soffice --headless` usage without relying on LibreOffice itself.
//!
//! The Clark-focused surface in this crate is:
//! - visual `DOCX -> PDF`
//! - visual `PPTX -> PDF`
//! - `DOC -> DOCX`
//! - `XLSX` recalc with cached `<v>` patching
//! - tracked-change acceptance for `DOCX`
//! - generic `convert_bytes` / `convert_bytes_auto`
//! - JSON recalc reports compatible with Clark's existing `recalc.py`
//! - direct DOCX/PPTX page rasterization to PNG/JPEG
//! - Markdown extraction for DOCX/PPTX/XLSX
//! - PDF -> TXT/MD/HTML via the native PDF reader
15
16mod xlsx_eval;
17
18use std::collections::{BTreeMap, BTreeSet};
19use std::path::Path;
20
21use lo_core::{
22    parse_xml_document, serialize_xml_document, CellAddr, LoError, Result, Workbook,
23    XmlItem, XmlNode,
24};
25use lo_zip::{normalize_zip_path, rels_path_for, resolve_part_target, ZipArchive};
26use xlsx_eval::{translate_shared_formula, EvalValue, WorkbookEvaluator};
27
28/// Convert a DOCX byte stream into a PDF using Writer's native Rust
29/// layout/rendering path.
30pub fn docx_to_pdf_bytes(bytes: &[u8]) -> Result<Vec<u8>> {
31    let doc = lo_writer::from_docx_bytes("document", bytes)?;
32    lo_writer::save_as(&doc, "pdf")
33}
34
35/// Convert a legacy binary `.doc` file (Word 97-2003) into a DOCX byte
36/// stream by extracting the piece-table text and re-emitting it.
37pub fn doc_to_docx_bytes(bytes: &[u8]) -> Result<Vec<u8>> {
38    let doc = lo_writer::from_doc_bytes("document", bytes)?;
39    lo_writer::save_as(&doc, "docx")
40}
41
42/// Convert a PPTX byte stream into a PDF using Impress's native Rust
43/// renderer.
44pub fn pptx_to_pdf_bytes(bytes: &[u8]) -> Result<Vec<u8>> {
45    let deck = lo_impress::from_pptx_bytes("presentation", bytes)?;
46    lo_impress::save_as(&deck, "pdf")
47}
48
// ---------------------------------------------------------------------------
// Generic format conversion and sniffing
// ---------------------------------------------------------------------------
52
/// Application family that a source format is routed to by
/// `family_for_source` / `convert_bytes`.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum Family {
    /// Text documents; handled by `writer_convert_bytes`.
    Writer,
    /// Spreadsheets; handled by `calc_convert_bytes`.
    Calc,
    /// Presentations; handled by `impress_convert_bytes`.
    Impress,
    /// Drawings; handled by `draw_convert_bytes`.
    Draw,
    /// Formulas; handled by `math_convert_bytes`.
    Math,
    /// Databases; handled by `base_convert_bytes`.
    Base,
}
62
/// Normalize a user-supplied format hint to the canonical lowercase name.
///
/// Steps: trim surrounding whitespace, drop one leading `.`, keep only
/// the part before the first `:` (trimmed again), lowercase it, then map
/// the known aliases (`text`, `markdown`, `htm`, `mml`, `odfmath`,
/// `odf-formula`) to their canonical spelling. Anything else is returned
/// lowercased but otherwise unchanged.
fn canonical_format_hint(format: &str) -> String {
    let without_dot = {
        let stripped = format.trim();
        stripped.strip_prefix('.').unwrap_or(stripped)
    };
    // `split` always yields at least one piece; the fallback is defensive.
    let head = match without_dot.split(':').next() {
        Some(first) => first,
        None => without_dot,
    }
    .trim();
    let lowered = head.to_ascii_lowercase();
    let canonical = match lowered.as_str() {
        "text" => "txt",
        "markdown" => "md",
        "htm" => "html",
        "mml" => "mathml",
        "odfmath" | "odf-formula" => "odf",
        _ => lowered.as_str(),
    };
    canonical.to_string()
}
76
77/// Infer a format hint from a file path by looking at its extension.
78pub fn sniff_format_from_path(path: &str) -> Option<String> {
79    let ext = Path::new(path).extension()?.to_str()?;
80    Some(canonical_format_hint(ext))
81}
82
83/// Infer a format from raw bytes.
84///
85/// Covers the Clark-heavy cases:
86/// - OOXML packages (`docx`/`xlsx`/`pptx`)
87/// - ODF packages (`odt`/`ods`/`odp`)
88/// - legacy CFB files (`doc`/`xls`/`ppt`)
89/// - PDF documents (`pdf`)
90/// - plain-text-ish payloads (`txt`/`md`/`html`/`csv`/`svg`)
91pub fn sniff_format_from_bytes(bytes: &[u8]) -> Option<String> {
92    if bytes.len() >= 4 && &bytes[..4] == b"PK\x03\x04" {
93        let zip = ZipArchive::new(bytes).ok()?;
94        if zip.contains("[Content_Types].xml") {
95            let content_types = zip.read_string("[Content_Types].xml").ok()?;
96            let lower = content_types.to_ascii_lowercase();
97            if lower.contains("wordprocessingml") {
98                return Some("docx".to_string());
99            }
100            if lower.contains("spreadsheetml") {
101                return Some("xlsx".to_string());
102            }
103            if lower.contains("presentationml") {
104                return Some("pptx".to_string());
105            }
106        }
107        if zip.contains("mimetype") {
108            let mimetype = zip.read_string("mimetype").ok()?.to_ascii_lowercase();
109            if mimetype.contains("opendocument.text") {
110                return Some("odt".to_string());
111            }
112            if mimetype.contains("opendocument.spreadsheet") {
113                return Some("ods".to_string());
114            }
115            if mimetype.contains("opendocument.presentation") {
116                return Some("odp".to_string());
117            }
118        }
119    }
120    if bytes.len() >= 8 && bytes[..8] == [0xD0, 0xCF, 0x11, 0xE0, 0xA1, 0xB1, 0x1A, 0xE1] {
121        if find_bytes(bytes, b"WordDocument") {
122            return Some("doc".to_string());
123        }
124        if find_bytes(bytes, b"Workbook") {
125            return Some("xls".to_string());
126        }
127        if find_bytes(bytes, b"PowerPoint Document") {
128            return Some("ppt".to_string());
129        }
130    }
131    let header_len = bytes.len().min(1024);
132    if bytes[..header_len].windows(5).any(|window| window == b"%PDF-") {
133        return Some("pdf".to_string());
134    }
135    let text = std::str::from_utf8(bytes).ok()?.trim_start_matches('\u{feff}');
136    if text.starts_with("<svg") || text.contains("<svg") {
137        return Some("svg".to_string());
138    }
139    if text.starts_with("<!doctype html") || text.starts_with("<html") || text.contains("<body") {
140        return Some("html".to_string());
141    }
142    if text.starts_with('#') || text.contains("\n# ") || text.contains("\n- ") {
143        return Some("md".to_string());
144    }
145    if text.contains(',') && text.lines().count() > 1 {
146        return Some("csv".to_string());
147    }
148    if !text.is_empty() {
149        return Some("txt".to_string());
150    }
151    None
152}
153
/// Report whether `needle` occurs as a contiguous run inside `haystack`.
///
/// An empty `needle` matches every haystack, including an empty one.
/// It is handled up front because `slice::windows` panics on a window
/// size of zero.
fn find_bytes(haystack: &[u8], needle: &[u8]) -> bool {
    if needle.is_empty() {
        return true;
    }
    haystack.windows(needle.len()).any(|window| window == needle)
}
157
158fn family_for_source(source: &str) -> Option<Family> {
159    match canonical_format_hint(source).as_str() {
160        "txt" | "md" | "html" | "docx" | "doc" | "odt" | "pdf" => Some(Family::Writer),
161        "csv" | "xlsx" | "ods" | "xls" => Some(Family::Calc),
162        "pptx" | "odp" | "ppt" => Some(Family::Impress),
163        "svg" | "odg" => Some(Family::Draw),
164        "latex" | "mathml" | "odf" => Some(Family::Math),
165        "odb" => Some(Family::Base),
166        _ => None,
167    }
168}
169
170/// Convert a writer-format byte stream from `from` to `to`.
171pub fn writer_convert_bytes(input: &[u8], from: &str, to: &str) -> Result<Vec<u8>> {
172    let from = canonical_format_hint(from);
173    let to = canonical_format_hint(to);
174    let doc = lo_writer::load_bytes("document", input, &from)?;
175    lo_writer::save_as(&doc, &to)
176}
177
178/// Convert a calc-format byte stream from `from` to `to`.
179pub fn calc_convert_bytes(input: &[u8], from: &str, to: &str) -> Result<Vec<u8>> {
180    let from = canonical_format_hint(from);
181    let to = canonical_format_hint(to);
182    let workbook = lo_calc::load_bytes("workbook", input, &from)?;
183    lo_calc::save_as(&workbook, &to)
184}
185
186/// Convert an impress-format byte stream from `from` to `to`.
187pub fn impress_convert_bytes(input: &[u8], from: &str, to: &str) -> Result<Vec<u8>> {
188    let from = canonical_format_hint(from);
189    let to = canonical_format_hint(to);
190    let deck = lo_impress::load_bytes("presentation", input, &from)?;
191    lo_impress::save_as(&deck, &to)
192}
193
194/// Convert a draw-format byte stream from `from` to `to`.
195pub fn draw_convert_bytes(input: &[u8], from: &str, to: &str) -> Result<Vec<u8>> {
196    let from = canonical_format_hint(from);
197    let to = canonical_format_hint(to);
198    let drawing = lo_draw::load_bytes("drawing", input, &from)?;
199    lo_draw::save_as(&drawing, &to)
200}
201
202/// Convert a math-format byte stream from `from` to `to`.
203///
204/// The generic `lo_math::save_as` handles `mathml`/`svg`/`pdf`. The ODF
205/// formula package (`.odf`) lives in `lo_odf` — we route it here so the
206/// `convert --to odf` router stays consistent with every other family.
207pub fn math_convert_bytes(input: &[u8], from: &str, to: &str) -> Result<Vec<u8>> {
208    let from = canonical_format_hint(from);
209    let to = canonical_format_hint(to);
210    let document = lo_math::load_bytes("formula", input, &from)?;
211    if to == "odf" {
212        return lo_odf::save_formula_document_bytes(&document);
213    }
214    lo_math::save_as(&document, &to)
215}
216
217/// Convert a base-format byte stream from `from` to `to`.
218pub fn base_convert_bytes(input: &[u8], from: &str, to: &str) -> Result<Vec<u8>> {
219    let from = canonical_format_hint(from);
220    let to = canonical_format_hint(to);
221    let database = lo_base::load_bytes("database", input, &from, None)?;
222    lo_base::save_as(&database, &to)
223}
224
225/// Convert any supported office-format byte stream from `from` to `to`.
226pub fn convert_bytes(input: &[u8], from: &str, to: &str) -> Result<Vec<u8>> {
227    let from = canonical_format_hint(from);
228    let to = canonical_format_hint(to);
229    match family_for_source(&from) {
230        Some(Family::Writer) => writer_convert_bytes(input, &from, &to),
231        Some(Family::Calc) => calc_convert_bytes(input, &from, &to),
232        Some(Family::Impress) => impress_convert_bytes(input, &from, &to),
233        Some(Family::Draw) => draw_convert_bytes(input, &from, &to),
234        Some(Family::Math) => math_convert_bytes(input, &from, &to),
235        Some(Family::Base) => base_convert_bytes(input, &from, &to),
236        None => Err(LoError::Unsupported(format!(
237            "generic conversion source format not supported: {from}"
238        ))),
239    }
240}
241
242/// Infer the source format from `path` and dispatch to [`convert_bytes`].
243pub fn convert_path_bytes(path: &str, input: &[u8], to: &str) -> Result<Vec<u8>> {
244    let from = sniff_format_from_path(path).ok_or_else(|| {
245        LoError::InvalidInput(format!("could not infer input format from path: {path}"))
246    })?;
247    convert_bytes(input, &from, to)
248}
249
250/// Infer the source format from the byte payload itself and dispatch to
251/// [`convert_bytes`].
252pub fn convert_bytes_auto(input: &[u8], to: &str) -> Result<Vec<u8>> {
253    let from = sniff_format_from_bytes(input).ok_or_else(|| {
254        LoError::InvalidInput("could not infer input format from byte stream".to_string())
255    })?;
256    convert_bytes(input, &from, to)
257}
258
// ---- Writer shortcuts -----------------------------------------------------
260
261pub fn docx_to_html_bytes(input: &[u8]) -> Result<Vec<u8>> {
262    writer_convert_bytes(input, "docx", "html")
263}
264pub fn docx_to_txt_bytes(input: &[u8]) -> Result<Vec<u8>> {
265    writer_convert_bytes(input, "docx", "txt")
266}
267pub fn pdf_to_txt_bytes(input: &[u8]) -> Result<Vec<u8>> {
268    writer_convert_bytes(input, "pdf", "txt")
269}
270pub fn pdf_to_md_bytes(input: &[u8]) -> Result<Vec<u8>> {
271    writer_convert_bytes(input, "pdf", "md")
272}
273pub fn pdf_to_html_bytes(input: &[u8]) -> Result<Vec<u8>> {
274    writer_convert_bytes(input, "pdf", "html")
275}
276pub fn docx_to_odt_bytes(input: &[u8]) -> Result<Vec<u8>> {
277    writer_convert_bytes(input, "docx", "odt")
278}
279pub fn odt_to_pdf_bytes(input: &[u8]) -> Result<Vec<u8>> {
280    writer_convert_bytes(input, "odt", "pdf")
281}
282pub fn odt_to_docx_bytes(input: &[u8]) -> Result<Vec<u8>> {
283    writer_convert_bytes(input, "odt", "docx")
284}
285pub fn odt_to_html_bytes(input: &[u8]) -> Result<Vec<u8>> {
286    writer_convert_bytes(input, "odt", "html")
287}
288
// ---- Calc shortcuts -------------------------------------------------------
290
291pub fn xlsx_to_pdf_bytes(input: &[u8]) -> Result<Vec<u8>> {
292    calc_convert_bytes(input, "xlsx", "pdf")
293}
294pub fn xlsx_to_html_bytes(input: &[u8]) -> Result<Vec<u8>> {
295    calc_convert_bytes(input, "xlsx", "html")
296}
297pub fn xlsx_to_csv_bytes(input: &[u8]) -> Result<Vec<u8>> {
298    calc_convert_bytes(input, "xlsx", "csv")
299}
300pub fn xlsx_to_ods_bytes(input: &[u8]) -> Result<Vec<u8>> {
301    calc_convert_bytes(input, "xlsx", "ods")
302}
303pub fn ods_to_pdf_bytes(input: &[u8]) -> Result<Vec<u8>> {
304    calc_convert_bytes(input, "ods", "pdf")
305}
306pub fn ods_to_xlsx_bytes(input: &[u8]) -> Result<Vec<u8>> {
307    calc_convert_bytes(input, "ods", "xlsx")
308}
309pub fn ods_to_csv_bytes(input: &[u8]) -> Result<Vec<u8>> {
310    calc_convert_bytes(input, "ods", "csv")
311}
312
// ---- Impress shortcuts ----------------------------------------------------
314
315pub fn pptx_to_html_bytes(input: &[u8]) -> Result<Vec<u8>> {
316    impress_convert_bytes(input, "pptx", "html")
317}
318pub fn pptx_to_svg_bytes(input: &[u8]) -> Result<Vec<u8>> {
319    impress_convert_bytes(input, "pptx", "svg")
320}
321pub fn pptx_to_odp_bytes(input: &[u8]) -> Result<Vec<u8>> {
322    impress_convert_bytes(input, "pptx", "odp")
323}
324pub fn odp_to_pdf_bytes(input: &[u8]) -> Result<Vec<u8>> {
325    impress_convert_bytes(input, "odp", "pdf")
326}
327pub fn odp_to_pptx_bytes(input: &[u8]) -> Result<Vec<u8>> {
328    impress_convert_bytes(input, "odp", "pptx")
329}
330
// ---------------------------------------------------------------------------
// XLSX recalc and report generation
// ---------------------------------------------------------------------------
334
/// Tally for one error kind inside a [`RecalcCheckReport`].
#[derive(Clone, Debug, Default, PartialEq, Eq)]
pub struct RecalcErrorBucket {
    /// Number of errors recorded for this kind.
    pub count: usize,
    /// Locations of those errors, in the order they were recorded.
    pub locations: Vec<String>,
}

/// Structured result of a recalc check, serializable with
/// [`RecalcCheckReport::to_json`].
#[derive(Clone, Debug, Default, PartialEq, Eq)]
pub struct RecalcCheckReport {
    /// Overall status string.
    pub status: String,
    /// Total number of errors across all kinds.
    pub total_errors: usize,
    /// Total number of formulas counted.
    pub total_formulas: usize,
    /// Per-kind error buckets, keyed by error kind.
    pub error_summary: BTreeMap<String, RecalcErrorBucket>,
}

impl RecalcCheckReport {
    /// Serialize the report as a compact JSON object.
    ///
    /// Keys are emitted in the fixed order `status`, `total_errors`,
    /// `total_formulas`, `error_summary`; buckets follow the map's key
    /// order. Strings are escaped so the output is always valid JSON:
    /// besides `\`, `"` and newline, every other control character below
    /// U+0020 is escaped (`\r`, `\t`, or `\u00XX`).
    pub fn to_json(&self) -> String {
        /// Append `input` to `out` as a quoted, escaped JSON string.
        fn push_json_string(out: &mut String, input: &str) {
            out.push('"');
            for ch in input.chars() {
                match ch {
                    '\\' => out.push_str("\\\\"),
                    '"' => out.push_str("\\\""),
                    '\n' => out.push_str("\\n"),
                    '\r' => out.push_str("\\r"),
                    '\t' => out.push_str("\\t"),
                    // JSON forbids raw control characters inside strings.
                    ch if (ch as u32) < 0x20 => {
                        out.push_str(&format!("\\u{:04x}", ch as u32));
                    }
                    ch => out.push(ch),
                }
            }
            out.push('"');
        }

        let mut json = String::from("{\"status\":");
        push_json_string(&mut json, &self.status);
        json.push_str(",\"total_errors\":");
        json.push_str(&self.total_errors.to_string());
        json.push_str(",\"total_formulas\":");
        json.push_str(&self.total_formulas.to_string());
        json.push_str(",\"error_summary\":{");
        for (position, (kind, bucket)) in self.error_summary.iter().enumerate() {
            if position > 0 {
                json.push(',');
            }
            push_json_string(&mut json, kind);
            json.push_str(":{\"count\":");
            json.push_str(&bucket.count.to_string());
            json.push_str(",\"locations\":[");
            for (index, location) in bucket.locations.iter().enumerate() {
                if index > 0 {
                    json.push(',');
                }
                push_json_string(&mut json, location);
            }
            json.push_str("]}");
        }
        json.push_str("}}");
        json
    }

    /// Record one error of `kind` at `location`, bumping both the
    /// overall total and the per-kind bucket.
    fn record_error(&mut self, kind: String, location: String) {
        self.total_errors += 1;
        let bucket = self.error_summary.entry(kind).or_default();
        bucket.count += 1;
        bucket.locations.push(location);
    }
}
389
390/// Re-evaluate every formula in an XLSX workbook and rewrite the cached
391/// `<v>` values inside the existing sheet XML. The result is a fresh
392/// XLSX byte stream with the same shape as the input, minus
393/// `xl/calcChain.xml`.
394pub fn xlsx_recalc_bytes(bytes: &[u8]) -> Result<Vec<u8>> {
395    let zip = ZipArchive::new(bytes)?;
396    let workbook = lo_calc::from_xlsx_bytes("workbook", bytes)?;
397    let evaluator = WorkbookEvaluator::new(&workbook);
398    let sheet_targets = parse_xlsx_sheet_targets(&zip)?;
399
400    let mut entries: Vec<lo_zip::ZipEntry> = Vec::new();
401    for entry_name in zip.entries() {
402        let path = normalize_zip_path(entry_name);
403        if path == "xl/calcChain.xml" {
404            continue;
405        }
406        if path == "[Content_Types].xml" {
407            let xml = zip.read_string(&path)?;
408            let mut root = parse_xml_document(&xml)?;
409            remove_content_type_override(&mut root, "/xl/calcChain.xml");
410            entries.push(lo_zip::ZipEntry::new(path, serialize_xml_document(&root).into_bytes()));
411            continue;
412        }
413        if path == "xl/_rels/workbook.xml.rels" {
414            let xml = zip.read_string(&path)?;
415            let mut root = parse_xml_document(&xml)?;
416            remove_calc_chain_relationships(&mut root);
417            entries.push(lo_zip::ZipEntry::new(path, serialize_xml_document(&root).into_bytes()));
418            continue;
419        }
420        if path == "xl/workbook.xml" {
421            let xml = zip.read_string(&path)?;
422            let mut root = parse_xml_document(&xml)?;
423            mark_workbook_recalculated(&mut root);
424            entries.push(lo_zip::ZipEntry::new(path, serialize_xml_document(&root).into_bytes()));
425            continue;
426        }
427        if let Some(sheet_index) = sheet_targets.iter().position(|(target, _)| target == &path) {
428            let xml = zip.read_string(&path)?;
429            let mut root = parse_xml_document(&xml)?;
430            patch_xlsx_sheet_formula_cache(&mut root, &workbook, sheet_index, &evaluator)?;
431            entries.push(lo_zip::ZipEntry::new(path, serialize_xml_document(&root).into_bytes()));
432            continue;
433        }
434        entries.push(lo_zip::ZipEntry::new(path, zip.read(entry_name)?));
435    }
436    lo_zip::ooxml_package(&entries)
437}
438
439/// Produce a Clark-shaped JSON report for an existing XLSX workbook.
440pub fn xlsx_recalc_check_json(bytes: &[u8]) -> Result<String> {
441    Ok(xlsx_recalc_report(bytes)?.to_json())
442}
443
444/// Produce the structured recalc report used by
445/// [`xlsx_recalc_check_json`].
446pub fn xlsx_recalc_report(bytes: &[u8]) -> Result<RecalcCheckReport> {
447    let zip = ZipArchive::new(bytes)?;
448    let workbook = lo_calc::from_xlsx_bytes("workbook", bytes)?;
449    let evaluator = WorkbookEvaluator::new(&workbook);
450    let sheet_targets = parse_xlsx_sheet_targets(&zip)?;
451    let mut report = RecalcCheckReport {
452        status: "ok".to_string(),
453        ..RecalcCheckReport::default()
454    };
455
456    for (sheet_index, (path, sheet_name)) in sheet_targets.iter().enumerate() {
457        if !zip.contains(path) {
458            continue;
459        }
460        let xml = zip.read_string(path)?;
461        let root = parse_xml_document(&xml)?;
462        walk_formula_cells(&root, sheet_name, sheet_index, &evaluator, &mut report)?;
463    }
464
465    if report.total_errors > 0 {
466        report.status = "error".to_string();
467    }
468    Ok(report)
469}
470
471fn parse_xlsx_sheet_targets(zip: &ZipArchive) -> Result<Vec<(String, String)>> {
472    let workbook_root = parse_xml_document(&zip.read_string("xl/workbook.xml")?)?;
473    let rels = parse_relationships(zip, "xl/workbook.xml")?;
474    let mut out = Vec::new();
475    if let Some(sheets) = workbook_root.child("sheets") {
476        for (index, sheet) in sheets.children_named("sheet").enumerate() {
477            let name = sheet.attr("name").unwrap_or("Sheet").to_string();
478            let target = sheet
479                .attr("id")
480                .or_else(|| sheet.attr("r:id"))
481                .and_then(|id| rels.get(id))
482                .cloned()
483                .unwrap_or_else(|| format!("xl/worksheets/sheet{}.xml", index + 1));
484            out.push((normalize_zip_path(&target), name));
485        }
486    }
487    Ok(out)
488}
489
490fn parse_relationships(zip: &ZipArchive, part: &str) -> Result<BTreeMap<String, String>> {
491    let rels_path = rels_path_for(part);
492    if !zip.contains(&rels_path) {
493        return Ok(BTreeMap::new());
494    }
495    let root = parse_xml_document(&zip.read_string(&rels_path)?)?;
496    let mut map = BTreeMap::new();
497    for rel in root.children_named("Relationship") {
498        if let (Some(id), Some(target)) = (rel.attr("Id"), rel.attr("Target")) {
499            map.insert(id.to_string(), resolve_part_target(part, target));
500        }
501    }
502    Ok(map)
503}
504
505fn remove_content_type_override(root: &mut XmlNode, part_name: &str) {
506    root.items.retain(|item| match item {
507        XmlItem::Node(node) if node.local_name() == "Override" => node.attr("PartName") != Some(part_name),
508        _ => true,
509    });
510    sync_node_children(root);
511}
512
513fn remove_calc_chain_relationships(root: &mut XmlNode) {
514    root.items.retain(|item| match item {
515        XmlItem::Node(node) if node.local_name() == "Relationship" => {
516            let target = node.attr("Target").unwrap_or("");
517            let rel_type = node.attr("Type").unwrap_or("");
518            !target.ends_with("calcChain.xml")
519                && !rel_type.to_ascii_lowercase().contains("calcchain")
520        }
521        _ => true,
522    });
523    sync_node_children(root);
524}
525
526fn mark_workbook_recalculated(root: &mut XmlNode) {
527    let mut found = false;
528    for item in &mut root.items {
529        if let XmlItem::Node(node) = item {
530            if node.local_name() == "calcPr" {
531                node.attributes.insert("calcCompleted".to_string(), "1".to_string());
532                node.attributes.insert("fullCalcOnLoad".to_string(), "0".to_string());
533                node.attributes.remove("calcMode");
534                found = true;
535            }
536        }
537    }
538    if !found {
539        let mut attrs = BTreeMap::new();
540        attrs.insert("calcCompleted".to_string(), "1".to_string());
541        attrs.insert("fullCalcOnLoad".to_string(), "0".to_string());
542        root.items.push(XmlItem::Node(XmlNode {
543            name: "calcPr".to_string(),
544            attributes: attrs,
545            children: Vec::new(),
546            items: Vec::new(),
547            text: String::new(),
548        }));
549    }
550    sync_node_children(root);
551}
552
553fn patch_xlsx_sheet_formula_cache(
554    root: &mut XmlNode,
555    workbook: &Workbook,
556    sheet_index: usize,
557    evaluator: &WorkbookEvaluator<'_>,
558) -> Result<()> {
559    let Some(sheet_data) = child_mut(root, "sheetData") else {
560        return Ok(());
561    };
562    let mut shared_formulas: BTreeMap<String, (CellAddr, String)> = BTreeMap::new();
563    for row in &mut sheet_data.children {
564        if row.local_name() != "row" {
565            continue;
566        }
567        let row_number = row
568            .attr("r")
569            .and_then(|value| value.parse::<usize>().ok())
570            .unwrap_or(1);
571        for cell in &mut row.children {
572            if cell.local_name() == "c" {
573                patch_formula_cell(cell, row_number, workbook, sheet_index, evaluator, &mut shared_formulas)?;
574            }
575        }
576        sync_node_items_from_children(row);
577    }
578    sync_node_items_from_children(sheet_data);
579    sync_node_items_from_children(root);
580    Ok(())
581}
582
583fn walk_formula_cells(
584    root: &XmlNode,
585    sheet_name: &str,
586    sheet_index: usize,
587    evaluator: &WorkbookEvaluator<'_>,
588    report: &mut RecalcCheckReport,
589) -> Result<()> {
590    let Some(sheet_data) = root.child("sheetData") else {
591        return Ok(());
592    };
593    let mut shared_formulas: BTreeMap<String, (CellAddr, String)> = BTreeMap::new();
594    for row in &sheet_data.children {
595        if row.local_name() != "row" {
596            continue;
597        }
598        let row_number = row
599            .attr("r")
600            .and_then(|value| value.parse::<usize>().ok())
601            .unwrap_or(1);
602        for cell in &row.children {
603            if cell.local_name() != "c" {
604                continue;
605            }
606            let (row_1, col_1) = cell
607                .attr("r")
608                .and_then(parse_a1_cell_ref)
609                .unwrap_or((row_number, 1));
610            let addr = CellAddr::new(row_1.saturating_sub(1) as u32, col_1.saturating_sub(1) as u32);
611            let Some(formula) = resolve_formula_for_cell(cell, addr, &mut shared_formulas) else {
612                continue;
613            };
614            report.total_formulas += 1;
615            let value = evaluator
616                .evaluate_formula(sheet_index, &formula)
617                .unwrap_or_else(|_| EvalValue::Error("#VALUE!".to_string()));
618            if let EvalValue::Error(kind) = value {
619                report.record_error(kind, format!("{}!{}", sheet_name, addr.to_a1()));
620            }
621        }
622    }
623    Ok(())
624}
625
626fn patch_formula_cell(
627    cell: &mut XmlNode,
628    fallback_row: usize,
629    workbook: &Workbook,
630    sheet_index: usize,
631    evaluator: &WorkbookEvaluator<'_>,
632    shared_formulas: &mut BTreeMap<String, (CellAddr, String)>,
633) -> Result<()> {
634    let (row_1, col_1) = cell
635        .attr("r")
636        .and_then(parse_a1_cell_ref)
637        .unwrap_or((fallback_row, 1));
638    let addr = CellAddr::new(row_1.saturating_sub(1) as u32, col_1.saturating_sub(1) as u32);
639    let Some(formula) = resolve_formula_for_cell(cell, addr, shared_formulas) else {
640        return Ok(());
641    };
642    if formula.trim().is_empty() {
643        return Ok(());
644    }
645    let _ = workbook; // kept for signature symmetry and future named-range resolution.
646    let value = evaluator
647        .evaluate_formula(sheet_index, &formula)
648        .unwrap_or_else(|_| EvalValue::Error("#VALUE!".to_string()));
649    let mut new_items = Vec::new();
650    for item in &cell.items {
651        match item {
652            XmlItem::Text(text) => new_items.push(XmlItem::Text(text.clone())),
653            XmlItem::Node(node) if matches!(node.local_name(), "v" | "is") => {}
654            XmlItem::Node(node) => new_items.push(XmlItem::Node(node.clone())),
655        }
656    }
657    new_items.push(XmlItem::Node(make_value_node(&value)));
658    cell.items = new_items;
659    sync_node_children(cell);
660    apply_formula_cache_type(cell, &value);
661    Ok(())
662}
663
664fn resolve_formula_for_cell(
665    cell: &XmlNode,
666    addr: CellAddr,
667    shared_formulas: &mut BTreeMap<String, (CellAddr, String)>,
668) -> Option<String> {
669    let mut formula_text = None;
670    let mut formula_kind = None;
671    let mut shared_index = None;
672    for child in &cell.children {
673        if child.local_name() == "f" {
674            formula_text = Some(text_content(child));
675            formula_kind = child.attr("t").map(str::to_string);
676            shared_index = child.attr("si").map(str::to_string);
677            break;
678        }
679    }
680    let text = formula_text.unwrap_or_default();
681    if !text.trim().is_empty() {
682        if formula_kind.as_deref() == Some("shared") {
683            if let Some(si) = shared_index.clone() {
684                shared_formulas.insert(si, (addr, text.clone()));
685            }
686        }
687        return Some(text);
688    }
689    if formula_kind.as_deref() == Some("shared") {
690        if let Some(si) = shared_index {
691            if let Some((base_addr, base_formula)) = shared_formulas.get(&si) {
692                return Some(translate_shared_formula(base_formula, *base_addr, addr));
693            }
694        }
695    }
696    None
697}
698
699fn apply_formula_cache_type(cell: &mut XmlNode, value: &EvalValue) {
700    match value {
701        EvalValue::Number(_) | EvalValue::Blank => {
702            cell.attributes.remove("t");
703        }
704        EvalValue::Text(_) => {
705            cell.attributes.insert("t".to_string(), "str".to_string());
706        }
707        EvalValue::Bool(_) => {
708            cell.attributes.insert("t".to_string(), "b".to_string());
709        }
710        EvalValue::Error(_) => {
711            cell.attributes.insert("t".to_string(), "e".to_string());
712        }
713    }
714}
715
716fn make_value_node(value: &EvalValue) -> XmlNode {
717    let text = match value {
718        EvalValue::Blank => String::new(),
719        EvalValue::Number(number) => {
720            if number.fract() == 0.0 && number.is_finite() {
721                format!("{}", *number as i64)
722            } else {
723                number.to_string()
724            }
725        }
726        EvalValue::Text(text) => text.clone(),
727        EvalValue::Bool(value) => {
728            if *value { "1".to_string() } else { "0".to_string() }
729        }
730        EvalValue::Error(text) => text.clone(),
731    };
732    XmlNode {
733        name: "v".to_string(),
734        attributes: BTreeMap::new(),
735        children: Vec::new(),
736        items: if text.is_empty() { Vec::new() } else { vec![XmlItem::Text(text.clone())] },
737        text,
738    }
739}
740
/// Parse an A1-style cell reference into 1-based `(row, column)`.
///
/// `$` markers are ignored and letters are case-insensitive, so
/// `"$AB$12"` yields `(12, 28)`. Returns `None` when:
/// - the column letters or the row digits are missing;
/// - a letter follows a digit, or any other character appears;
/// - the row or the column does not fit in `usize`.
///
/// The column is accumulated with checked arithmetic, so an overlong
/// letter run is rejected instead of overflowing.
fn parse_a1_cell_ref(input: &str) -> Option<(usize, usize)> {
    let mut col = 0usize;
    let mut saw_letter = false;
    let mut digits = String::new();
    for ch in input.chars() {
        if ch == '$' {
            continue;
        }
        if ch.is_ascii_alphabetic() && digits.is_empty() {
            // Base-26 with digits A=1 ..= Z=26.
            let ordinal = usize::from(ch.to_ascii_uppercase() as u8 - b'A' + 1);
            col = col.checked_mul(26)?.checked_add(ordinal)?;
            saw_letter = true;
        } else if ch.is_ascii_digit() {
            digits.push(ch);
        } else {
            return None;
        }
    }
    if !saw_letter || digits.is_empty() {
        return None;
    }
    let row = digits.parse().ok()?;
    Some((row, col))
}
766
767fn text_content(node: &XmlNode) -> String {
768    let mut out = String::new();
769    if !node.text.is_empty() {
770        out.push_str(&node.text);
771    }
772    for child in &node.children {
773        out.push_str(&text_content(child));
774    }
775    out
776}
777
// ---------------------------------------------------------------------------
// Accept all tracked changes
// ---------------------------------------------------------------------------
781
782/// Walk every WordprocessingML part inside a DOCX, accept all common
783/// tracked revisions, then re-emit the package.
784///
785/// This keeps inserted content (`w:ins`, `w:moveTo`), drops deleted
786/// content (`w:del`, `w:moveFrom`, deleted rows/cells), strips
787/// formatting-history `*Change` elements, removes `w:trackRevisions`
788/// from settings, and prunes unreferenced comments from
789/// `word/comments.xml`.
790pub fn accept_all_tracked_changes_docx_bytes(bytes: &[u8]) -> Result<Vec<u8>> {
791    let zip = ZipArchive::new(bytes)?;
792    let mut xml_parts: Vec<(String, XmlNode)> = Vec::new();
793    let mut passthrough: Vec<lo_zip::ZipEntry> = Vec::new();
794
795    for entry_name in zip.entries() {
796        let path = normalize_zip_path(entry_name);
797        if is_wordprocessing_xml(&path) {
798            let xml = zip.read_string(&path)?;
799            let root = parse_xml_document(&xml)?;
800            let accepted = accept_revision_root(&root, &path);
801            xml_parts.push((path, accepted));
802        } else {
803            passthrough.push(lo_zip::ZipEntry::new(path, zip.read(entry_name)?));
804        }
805    }
806
807    let mut live_comment_ids = BTreeSet::new();
808    for (path, root) in &xml_parts {
809        if !path.ends_with("comments.xml") {
810            collect_comment_ids(root, &mut live_comment_ids);
811        }
812    }
813
814    let mut entries: Vec<lo_zip::ZipEntry> =
815        Vec::with_capacity(xml_parts.len() + passthrough.len());
816    for (path, root) in xml_parts {
817        let root = if path.ends_with("comments.xml") {
818            filter_comment_part(&root, &live_comment_ids)
819        } else {
820            root
821        };
822        entries.push(lo_zip::ZipEntry::new(
823            path,
824            serialize_xml_document(&root).into_bytes(),
825        ));
826    }
827    entries.extend(passthrough);
828    lo_zip::ooxml_package(&entries)
829}
830
831/// Back-compat alias; prefer [`accept_all_tracked_changes_docx_bytes`].
832#[deprecated(note = "use accept_all_tracked_changes_docx_bytes")]
833pub fn accept_tracked_changes_docx_bytes(bytes: &[u8]) -> Result<Vec<u8>> {
834    accept_all_tracked_changes_docx_bytes(bytes)
835}
836
837/// Back-compat alias; prefer [`xlsx_recalc_bytes`].
838#[deprecated(note = "use xlsx_recalc_bytes")]
839pub fn recalc_existing_xlsx_bytes(bytes: &[u8]) -> Result<Vec<u8>> {
840    xlsx_recalc_bytes(bytes)
841}
842
/// Decide whether a package path names an XML part that should be rewritten
/// when accepting revisions.
///
/// A path qualifies when it starts with `word/` and ends with `.xml`, unless
/// it contains `_rels/` or ends with `fontTable.xml`.
fn is_wordprocessing_xml(path: &str) -> bool {
    if !path.starts_with("word/") || !path.ends_with(".xml") {
        return false;
    }
    let excluded = path.contains("_rels/") || path.ends_with("fontTable.xml");
    !excluded
}
849
850fn accept_revision_root(root: &XmlNode, path: &str) -> XmlNode {
851    let items = accept_revision_items(&root.items);
852    let mut node = rebuild_node(root, items, root.attributes.clone());
853    if path.ends_with("settings.xml") {
854        node.items.retain(
855            |item| !matches!(item, XmlItem::Node(child) if child.local_name() == "trackRevisions"),
856        );
857        sync_node_children(&mut node);
858    }
859    node
860}
861
862fn accept_revision_items(items: &[XmlItem]) -> Vec<XmlItem> {
863    let mut out = Vec::new();
864    for item in items {
865        match item {
866            XmlItem::Text(text) => out.push(XmlItem::Text(text.clone())),
867            XmlItem::Node(node) => out.extend(accept_revision_node(node)),
868        }
869    }
870    out
871}
872
873fn accept_revision_node(node: &XmlNode) -> Vec<XmlItem> {
874    let local = node.local_name();
875    if matches!(
876        local,
877        "del"
878            | "delText"
879            | "delInstrText"
880            | "cellDel"
881            | "moveFrom"
882            | "moveFromRangeStart"
883            | "moveFromRangeEnd"
884            | "moveToRangeStart"
885            | "moveToRangeEnd"
886            | "customXmlDelRangeStart"
887            | "customXmlDelRangeEnd"
888            | "customXmlMoveFromRangeStart"
889            | "customXmlMoveFromRangeEnd"
890            | "customXmlMoveToRangeStart"
891            | "customXmlMoveToRangeEnd"
892            | "trackRevisions"
893    ) {
894        return Vec::new();
895    }
896    if matches!(
897        local,
898        "ins"
899            | "moveTo"
900            | "customXmlInsRangeStart"
901            | "customXmlInsRangeEnd"
902            | "cellIns"
903    ) {
904        return accept_revision_items(&node.items);
905    }
906    if local.ends_with("Change") || local == "numberingChange" || local == "cellMerge" {
907        return Vec::new();
908    }
909    if row_deleted(node) || cell_deleted(node) {
910        return Vec::new();
911    }
912    let items = accept_revision_items(&node.items);
913    vec![XmlItem::Node(rebuild_node(node, items, node.attributes.clone()))]
914}
915
916fn row_deleted(node: &XmlNode) -> bool {
917    if node.local_name() != "tr" {
918        return false;
919    }
920    node.child("trPr")
921        .map(|trpr| {
922            trpr.children
923                .iter()
924                .any(|child| matches!(child.local_name(), "del" | "cellDel" | "cellMerge"))
925        })
926        .unwrap_or(false)
927}
928
929fn cell_deleted(node: &XmlNode) -> bool {
930    if node.local_name() != "tc" {
931        return false;
932    }
933    node.child("tcPr")
934        .map(|tcpr| {
935            tcpr.children
936                .iter()
937                .any(|child| matches!(child.local_name(), "cellDel" | "del"))
938        })
939        .unwrap_or(false)
940}
941
942fn collect_comment_ids(node: &XmlNode, out: &mut BTreeSet<String>) {
943    let local = node.local_name();
944    if matches!(
945        local,
946        "commentRangeStart" | "commentRangeEnd" | "commentReference"
947    ) {
948        if let Some(id) = attribute_local(node, "id") {
949            out.insert(id.to_string());
950        }
951    }
952    for child in &node.children {
953        collect_comment_ids(child, out);
954    }
955}
956
957fn filter_comment_part(root: &XmlNode, live_comment_ids: &BTreeSet<String>) -> XmlNode {
958    if root.local_name() != "comments" {
959        return root.clone();
960    }
961    let items = root
962        .items
963        .iter()
964        .filter_map(|item| match item {
965            XmlItem::Text(text) => Some(XmlItem::Text(text.clone())),
966            XmlItem::Node(node) => filter_comment_node(node, live_comment_ids).map(XmlItem::Node),
967        })
968        .collect();
969    rebuild_node(root, items, root.attributes.clone())
970}
971
972fn filter_comment_node(node: &XmlNode, live_comment_ids: &BTreeSet<String>) -> Option<XmlNode> {
973    if node.local_name() == "comment" {
974        let keep = attribute_local(node, "id")
975            .map(|id| live_comment_ids.contains(id))
976            .unwrap_or(true);
977        if !keep {
978            return None;
979        }
980    }
981    let items = node
982        .items
983        .iter()
984        .filter_map(|item| match item {
985            XmlItem::Text(text) => Some(XmlItem::Text(text.clone())),
986            XmlItem::Node(child) => filter_comment_node(child, live_comment_ids).map(XmlItem::Node),
987        })
988        .collect();
989    Some(rebuild_node(node, items, node.attributes.clone()))
990}
991
992fn attribute_local<'a>(node: &'a XmlNode, local_name: &str) -> Option<&'a str> {
993    let suffix = format!(":{local_name}");
994    node.attributes.iter().find_map(|(key, value)| {
995        if key == local_name || key.ends_with(&suffix) {
996            Some(value.as_str())
997        } else {
998            None
999        }
1000    })
1001}
1002
1003// ---------------------------------------------------------------------------
1004// Shared XmlNode mutation helpers
1005// ---------------------------------------------------------------------------
1006
1007fn rebuild_node(
1008    template: &XmlNode,
1009    items: Vec<XmlItem>,
1010    attributes: BTreeMap<String, String>,
1011) -> XmlNode {
1012    let mut node = XmlNode {
1013        name: template.name.clone(),
1014        attributes,
1015        children: Vec::new(),
1016        items,
1017        text: String::new(),
1018    };
1019    sync_node_children(&mut node);
1020    node
1021}
1022
1023fn sync_node_children(node: &mut XmlNode) {
1024    node.children = node
1025        .items
1026        .iter()
1027        .filter_map(|item| match item {
1028            XmlItem::Node(child) => Some(child.clone()),
1029            _ => None,
1030        })
1031        .collect();
1032    node.text = node
1033        .items
1034        .iter()
1035        .filter_map(|item| match item {
1036            XmlItem::Text(text) => Some(text.clone()),
1037            _ => None,
1038        })
1039        .collect::<Vec<_>>()
1040        .join("");
1041}
1042
1043fn sync_node_items_from_children(node: &mut XmlNode) {
1044    let mut child_index = 0usize;
1045    let mut new_items = Vec::with_capacity(node.items.len().max(node.children.len()));
1046    for item in &node.items {
1047        match item {
1048            XmlItem::Text(text) => new_items.push(XmlItem::Text(text.clone())),
1049            XmlItem::Node(_) => {
1050                if let Some(updated) = node.children.get(child_index) {
1051                    new_items.push(XmlItem::Node(updated.clone()));
1052                    child_index += 1;
1053                }
1054            }
1055        }
1056    }
1057    while let Some(updated) = node.children.get(child_index) {
1058        new_items.push(XmlItem::Node(updated.clone()));
1059        child_index += 1;
1060    }
1061    node.items = new_items;
1062    sync_node_children(node);
1063}
1064
1065fn child_mut<'a>(node: &'a mut XmlNode, name: &str) -> Option<&'a mut XmlNode> {
1066    node.children
1067        .iter_mut()
1068        .find(|child| child.local_name() == name || child.name == name)
1069}
1070
1071#[allow(dead_code)]
1072fn _assert_send_sync() {
1073    fn assert<T: Send + Sync>() {}
1074    assert::<Result<Vec<u8>>>();
1075    let _ = LoError::Parse(String::new());
1076}
1077
1078
1079// ---- Markdown extraction --------------------------------------------------
1080
1081/// Extract Markdown from an existing DOCX file using the native Writer importer.
1082pub fn docx_to_md_bytes(input: &[u8]) -> Result<Vec<u8>> {
1083    let doc = lo_writer::from_docx_bytes("document", input)?;
1084    Ok(lo_writer::to_markdown(&doc).into_bytes())
1085}
1086
1087/// Extract Markdown from an existing PPTX file using the native Impress importer.
1088pub fn pptx_to_md_bytes(input: &[u8]) -> Result<Vec<u8>> {
1089    let deck = lo_impress::from_pptx_bytes("presentation", input)?;
1090    Ok(lo_impress::to_markdown(&deck).into_bytes())
1091}
1092
1093/// Extract Markdown from an existing XLSX file using the native Calc importer.
1094pub fn xlsx_to_md_bytes(input: &[u8]) -> Result<Vec<u8>> {
1095    let workbook = lo_calc::from_xlsx_bytes("workbook", input)?;
1096    Ok(lo_calc::to_markdown(&workbook).into_bytes())
1097}
1098
1099// ---- Direct raster output -------------------------------------------------
1100
1101/// Rasterize a DOCX document directly to PNG pages at the requested DPI.
1102pub fn docx_to_png_pages(input: &[u8], dpi: u32) -> Result<Vec<Vec<u8>>> {
1103    let doc = lo_writer::from_docx_bytes("document", input)?;
1104    Ok(lo_writer::render_png_pages(&doc, dpi.max(72)))
1105}
1106
1107/// Rasterize a DOCX document directly to JPEG pages at the requested DPI.
1108pub fn docx_to_jpeg_pages(input: &[u8], dpi: u32, quality: u8) -> Result<Vec<Vec<u8>>> {
1109    let doc = lo_writer::from_docx_bytes("document", input)?;
1110    Ok(lo_writer::render_jpeg_pages(&doc, dpi.max(72), quality.max(1)))
1111}
1112
1113/// Rasterize a PPTX deck directly to PNG slide images at the requested DPI.
1114pub fn pptx_to_png_pages(input: &[u8], dpi: u32) -> Result<Vec<Vec<u8>>> {
1115    let deck = lo_impress::from_pptx_bytes("presentation", input)?;
1116    Ok(lo_impress::render_png_pages(&deck, dpi.max(72)))
1117}
1118
1119/// Rasterize a PPTX deck directly to JPEG slide images at the requested DPI.
1120pub fn pptx_to_jpeg_pages(input: &[u8], dpi: u32, quality: u8) -> Result<Vec<Vec<u8>>> {
1121    let deck = lo_impress::from_pptx_bytes("presentation", input)?;
1122    Ok(lo_impress::render_jpeg_pages(&deck, dpi.max(72), quality.max(1)))
1123}