Skip to main content

oxdoc_core/
lib.rs

1//! Core OOXML extraction APIs for `oxdoc`.
2//!
3//! This crate reads Office Open XML packages without rendering them. It exposes
4//! path-based helpers for DOCX/PPTX text extraction, XLSX-to-CSV extraction,
5//! and package metadata. Extraction returns useful output plus recoverable warnings,
6//! while unrecoverable package and parser failures are returned as typed errors.
7//!
8//! ```no_run
9//! # fn demo() -> oxdoc_core::Result<()> {
10//! let extraction = oxdoc_core::extract_docx_text("contract.docx")?;
11//! println!("{}", extraction.value);
12//! # Ok(())
13//! # }
14//! ```
15//!
16//! The public API follows semantic versioning from 1.0 onward.
17
18mod error;
19pub mod models;
20mod parsers;
21pub mod vfs;
22
23use std::fs::File;
24use std::io::{Cursor, Read, Seek, Write};
25use std::path::Path;
26
27pub use error::{OxdocError, Result};
28pub use models::{
29    DocumentInfo, DocumentType, Extraction, OutputWarning, XlsxCsvOptions, XlsxSheet,
30};
31#[doc(hidden)]
32pub use parsers::docx::fuzz_extract_text as fuzz_docx_text;
33#[doc(hidden)]
34pub use parsers::fuzz_parse_relationships as fuzz_relationships;
35#[doc(hidden)]
36pub use parsers::metadata::fuzz_parse_metadata as fuzz_metadata;
37#[doc(hidden)]
38pub use parsers::pptx::fuzz_extract_text as fuzz_pptx_text;
39#[doc(hidden)]
40pub use parsers::xlsx::{fuzz_parse_shared_strings, fuzz_parse_sheet};
41use parsers::{docx, metadata, pptx, xlsx};
42use vfs::OoxmlPackage;
43
44pub fn extract_docx_text(path: impl AsRef<Path>) -> Result<Extraction<String>> {
45    let file = File::open(path)?;
46    extract_docx_text_from_reader(file)
47}
48
49pub fn extract_docx_text_from_reader<R: Read + Seek>(reader: R) -> Result<Extraction<String>> {
50    let mut package = OoxmlPackage::new(reader)?;
51    docx::extract_text(&mut package)
52}
53
54pub fn extract_pptx_text(path: impl AsRef<Path>) -> Result<Extraction<String>> {
55    let file = File::open(path)?;
56    extract_pptx_text_from_reader(file)
57}
58
59pub fn extract_pptx_text_from_reader<R: Read + Seek>(reader: R) -> Result<Extraction<String>> {
60    let mut package = OoxmlPackage::new(reader)?;
61    pptx::extract_text(&mut package)
62}
63
64pub fn extract_xlsx_csv<W: Write>(
65    path: impl AsRef<Path>,
66    options: XlsxCsvOptions<'_>,
67    writer: W,
68) -> Result<Extraction<()>> {
69    let file = File::open(path)?;
70    extract_xlsx_csv_from_reader(file, options, writer)
71}
72
73pub fn extract_xlsx_csv_from_reader<R: Read + Seek, W: Write>(
74    reader: R,
75    options: XlsxCsvOptions<'_>,
76    writer: W,
77) -> Result<Extraction<()>> {
78    let mut package = OoxmlPackage::new(reader)?;
79    xlsx::write_csv(&mut package, options, writer)
80}
81
82pub fn list_xlsx_sheets(path: impl AsRef<Path>) -> Result<Extraction<Vec<XlsxSheet>>> {
83    let file = File::open(path)?;
84    list_xlsx_sheets_from_reader(file)
85}
86
87pub fn list_xlsx_sheets_from_reader<R: Read + Seek>(
88    reader: R,
89) -> Result<Extraction<Vec<XlsxSheet>>> {
90    let mut package = OoxmlPackage::new(reader)?;
91    xlsx::list_sheets(&mut package)
92}
93
94pub fn detect_document_type(path: impl AsRef<Path>) -> Result<DocumentType> {
95    let file = File::open(path)?;
96    detect_document_type_from_reader(file)
97}
98
99pub fn detect_document_type_from_reader<R: Read>(mut reader: R) -> Result<DocumentType> {
100    let mut bytes = Vec::new();
101    reader.read_to_end(&mut bytes)?;
102    let mut package = OoxmlPackage::new(Cursor::new(bytes))?;
103    parsers::detect_document_type(&mut package)
104}
105
106pub fn read_info(path: impl AsRef<Path>) -> Result<Extraction<DocumentInfo>> {
107    let path = path.as_ref();
108    let file_name = path
109        .file_name()
110        .and_then(|name| name.to_str())
111        .unwrap_or_default()
112        .to_owned();
113
114    let file = File::open(path)?;
115    read_info_from_reader(file, file_name)
116}
117
118pub fn read_info_from_reader<R: Read + Seek>(
119    reader: R,
120    file_name: impl Into<String>,
121) -> Result<Extraction<DocumentInfo>> {
122    let mut package = OoxmlPackage::new(reader)?;
123    metadata::read_info(&mut package, file_name.into())
124}