Skip to main content

oxdoc_core/
lib.rs

1//! Core OOXML extraction APIs for `oxdoc`.
2//!
3//! This crate reads Office Open XML packages without rendering them. It exposes
4//! path-based helpers for DOCX/PPTX text extraction, XLSX-to-CSV extraction,
5//! and package metadata. Extraction returns useful output plus recoverable warnings,
6//! while unrecoverable package and parser failures are returned as typed errors.
7//!
8//! ```no_run
9//! # fn demo() -> oxdoc_core::Result<()> {
10//! let extraction = oxdoc_core::extract_docx_text("contract.docx")?;
11//! println!("{}", extraction.value);
12//! # Ok(())
13//! # }
14//! ```
15//!
16//! The public API follows semantic versioning from 1.0 onward.
17
18mod error;
19pub mod models;
20mod parsers;
21pub mod vfs;
22
23use std::fs::File;
24use std::io::{Cursor, Read, Seek, Write};
25use std::path::Path;
26
27pub use error::{OxdocError, Result};
28pub use models::{
29    AuditSignal, DocumentAudit, DocumentInfo, DocumentType, Extraction, OutputWarning,
30    StructuredText, TextBlock, XlsxCsvOptions, XlsxSheet, XlsxSheetVisibility, XlsxValueMode,
31};
32#[doc(hidden)]
33pub use parsers::docx::fuzz_extract_text as fuzz_docx_text;
34#[doc(hidden)]
35pub use parsers::fuzz_parse_relationships as fuzz_relationships;
36#[doc(hidden)]
37pub use parsers::metadata::fuzz_parse_metadata as fuzz_metadata;
38#[doc(hidden)]
39pub use parsers::pptx::fuzz_extract_text as fuzz_pptx_text;
40#[doc(hidden)]
41pub use parsers::xlsx::{fuzz_parse_shared_strings, fuzz_parse_sheet};
42use parsers::{docx, metadata, pptx, xlsx};
43use vfs::OoxmlPackage;
44
45pub fn extract_docx_text(path: impl AsRef<Path>) -> Result<Extraction<String>> {
46    let file = File::open(path)?;
47    extract_docx_text_from_reader(file)
48}
49
50pub fn extract_docx_text_from_reader<R: Read + Seek>(reader: R) -> Result<Extraction<String>> {
51    let mut package = OoxmlPackage::new(reader)?;
52    docx::extract_text(&mut package)
53}
54
55pub fn extract_docx_structured_text(path: impl AsRef<Path>) -> Result<Extraction<StructuredText>> {
56    let file = File::open(path)?;
57    extract_docx_structured_text_from_reader(file)
58}
59
60pub fn extract_docx_structured_text_from_reader<R: Read + Seek>(
61    reader: R,
62) -> Result<Extraction<StructuredText>> {
63    let mut package = OoxmlPackage::new(reader)?;
64    docx::extract_structured_text(&mut package)
65}
66
67pub fn extract_pptx_text(path: impl AsRef<Path>) -> Result<Extraction<String>> {
68    let file = File::open(path)?;
69    extract_pptx_text_from_reader(file)
70}
71
72pub fn extract_pptx_text_from_reader<R: Read + Seek>(reader: R) -> Result<Extraction<String>> {
73    let mut package = OoxmlPackage::new(reader)?;
74    pptx::extract_text(&mut package)
75}
76
77pub fn extract_pptx_structured_text(path: impl AsRef<Path>) -> Result<Extraction<StructuredText>> {
78    let file = File::open(path)?;
79    extract_pptx_structured_text_from_reader(file)
80}
81
82pub fn extract_pptx_structured_text_from_reader<R: Read + Seek>(
83    reader: R,
84) -> Result<Extraction<StructuredText>> {
85    let mut package = OoxmlPackage::new(reader)?;
86    pptx::extract_structured_text(&mut package)
87}
88
89pub fn extract_xlsx_csv<W: Write>(
90    path: impl AsRef<Path>,
91    options: XlsxCsvOptions<'_>,
92    writer: W,
93) -> Result<Extraction<()>> {
94    let file = File::open(path)?;
95    extract_xlsx_csv_from_reader(file, options, writer)
96}
97
98pub fn extract_xlsx_csv_with_value_mode<W: Write>(
99    path: impl AsRef<Path>,
100    options: XlsxCsvOptions<'_>,
101    value_mode: XlsxValueMode,
102    writer: W,
103) -> Result<Extraction<()>> {
104    let file = File::open(path)?;
105    extract_xlsx_csv_from_reader_with_value_mode(file, options, value_mode, writer)
106}
107
108pub fn extract_xlsx_csv_from_reader<R: Read + Seek, W: Write>(
109    reader: R,
110    options: XlsxCsvOptions<'_>,
111    writer: W,
112) -> Result<Extraction<()>> {
113    extract_xlsx_csv_from_reader_with_value_mode(reader, options, XlsxValueMode::Raw, writer)
114}
115
116pub fn extract_xlsx_csv_from_reader_with_value_mode<R: Read + Seek, W: Write>(
117    reader: R,
118    options: XlsxCsvOptions<'_>,
119    value_mode: XlsxValueMode,
120    writer: W,
121) -> Result<Extraction<()>> {
122    let mut package = OoxmlPackage::new(reader)?;
123    xlsx::write_csv(&mut package, options, value_mode, writer)
124}
125
126pub fn list_xlsx_sheets(path: impl AsRef<Path>) -> Result<Extraction<Vec<XlsxSheet>>> {
127    let file = File::open(path)?;
128    list_xlsx_sheets_from_reader(file)
129}
130
131pub fn list_xlsx_sheets_from_reader<R: Read + Seek>(
132    reader: R,
133) -> Result<Extraction<Vec<XlsxSheet>>> {
134    list_xlsx_sheets_from_reader_with_hidden(reader, false)
135}
136
137pub fn list_xlsx_sheets_with_hidden(
138    path: impl AsRef<Path>,
139    include_hidden: bool,
140) -> Result<Extraction<Vec<XlsxSheet>>> {
141    let file = File::open(path)?;
142    list_xlsx_sheets_from_reader_with_hidden(file, include_hidden)
143}
144
145pub fn list_xlsx_sheets_from_reader_with_hidden<R: Read + Seek>(
146    reader: R,
147    include_hidden: bool,
148) -> Result<Extraction<Vec<XlsxSheet>>> {
149    let mut package = OoxmlPackage::new(reader)?;
150    xlsx::list_sheets(&mut package, include_hidden)
151}
152
153pub fn detect_document_type(path: impl AsRef<Path>) -> Result<DocumentType> {
154    let file = File::open(path)?;
155    detect_document_type_from_reader(file)
156}
157
158pub fn detect_document_type_from_reader<R: Read>(mut reader: R) -> Result<DocumentType> {
159    let mut bytes = Vec::new();
160    reader.read_to_end(&mut bytes)?;
161    let mut package = OoxmlPackage::new(Cursor::new(bytes))?;
162    parsers::detect_document_type(&mut package)
163}
164
165pub fn read_info(path: impl AsRef<Path>) -> Result<Extraction<DocumentInfo>> {
166    let path = path.as_ref();
167    let file_name = path
168        .file_name()
169        .and_then(|name| name.to_str())
170        .unwrap_or_default()
171        .to_owned();
172
173    let file = File::open(path)?;
174    read_info_from_reader(file, file_name)
175}
176
177pub fn read_info_from_reader<R: Read + Seek>(
178    reader: R,
179    file_name: impl Into<String>,
180) -> Result<Extraction<DocumentInfo>> {
181    let mut package = OoxmlPackage::new(reader)?;
182    metadata::read_info(&mut package, file_name.into())
183}
184
185pub fn read_audit(path: impl AsRef<Path>) -> Result<Extraction<DocumentAudit>> {
186    let path = path.as_ref();
187    let file_name = path
188        .file_name()
189        .and_then(|name| name.to_str())
190        .unwrap_or_default()
191        .to_owned();
192
193    let file = File::open(path)?;
194    read_audit_from_reader(file, file_name)
195}
196
197pub fn read_audit_from_reader<R: Read + Seek>(
198    reader: R,
199    file_name: impl Into<String>,
200) -> Result<Extraction<DocumentAudit>> {
201    let mut package = OoxmlPackage::new(reader)?;
202    parsers::audit::read_audit(&mut package, file_name.into())
203}