use super::package::{DocError, Result};
use super::paragraph::{Paragraph, Run};
use super::parts::fib::FileInformationBlock;
use super::parts::text::TextExtractor;
use super::parts::paragraph_extractor::ParagraphExtractor;
use super::parts::fields::FieldsTable;
use super::parts::pap::ParagraphProperties;
use super::parts::chp::CharacterProperties;
use super::table::Table;
use super::super::OleFile;
#[cfg(feature = "formula")]
use crate::ole::mtef_extractor::MtefExtractor;
use std::collections::HashMap;
use std::io::{Read, Seek};
pub struct Document {
fib: FileInformationBlock,
word_document: Vec<u8>,
table_stream: Vec<u8>,
text_extractor: TextExtractor,
#[allow(dead_code)] fields_table: Option<FieldsTable>,
#[allow(dead_code)] mtef_data: std::collections::HashMap<String, Vec<u8>>,
#[cfg(feature = "formula")]
parsed_mtef: std::collections::HashMap<String, Vec<crate::formula::MathNode<'static>>>,
#[cfg(not(feature = "formula"))]
parsed_mtef: std::collections::HashMap<String, Vec<()>>,
}
impl Document {
/// Builds a [`Document`] from an already opened OLE compound file.
///
/// The `WordDocument` stream is read first and its File Information Block is
/// parsed; the FIB then decides whether `1Table` or `0Table` is the table
/// stream. MTEF formula data is extracted and parsed last.
///
/// # Errors
///
/// Returns `DocError::StreamNotFound` when either stream cannot be opened
/// (every open failure is reported that way), and propagates errors from FIB
/// parsing, text-extractor construction and MTEF extraction/parsing. A fields
/// table that fails to parse is tolerated and stored as `None`.
pub(crate) fn from_ole<R: Read + Seek>(ole: &mut OleFile<R>) -> Result<Self> {
    const MAIN_STREAM: &str = "WordDocument";

    let main_bytes = ole
        .open_stream(&[MAIN_STREAM])
        .map_err(|_| DocError::StreamNotFound(String::from(MAIN_STREAM)))?;
    let fib = FileInformationBlock::parse(&main_bytes)?;

    // The FIB flag selects which of the two table streams is in use.
    let table_name = if fib.which_table_stream() { "1Table" } else { "0Table" };
    let table_bytes = ole
        .open_stream(&[table_name])
        .map_err(|_| DocError::StreamNotFound(String::from(table_name)))?;

    let text_extractor = TextExtractor::new(&fib, &main_bytes, &table_bytes)?;
    // Best effort: a missing or broken fields table is not fatal.
    let fields_table = FieldsTable::parse(&fib, &table_bytes).ok();

    let mtef_data = Self::extract_mtef_data(ole)?;
    let parsed_mtef = Self::parse_all_mtef_data(&mtef_data)?;

    Ok(Self {
        fib,
        word_document: main_bytes,
        table_stream: table_bytes,
        text_extractor,
        fields_table,
        mtef_data,
        parsed_mtef,
    })
}
/// Gathers every raw MTEF payload that can be found in the OLE file.
///
/// Payloads from the object pool are collected first; afterwards a fixed list
/// of well-known stream names is probed directly. A direct stream that errors
/// or yields nothing is skipped silently; one that yields data is stored under
/// its stream name (replacing any object-pool entry with the same key).
///
/// # Errors
///
/// Returns `DocError::InvalidFormat` when the object-pool extraction fails.
#[cfg(feature = "formula")]
fn extract_mtef_data<R: Read + Seek>(ole: &mut OleFile<R>) -> Result<HashMap<String, Vec<u8>>> {
    const DIRECT_STREAM_NAMES: [&str; 3] = ["Equation Native", "MSWordEquation", "Equation.3"];

    let mut collected = MtefExtractor::extract_all_mtef_from_objectpool(ole)
        .map_err(|e| DocError::InvalidFormat(format!("Failed to extract MTEF data: {}", e)))?;

    for stream_name in &DIRECT_STREAM_NAMES {
        let probe = MtefExtractor::extract_mtef_from_stream(ole, &[stream_name]);
        if let Ok(Some(data)) = probe {
            collected.insert(stream_name.to_string(), data);
        }
    }

    Ok(collected)
}
/// Variant used when the `formula` feature is disabled: the OLE file is not
/// inspected and an empty map is always returned.
#[cfg(not(feature = "formula"))]
fn extract_mtef_data<R: Read + Seek>(_ole: &mut OleFile<R>) -> Result<HashMap<String, Vec<u8>>> {
Ok(HashMap::new())
}
/// Parses every raw MTEF payload into a formula AST, keyed by the same name.
///
/// For each entry of `mtef_data`:
/// * a payload the parser considers invalid is replaced by a single text node
///   `[Invalid MTEF format (N bytes)]`;
/// * a payload that fails to parse is replaced by a single text node
///   `[Formula parsing error: ...]`;
/// * a payload that parses to an empty node list is omitted from the result;
/// * otherwise the parsed nodes are stored.
///
/// The returned nodes are `'static`, so the backing `Formula` arena and a copy
/// of the payload bytes are deliberately leaked with `Box::leak`; that memory
/// is never reclaimed.
///
/// # Errors
///
/// This function currently never returns `Err`; parse failures are turned into
/// placeholder nodes instead.
#[cfg(feature = "formula")]
fn parse_all_mtef_data(mtef_data: &HashMap<String, Vec<u8>>) -> Result<HashMap<String, Vec<crate::formula::MathNode<'static>>>> {
    /// Builds a one-node AST holding `message`, backed by its own leaked arena.
    fn placeholder(message: &str) -> Vec<crate::formula::MathNode<'static>> {
        let holder = Box::leak(Box::new(crate::formula::Formula::new()));
        let text = holder.arena().alloc_str(message);
        vec![crate::formula::MathNode::Text(std::borrow::Cow::Borrowed(text))]
    }

    let mut parsed_mtef = HashMap::new();

    for (stream_name, data) in mtef_data {
        // Leak the arena and a private copy of the bytes so the parsed nodes
        // can borrow from them for the `'static` lifetime.
        let arena_owner = Box::leak(Box::new(crate::formula::Formula::new()));
        let leaked_bytes: &'static [u8] = Box::leak(data.clone().into_boxed_slice());
        let mut parser = crate::formula::MtefParser::new(arena_owner.arena(), leaked_bytes);

        if !parser.is_valid() {
            parsed_mtef.insert(
                stream_name.clone(),
                placeholder(&format!("[Invalid MTEF format ({} bytes)]", data.len())),
            );
            continue;
        }

        match parser.parse() {
            Ok(nodes) if !nodes.is_empty() => {
                parsed_mtef.insert(stream_name.clone(), nodes);
            }
            // An empty parse result carries no formula; leave the key absent.
            Ok(_) => {}
            Err(e) => {
                parsed_mtef.insert(
                    stream_name.clone(),
                    placeholder(&format!("[Formula parsing error: {}]", e)),
                );
            }
        }
    }

    Ok(parsed_mtef)
}
/// Variant used when the `formula` feature is disabled: the input is ignored
/// and an empty map is always returned.
#[cfg(not(feature = "formula"))]
fn parse_all_mtef_data(_mtef_data: &HashMap<String, Vec<u8>>) -> Result<HashMap<String, Vec<()>>> {
Ok(HashMap::new())
}
/// Heuristically decides whether `text` looks like it belongs to a formula.
///
/// After trimming surrounding whitespace the text qualifies when it
/// * mentions `MathType` or `MTExtra`, or
/// * contains a backslash or a curly brace, or
/// * is longer than 10 bytes and contains `^` or `_`.
fn is_potential_mtef_formula(text: &str) -> bool {
    const KEYWORDS: [&str; 2] = ["MathType", "MTExtra"];

    let candidate = text.trim();

    if KEYWORDS.iter().any(|&keyword| candidate.contains(keyword)) {
        return true;
    }
    if candidate.chars().any(|c| matches!(c, '\\' | '{' | '}')) {
        return true;
    }

    // Sub/superscript markers only count for longer snippets (byte length).
    candidate.len() > 10 && candidate.chars().any(|c| matches!(c, '^' | '_'))
}
/// Returns a clone of some non-empty parsed formula AST, if any exists.
///
/// `_text` is not consulted: the first non-empty entry encountered while
/// iterating `parsed_mtef` is returned. Because that map is a `HashMap`, which
/// entry comes first is arbitrary when several formulas are present.
#[cfg(feature = "formula")]
fn parse_mtef_for_text(&self, _text: &str) -> Option<Vec<crate::formula::MathNode<'static>>> {
    self.parsed_mtef
        .values()
        .find(|ast| !ast.is_empty())
        .cloned()
}
/// Variant used when the `formula` feature is disabled: no formula is ever
/// associated with a run, so this always returns `None`.
#[cfg(not(feature = "formula"))]
fn parse_mtef_for_text(&self, _text: &str) -> Option<Vec<()>> {
None
}
/// Returns the document text as produced by the text extractor.
///
/// # Errors
///
/// Propagates any error reported by `extract_all_text`.
pub fn text(&self) -> Result<String> {
self.text_extractor.extract_all_text()
}
/// Returns the number of lines in the extracted document text.
///
/// The count is derived from [`str::lines`] applied to the result of
/// `text()`; it does not go through the paragraph extractor.
///
/// # Errors
///
/// Propagates any error from `text()`.
pub fn paragraph_count(&self) -> Result<usize> {
    self.text().map(|full_text| full_text.lines().count())
}
/// Returns the number of tables in the document.
///
/// NOTE(review): this is currently a stub that always returns `Ok(0)`.
pub fn table_count(&self) -> Result<usize> {
Ok(0)
}
/// Returns a shared reference to the document's File Information Block.
#[inline]
pub fn fib(&self) -> &FileInformationBlock {
&self.fib
}
/// Extracts the paragraphs of every subdocument, in the order the FIB lists
/// the subdocument ranges.
///
/// Each non-empty character-position range reported by
/// `get_all_subdoc_ranges` is handed to a `ParagraphExtractor`, and the
/// extracted paragraphs are converted (including formula attachment) and
/// appended to the result. Ranges whose start is not below their end are
/// skipped.
///
/// # Errors
///
/// Propagates errors from `text()`, from constructing the paragraph extractor
/// and from `extract_paragraphs`.
pub fn paragraphs(&self) -> Result<Vec<Paragraph>> {
    let text = self.text()?;
    let mut all_paragraphs = Vec::new();

    for (_subdoc_name, start_cp, end_cp) in self.fib.get_all_subdoc_ranges() {
        // Empty or inverted ranges carry no text.
        if start_cp >= end_cp {
            continue;
        }

        // The extractor takes ownership of the text, hence one clone per range.
        let para_extractor = ParagraphExtractor::new_with_range(
            &self.fib,
            &self.table_stream,
            &self.word_document,
            text.clone(),
            (start_cp, end_cp),
        )?;
        let extracted_paras = para_extractor.extract_paragraphs()?;
        self.convert_to_paragraphs(extracted_paras, &mut all_paragraphs);
    }

    Ok(all_paragraphs)
}
/// Converts raw extractor output into [`Paragraph`] values appended to `output`.
///
/// For every run the formula lookup proceeds in this order:
/// 1. if the run has a positive `pic_offset`, the key `_<offset>` is looked up
///    in `parsed_mtef` and a clone of the match is attached;
/// 2. otherwise, if the run is flagged `is_ole2`, its text passes
///    `is_potential_mtef_formula`, and `parse_mtef_for_text` yields an AST,
///    that AST is attached;
/// 3. otherwise a plain run is created.
fn convert_to_paragraphs(
    &self,
    extracted_paras: Vec<(String, ParagraphProperties, Vec<(String, CharacterProperties)>)>,
    output: &mut Vec<Paragraph>,
) {
    for (para_text, para_props, runs) in extracted_paras {
        let mut run_objects: Vec<Run> = Vec::with_capacity(runs.len());

        for (text, props) in runs {
            // Formula linked to this run through its picture offset, if any.
            let linked_formula = props
                .pic_offset
                .filter(|&offset| offset > 0)
                .and_then(|offset| self.parsed_mtef.get(&format!("_{}", offset)));

            let run = if let Some(mtef_ast) = linked_formula {
                Run::with_mtef_formula(text, props, mtef_ast.clone())
            } else if props.is_ole2
                && Self::is_potential_mtef_formula(&text)
                && let Some(mtef_ast) = self.parse_mtef_for_text(&text)
            {
                Run::with_mtef_formula(text, props, mtef_ast)
            } else {
                Run::new(text, props)
            };

            run_objects.push(run);
        }

        let mut para = Paragraph::new(para_text);
        para.set_runs(run_objects);
        para.set_properties(para_props);
        output.push(para);
    }
}
/// Returns the tables of the document.
///
/// NOTE(review): this is currently a stub that always returns an empty list.
pub fn tables(&self) -> Result<Vec<Table>> {
Ok(Vec::new())
}
}
#[cfg(test)]
mod tests {
// No unit tests are defined in this module yet.
}