use std::io::{Read, Seek};
use crate::cfb::CfbReader;
use super::error::{DocError, Result};
use super::fib::Fib;
use super::images::{DocImage, extract_images};
use super::piece_table::{extract_text, parse_clx, sanitize_text};
#[derive(Debug)]
pub struct DocDocument {
text: String,
images: Vec<DocImage>,
}
impl DocDocument {
pub fn from_reader<R: Read + Seek>(reader: R) -> Result<Self> {
let mut cfb = CfbReader::new(reader)?;
let word_doc = cfb
.open_stream("WordDocument")
.map_err(|_| DocError::MissingStream("WordDocument stream not found".into()))?;
let fib = match Fib::parse(&word_doc) {
Ok(f) => f,
Err(_) => {
return Ok(Self {
text: String::new(),
images: Vec::new(),
});
}, };
let table_stream = if fib.use_table1 {
cfb.open_stream("1Table")
.or_else(|_| cfb.open_stream("0Table"))
} else {
cfb.open_stream("0Table")
.or_else(|_| cfb.open_stream("1Table"))
};
let table_stream = match table_stream {
Ok(s) => s,
Err(_) => {
return Ok(Self {
text: String::new(),
images: Vec::new(),
});
}, };
let clx_start = fib.clx_offset as usize;
let clx_end = clx_start + fib.clx_size as usize;
if clx_start >= table_stream.len()
|| clx_size_zero_or_oob(fib.clx_size, clx_start, table_stream.len())
{
return Ok(Self {
text: String::new(),
images: Vec::new(),
});
}
let clx_end = clx_end.min(table_stream.len());
let clx_data = &table_stream[clx_start..clx_end];
let pieces = match parse_clx(clx_data) {
Ok(p) => p,
Err(_) => {
return Ok(Self {
text: String::new(),
images: Vec::new(),
});
},
};
let raw_text = extract_text(&word_doc, &pieces, fib.text_len);
let text = sanitize_text(&raw_text);
let images = match cfb.open_stream("Data") {
Ok(data_stream) => extract_images(&data_stream),
Err(_) => Vec::new(),
};
Ok(Self { text, images })
}
pub fn open<P: AsRef<std::path::Path>>(path: P) -> Result<Self> {
let file = std::fs::File::open(path)?;
Self::from_reader(file)
}
pub fn images(&self) -> &[DocImage] {
&self.images
}
pub fn plain_text(&self) -> String {
self.text.clone()
}
pub fn plain_text_ref(&self) -> &str {
&self.text
}
pub fn to_markdown(&self) -> String {
let mut result = String::new();
let mut prev_empty = false;
for line in self.text.lines() {
let trimmed = line.trim();
if trimmed.is_empty() {
if !prev_empty {
result.push('\n');
}
prev_empty = true;
} else {
result.push_str(trimmed);
result.push_str("\n\n");
prev_empty = false;
}
}
result
}
}
fn clx_size_zero_or_oob(clx_size: u32, clx_start: usize, stream_len: usize) -> bool {
clx_size == 0 || clx_start + clx_size as usize > stream_len + 1024 }
impl crate::core::OfficeDocument for DocDocument {
fn plain_text(&self) -> String {
self.plain_text()
}
fn to_markdown(&self) -> String {
self.to_markdown()
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn markdown_double_spacing() {
let doc = DocDocument {
images: Vec::new(),
text: "First paragraph\nSecond paragraph\n\nAfter gap".into(),
};
let md = doc.to_markdown();
assert!(md.contains("First paragraph\n\n"));
assert!(md.contains("Second paragraph\n\n"));
assert!(md.contains("After gap\n\n"));
}
#[test]
fn plain_text_access() {
let doc = DocDocument {
images: Vec::new(),
text: "Hello World".into(),
};
assert_eq!(doc.plain_text(), "Hello World");
}
fn make_doc(text: &str) -> DocDocument {
DocDocument {
images: Vec::new(),
text: text.to_string(),
}
}
#[test]
fn ir_empty_doc_produces_empty_section() {
let ir = crate::convert_doc::doc_to_ir(&make_doc(""));
assert!(ir.sections[0].elements.is_empty());
assert!(ir.metadata.title.is_none());
}
#[test]
fn ir_allcaps_first_line_becomes_h1() {
use crate::ir::Element;
let ir = crate::convert_doc::doc_to_ir(&make_doc("INTRODUCTION\nSome text here."));
assert_eq!(ir.metadata.title.as_deref(), Some("INTRODUCTION"));
assert!(matches!(ir.sections[0].elements[0], Element::Heading(ref h) if h.level == 1));
}
#[test]
fn ir_first_short_line_no_punct_becomes_h1() {
use crate::ir::Element;
let ir = crate::convert_doc::doc_to_ir(&make_doc("My Document Title\nThis is body text."));
assert!(matches!(ir.sections[0].elements[0], Element::Heading(ref h) if h.level == 1));
}
#[test]
fn ir_allcaps_non_first_line_becomes_h2() {
use crate::ir::Element;
let ir = crate::convert_doc::doc_to_ir(&make_doc("Title\nSECTION TWO\nBody text."));
assert!(matches!(ir.sections[0].elements[1], Element::Heading(ref h) if h.level == 2));
}
#[test]
fn ir_line_ending_with_period_becomes_paragraph() {
use crate::ir::Element;
let ir = crate::convert_doc::doc_to_ir(&make_doc("This is a sentence."));
assert!(matches!(ir.sections[0].elements[0], Element::Paragraph(_)));
}
#[test]
fn ir_blank_lines_are_skipped() {
let ir = crate::convert_doc::doc_to_ir(&make_doc("Title\n\n\nText"));
assert_eq!(ir.sections[0].elements.len(), 2);
}
#[test]
fn ir_format_is_doc() {
let ir = crate::convert_doc::doc_to_ir(&make_doc("content"));
assert_eq!(ir.metadata.format, crate::format::DocumentFormat::Doc);
}
}