pub mod drawing;
pub mod parser;
pub mod section;
pub mod styles;
pub mod table;
pub mod theme;
use crate::error::{KreuzbergError, Result};
use crate::extraction::capacity;
use crate::types::PageBoundary;
use std::io::Cursor;
/// Upper bound on the uncompressed size of a single archive entry (100 MiB);
/// presumably enforced by the parser to guard against zip-bomb expansion — confirm in `parser`.
pub const MAX_UNCOMPRESSED_FILE_SIZE: u64 = 100 * 1024 * 1024;
/// Maximum number of entries accepted in a DOCX (ZIP) archive.
pub const MAX_ZIP_ENTRIES: usize = 10_000;
/// Cap on the combined uncompressed size of all archive entries (500 MiB).
pub const MAX_TOTAL_UNCOMPRESSED_SIZE: u64 = 500 * 1024 * 1024;
/// Largest embedded image file that will be processed (100 MiB).
pub const MAX_IMAGE_FILE_SIZE: u64 = 100 * 1024 * 1024;
/// English Metric Units per inch — the OOXML drawing coordinate unit (914,400 EMU/inch).
pub const EMUS_PER_INCH: i64 = 914_400;
/// English Metric Units per pixel at 96 DPI (914,400 / 96 = 9,525).
pub const EMUS_PER_PIXEL_96DPI: i64 = 9_525;
/// Extracts the plain-text content of a DOCX document given its raw bytes.
///
/// Thin wrapper around [`parser::extract_text_from_bytes`].
///
/// # Errors
/// Propagates any parsing error from the underlying parser (e.g. when the
/// bytes are not a valid DOCX archive).
pub fn extract_text(bytes: &[u8]) -> Result<String> {
    parser::extract_text_from_bytes(bytes)
}
/// Extracts the document text together with approximate page boundaries.
///
/// Page boundaries are derived from explicit `<w:br w:type="page"/>` markers in
/// `word/document.xml`; when the document contains none, `None` is returned for
/// the boundaries so callers can distinguish "no page info" from "one page".
///
/// # Errors
/// Fails if text extraction, ZIP access, or boundary mapping fails.
pub fn extract_text_with_page_breaks(bytes: &[u8]) -> Result<(String, Option<Vec<PageBoundary>>)> {
    let text = extract_text(bytes)?;
    let page_breaks = detect_page_breaks(bytes)?;
    // A document without explicit breaks yields no boundary information.
    let boundaries = if page_breaks.is_empty() {
        None
    } else {
        Some(map_page_breaks_to_boundaries(&text, page_breaks)?)
    };
    Ok((text, boundaries))
}
/// Best-effort page-break detection: returns the boundaries when available and
/// swallows any failure (logging it at debug level) instead of propagating it.
///
/// Always returns `Ok`; detection failure degrades to `Ok(None)`.
pub fn detect_page_breaks_from_docx(bytes: &[u8]) -> Result<Option<Vec<PageBoundary>>> {
    let boundaries = extract_text_with_page_breaks(bytes)
        .map(|(_text, boundaries)| boundaries)
        .unwrap_or_else(|e| {
            // Detection is advisory; never fail the caller over it.
            tracing::debug!("Page break detection failed: {}", e);
            None
        });
    Ok(boundaries)
}
/// Scans `word/document.xml` inside the DOCX archive for explicit page-break
/// markers and returns their byte offsets within the XML.
///
/// Note: the offsets are positions in the raw XML, not in the extracted text;
/// callers currently only use the *count* of breaks (see
/// `map_page_breaks_to_boundaries`).
///
/// Returns an empty `Vec` when the archive has no `word/document.xml` entry.
///
/// # Errors
/// Fails when the bytes are not a readable ZIP archive or the XML entry is not
/// valid UTF-8 / cannot be read.
fn detect_page_breaks(bytes: &[u8]) -> Result<Vec<usize>> {
    use zip::ZipArchive;
    let cursor = Cursor::new(bytes);
    let mut archive =
        ZipArchive::new(cursor).map_err(|e| KreuzbergError::parsing(format!("Failed to open DOCX as ZIP: {}", e)))?;
    let document_xml = match archive.by_name("word/document.xml") {
        Ok(mut file) => {
            let file_size = file.size();
            // Pre-size the buffer from the entry's declared size to avoid regrowth.
            let estimated_size = capacity::estimate_content_capacity(file_size, "docx").max(file_size as usize);
            let mut content = String::with_capacity(estimated_size);
            std::io::Read::read_to_string(&mut file, &mut content)
                .map_err(|e| KreuzbergError::parsing(format!("Failed to read document.xml: {}", e)))?;
            content
        }
        Err(_) => return Ok(Vec::new()),
    };
    // Both self-closing serializations are valid XML for the same element;
    // matching only the compact form would miss writers that emit a space
    // before "/>". The two patterns cannot match at the same index, so no
    // deduplication is needed — only sorting to keep offsets ascending.
    const PAGE_BREAK_PATTERNS: [&str; 2] = [r#"<w:br w:type="page"/>"#, r#"<w:br w:type="page" />"#];
    let mut breaks: Vec<usize> = PAGE_BREAK_PATTERNS
        .iter()
        .flat_map(|pattern| document_xml.match_indices(pattern).map(|(idx, _)| idx))
        .collect();
    breaks.sort_unstable();
    Ok(breaks)
}
/// Converts a set of page-break markers into contiguous byte-range boundaries
/// over `text`.
///
/// The break positions themselves are XML offsets and are not directly usable
/// against the extracted text, so only their *count* matters: the text is
/// split into `breaks + 1` pages of roughly equal character length, with every
/// cut landing on a UTF-8 character boundary. The last page absorbs any
/// remainder so the ranges exactly tile `0..text.len()`.
fn map_page_breaks_to_boundaries(text: &str, page_breaks: Vec<usize>) -> Result<Vec<PageBoundary>> {
    if page_breaks.is_empty() {
        return Ok(Vec::new());
    }
    let page_count = page_breaks.len() + 1;
    let total_chars = text.chars().count();
    let chars_per_page = total_chars / page_count;
    let mut boundaries = Vec::with_capacity(page_count);
    let mut cursor_byte = 0usize;
    let mut cursor_char = 0usize;
    for page in 1..=page_count {
        let begin = cursor_byte;
        let end = if page == page_count {
            // Final page: take everything that remains.
            text.len()
        } else {
            // Advance the cursor one char at a time until this page's target
            // character index is reached; len_utf8 keeps the byte offset on a
            // valid character boundary.
            let target = (page * chars_per_page).min(total_chars);
            while cursor_char < target {
                let Some(ch) = text[cursor_byte..].chars().next() else {
                    break;
                };
                cursor_byte += ch.len_utf8();
                cursor_char += 1;
            }
            cursor_byte
        };
        boundaries.push(PageBoundary {
            byte_start: begin,
            byte_end: end,
            page_number: page,
        });
        cursor_byte = end;
    }
    Ok(boundaries)
}
#[cfg(test)]
mod tests {
    use super::*;

    // Empty input is not a ZIP archive, so extraction must fail.
    #[test]
    fn test_extract_text_empty() {
        let result = extract_text(b"");
        assert!(result.is_err());
    }

    // Arbitrary non-ZIP bytes must also be rejected.
    #[test]
    fn test_extract_text_invalid() {
        let result = extract_text(b"not a docx file");
        assert!(result.is_err());
    }

    // With no breaks, the mapper returns no boundaries at all.
    #[test]
    fn test_map_page_breaks_to_boundaries_empty() {
        let result = map_page_breaks_to_boundaries("test text", Vec::new()).unwrap();
        assert!(result.is_empty());
    }

    // One break -> two contiguous pages covering the whole text.
    // (Break *positions* are ignored by the mapper; only the count matters.)
    #[test]
    fn test_map_page_breaks_to_boundaries_single_break() {
        let text = "Page 1 content here with some text.Page 2 content here with more text.";
        let breaks = vec![0];
        let result = map_page_breaks_to_boundaries(text, breaks).unwrap();
        assert_eq!(result.len(), 2);
        assert_eq!(result[0].page_number, 1);
        assert_eq!(result[0].byte_start, 0);
        assert!(result[0].byte_end > 0);
        assert!(result[0].byte_end < text.len());
        assert_eq!(result[1].page_number, 2);
        assert_eq!(result[1].byte_start, result[0].byte_end);
        assert_eq!(result[1].byte_end, text.len());
    }

    // Three breaks -> four pages; adjacent pages must share their edge
    // (no gaps, no overlaps) and the last page must end at text.len().
    #[test]
    fn test_map_page_breaks_to_boundaries_multiple_breaks() {
        let text = "A".repeat(300);
        let breaks = vec![0, 0, 0];
        let result = map_page_breaks_to_boundaries(&text, breaks).unwrap();
        assert_eq!(result.len(), 4);
        assert_eq!(result[0].page_number, 1);
        assert_eq!(result[3].page_number, 4);
        assert_eq!(result[3].byte_end, text.len());
        for i in 0..result.len() - 1 {
            assert_eq!(result[i].byte_end, result[i + 1].byte_start);
        }
    }

    // Mixed ASCII + Japanese: every boundary must land on a UTF-8 char boundary.
    #[test]
    fn test_map_page_breaks_to_boundaries_utf8_boundary() {
        let text = "Hello world! こんにちは世界! More text here.";
        let breaks = vec![0];
        let result = map_page_breaks_to_boundaries(text, breaks).unwrap();
        assert_eq!(result.len(), 2);
        assert!(text.is_char_boundary(result[0].byte_start));
        assert!(text.is_char_boundary(result[0].byte_end));
        assert!(text.is_char_boundary(result[1].byte_start));
        assert!(text.is_char_boundary(result[1].byte_end));
    }

    // Emoji are 4-byte UTF-8 sequences — verify boundaries never split one,
    // and that concatenating the page slices reconstructs the original text.
    #[test]
    fn test_docx_page_breaks_with_emoji() {
        let text = "Hello 😀 World 🌍 Foo 🎉 Bar";
        let breaks = vec![0, 0];
        let result = map_page_breaks_to_boundaries(text, breaks).unwrap();
        assert_eq!(result.len(), 3);
        assert_eq!(result[0].page_number, 1);
        assert_eq!(result[1].page_number, 2);
        assert_eq!(result[2].page_number, 3);
        for boundary in &result {
            assert!(
                text.is_char_boundary(boundary.byte_start),
                "byte_start {} is not a valid UTF-8 boundary",
                boundary.byte_start
            );
            assert!(
                text.is_char_boundary(boundary.byte_end),
                "byte_end {} is not a valid UTF-8 boundary",
                boundary.byte_end
            );
        }
        assert_eq!(result[0].byte_start, 0);
        assert_eq!(result[0].byte_end, result[1].byte_start);
        assert_eq!(result[1].byte_end, result[2].byte_start);
        assert_eq!(result[2].byte_end, text.len());
        let reconstructed = format!(
            "{}{}{}",
            &text[result[0].byte_start..result[0].byte_end],
            &text[result[1].byte_start..result[1].byte_end],
            &text[result[2].byte_start..result[2].byte_end]
        );
        assert_eq!(reconstructed, text);
    }

    // All-CJK text (3-byte chars): same boundary and reconstruction guarantees.
    #[test]
    fn test_docx_page_breaks_with_cjk() {
        let text = "你好世界你好世界你好世界你好世界";
        let breaks = vec![0];
        let result = map_page_breaks_to_boundaries(text, breaks).unwrap();
        assert_eq!(result.len(), 2);
        assert_eq!(result[0].page_number, 1);
        assert_eq!(result[1].page_number, 2);
        for boundary in &result {
            assert!(
                text.is_char_boundary(boundary.byte_start),
                "byte_start {} is not a valid UTF-8 boundary",
                boundary.byte_start
            );
            assert!(
                text.is_char_boundary(boundary.byte_end),
                "byte_end {} is not a valid UTF-8 boundary",
                boundary.byte_end
            );
        }
        assert_eq!(result[0].byte_start, 0);
        assert_eq!(result[0].byte_end, result[1].byte_start);
        assert_eq!(result[1].byte_end, text.len());
        let reconstructed = format!(
            "{}{}",
            &text[result[0].byte_start..result[0].byte_end],
            &text[result[1].byte_start..result[1].byte_end]
        );
        assert_eq!(reconstructed, text);
    }

    // Mix of 1-, 3-, and 4-byte UTF-8 sequences: pages must tile the text
    // exactly (no gap/overlap) and reconstruct it losslessly.
    #[test]
    fn test_docx_page_breaks_multibyte_utf8() {
        let text = "ASCII 😀 中文 hello 🎉 world 日本語";
        let breaks = vec![0, 0];
        let result = map_page_breaks_to_boundaries(text, breaks).unwrap();
        assert_eq!(result.len(), 3);
        for boundary in &result {
            assert!(
                text.is_char_boundary(boundary.byte_start),
                "byte_start {} is not a valid UTF-8 boundary",
                boundary.byte_start
            );
            assert!(
                text.is_char_boundary(boundary.byte_end),
                "byte_end {} is not a valid UTF-8 boundary",
                boundary.byte_end
            );
        }
        assert_eq!(result[0].byte_start, 0);
        for i in 0..result.len() - 1 {
            assert_eq!(
                result[i].byte_end,
                result[i + 1].byte_start,
                "Gap or overlap between page {} and {}",
                i + 1,
                i + 2
            );
        }
        assert_eq!(
            result[result.len() - 1].byte_end,
            text.len(),
            "Last page does not end at text boundary"
        );
        let mut reconstructed = String::new();
        for boundary in &result {
            reconstructed.push_str(&text[boundary.byte_start..boundary.byte_end]);
        }
        assert_eq!(reconstructed, text);
    }

    // Non-ZIP input must surface as an error from the break detector.
    #[test]
    fn test_detect_page_breaks_no_feature() {
        let result = detect_page_breaks(b"invalid");
        assert!(result.is_err());
    }

    // Integration-style check against a fixture document; deliberately
    // tolerant (skips silently) when the fixture is absent or unreadable.
    #[test]
    fn test_extract_text_with_page_breaks_no_breaks() {
        let docx_path =
            std::path::Path::new(env!("CARGO_MANIFEST_DIR")).join("../../test_documents/docx/lorem_ipsum.docx");
        if let Ok(bytes) = std::fs::read(docx_path) {
            let result = extract_text_with_page_breaks(&bytes);
            if let Ok((text, boundaries)) = result {
                assert!(!text.is_empty());
                if let Some(b) = boundaries {
                    assert!(!b.is_empty());
                }
            }
        }
    }
}