// infiniloom_engine/document/mod.rs
1//! Document ingestion module for converting human-readable documents into
2//! LLM-optimized structured formats.
3//!
4//! This module provides:
5//! - **Type system**: `Document`, `Section`, `ContentBlock` for representing document structure
6//! - **Parsers**: Format-specific parsers (Markdown, HTML, plain text, CSV, DOCX, PDF)
7//! - **Distillation**: Content compression pipeline that removes filler and optimizes for LLM attention
8//! - **Output**: Document-specific formatters for Claude (XML), GPT (Markdown), agents (JSON)
9
10// InfiniloomError is the project-wide error type; its size is not our concern here.
11#![allow(clippy::result_large_err)]
12
13pub mod chunking;
14pub mod distillation;
15pub mod output;
16pub mod parsers;
17pub mod pii;
18pub mod types;
19
20pub use types::*;
21
22use std::path::Path;
23
24use crate::error::InfiniloomError;
25use crate::tokenizer::{TokenCounts, Tokenizer};
26
27/// Count tokens for a document's full text content across all model families.
28pub fn count_document_tokens(doc: &mut Document) {
29    let tokenizer = Tokenizer::new();
30    let full_text = doc.full_text();
31    doc.token_count = tokenizer.count_all(&full_text);
32}
33
34/// Count tokens for formatted output text across all model families.
35pub fn count_output_tokens(output_text: &str) -> TokenCounts {
36    let tokenizer = Tokenizer::new();
37    tokenizer.count_all(output_text)
38}
39
/// Hard cap on document file size: 100 MB.
/// Files larger than this are rejected before being read, so a huge input
/// cannot trigger an unbounded memory allocation.
const MAX_DOCUMENT_SIZE: u64 = 100 * 1024 * 1024;

44/// Parse a document from a file path, auto-detecting the format.
45pub fn parse_document(path: &Path, options: &ParseOptions) -> Result<Document, InfiniloomError> {
46    let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
47
48    let format = DocumentFormat::from_extension(ext).ok_or_else(|| {
49        InfiniloomError::not_supported(format!("Unsupported document format: .{ext}"))
50    })?;
51
52    // Check file size before reading to prevent unbounded memory allocation.
53    let file_size = std::fs::metadata(path)
54        .map_err(|e| {
55            InfiniloomError::invalid_input(format!("Failed to read {}: {e}", path.display()))
56        })?
57        .len();
58    if file_size > MAX_DOCUMENT_SIZE {
59        return Err(InfiniloomError::invalid_input(format!(
60            "Document {} exceeds maximum size of {} bytes ({} bytes)",
61            path.display(),
62            MAX_DOCUMENT_SIZE,
63            file_size,
64        )));
65    }
66
67    // DOCX, PDF, and XLSX are binary formats — read as bytes, not as a UTF-8 string.
68    let mut doc = if format == DocumentFormat::Docx {
69        let bytes = std::fs::read(path).map_err(|e| {
70            InfiniloomError::invalid_input(format!("Failed to read {}: {e}", path.display()))
71        })?;
72        parsers::docx::parse(&bytes, options)?
73    } else if format == DocumentFormat::Pdf {
74        #[cfg(feature = "document-pdf")]
75        {
76            let bytes = std::fs::read(path).map_err(|e| {
77                InfiniloomError::invalid_input(format!("Failed to read {}: {e}", path.display()))
78            })?;
79            parsers::pdf::parse(&bytes, options)?
80        }
81        #[cfg(not(feature = "document-pdf"))]
82        {
83            return Err(InfiniloomError::not_supported(
84                "PDF parsing requires the 'document-pdf' feature. \
85                 Rebuild with: cargo build --features document-pdf"
86                    .to_owned(),
87            ));
88        }
89    } else if format == DocumentFormat::Xlsx {
90        #[cfg(feature = "document-xlsx")]
91        {
92            let bytes = std::fs::read(path).map_err(|e| {
93                InfiniloomError::invalid_input(format!("Failed to read {}: {e}", path.display()))
94            })?;
95            parsers::xlsx::parse(&bytes, options)?
96        }
97        #[cfg(not(feature = "document-xlsx"))]
98        {
99            return Err(InfiniloomError::not_supported(
100                "XLSX parsing requires the 'document-xlsx' feature. \
101                 Rebuild with: cargo build --features document-xlsx"
102                    .to_owned(),
103            ));
104        }
105    } else {
106        let content = std::fs::read_to_string(path).map_err(|e| {
107            InfiniloomError::invalid_input(format!("Failed to read {}: {e}", path.display()))
108        })?;
109        parse_content(&content, format, options)?
110    };
111
112    doc.source = path.to_path_buf();
113
114    // Extract title from metadata or first heading
115    if doc.title.is_none() {
116        doc.title = doc.metadata.title.clone();
117    }
118    if doc.title.is_none() {
119        doc.title = doc.sections.first().and_then(|s| s.title.clone());
120    }
121
122    // Populate token counts for the parsed document
123    count_document_tokens(&mut doc);
124
125    Ok(doc)
126}
127
128/// Parse document content from a string with a known format.
129pub fn parse_content(
130    content: &str,
131    format: DocumentFormat,
132    options: &ParseOptions,
133) -> Result<Document, InfiniloomError> {
134    match format {
135        DocumentFormat::Markdown => parsers::markdown::parse(content, options),
136        DocumentFormat::PlainText => parsers::plaintext::parse(content, options),
137        DocumentFormat::Html => parsers::html::parse(content, options),
138        DocumentFormat::Csv => parsers::csv::parse(content, options),
139        _ => Err(InfiniloomError::not_supported(format!(
140            "Parser not yet implemented for {}",
141            format.name()
142        ))),
143    }
144}
145
146/// Options for document parsing.
147#[derive(Debug, Clone)]
148pub struct ParseOptions {
149    /// Extract tables from content
150    pub extract_tables: bool,
151    /// Maximum heading depth to track
152    pub max_depth: u8,
153    /// Distillation level to apply after parsing
154    pub distillation: DistillationLevel,
155}
156
157impl Default for ParseOptions {
158    fn default() -> Self {
159        Self { extract_tables: true, max_depth: 6, distillation: DistillationLevel::Balanced }
160    }
161}
162
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_parse_content_markdown() {
        let md = "# Hello\n\nThis is a test.\n\n## Section 2\n\nMore text.";
        let parsed =
            parse_content(md, DocumentFormat::Markdown, &ParseOptions::default()).unwrap();
        // Two headings in the input should yield two sections.
        assert_eq!(parsed.section_count(), 2);
    }

    #[test]
    fn test_parse_content_plaintext() {
        let txt = "INTRODUCTION\n\nSome text here.\n\nCONCLUSION\n\nFinal text.";
        let parsed =
            parse_content(txt, DocumentFormat::PlainText, &ParseOptions::default()).unwrap();
        assert!(parsed.section_count() >= 1);
    }

    #[test]
    fn test_unsupported_format() {
        // XLSX has no string parser, so parse_content must refuse it.
        let outcome = parse_content("test", DocumentFormat::Xlsx, &ParseOptions::default());
        assert!(outcome.is_err());
    }

    #[test]
    fn test_count_document_tokens_populates_nonzero() {
        let md = "# Introduction\n\nThis is a document with enough text to generate tokens.\n\n## Details\n\nMore detailed content goes here with several words.";
        let mut parsed =
            parse_content(md, DocumentFormat::Markdown, &ParseOptions::default()).unwrap();
        // parse_content leaves counts at zero; only parse_document runs counting.
        assert_eq!(parsed.token_count.claude, 0);

        count_document_tokens(&mut parsed);

        assert!(parsed.token_count.claude > 0, "Claude tokens should be non-zero");
        assert!(parsed.token_count.o200k > 0, "o200k tokens should be non-zero");
        assert!(parsed.token_count.gemini > 0, "Gemini tokens should be non-zero");
    }

    #[test]
    fn test_count_output_tokens_returns_reasonable_counts() {
        let sample = "This is a sample formatted output with several words and sentences for token counting.";
        let counts = count_output_tokens(sample);

        assert!(counts.claude > 0, "Claude tokens should be non-zero");
        assert!(counts.o200k > 0, "o200k tokens should be non-zero");
        assert!(counts.gemini > 0, "Gemini tokens should be non-zero");
        // ~90 characters of input must not explode into thousands of tokens.
        assert!(counts.claude < 100, "Token count should be reasonable");
    }
}