infiniloom_engine/document/
mod.rs1#![allow(clippy::result_large_err)]
12
13pub mod chunking;
14pub mod distillation;
15pub mod output;
16pub mod parsers;
17pub mod pii;
18pub mod types;
19
20pub use types::*;
21
22use std::path::Path;
23
24use crate::error::InfiniloomError;
25use crate::tokenizer::{TokenCounts, Tokenizer};
26
27pub fn count_document_tokens(doc: &mut Document) {
29 let tokenizer = Tokenizer::new();
30 let full_text = doc.full_text();
31 doc.token_count = tokenizer.count_all(&full_text);
32}
33
34pub fn count_output_tokens(output_text: &str) -> TokenCounts {
36 let tokenizer = Tokenizer::new();
37 tokenizer.count_all(output_text)
38}
39
/// Hard cap on input file size (100 MiB). [`parse_document`] rejects
/// larger files before reading any content into memory.
const MAX_DOCUMENT_SIZE: u64 = 100 * 1024 * 1024;
43
/// Parse a document from disk, dispatching on the file extension.
///
/// Binary container formats (DOCX always; PDF/XLSX behind feature flags)
/// are read as raw bytes and handed to their dedicated parsers; every other
/// supported format is read as UTF-8 text and routed through
/// [`parse_content`]. After parsing, the source path, a best-effort title,
/// and per-model token counts are filled in on the returned document.
///
/// # Errors
///
/// Returns an error when the extension is unsupported, the file cannot be
/// read, the file exceeds [`MAX_DOCUMENT_SIZE`], a required parser feature
/// is not compiled in, or the underlying parser fails.
pub fn parse_document(path: &Path, options: &ParseOptions) -> Result<Document, InfiniloomError> {
    // A missing or non-UTF-8 extension becomes "" and fails the lookup below.
    let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");

    let format = DocumentFormat::from_extension(ext).ok_or_else(|| {
        InfiniloomError::not_supported(format!("Unsupported document format: .{ext}"))
    })?;

    // Check the size via metadata before loading the whole file into memory.
    // NOTE(review): the size is taken before the read, so a file that grows
    // in between could exceed the cap (benign TOCTOU) — confirm acceptable.
    let file_size = std::fs::metadata(path)
        .map_err(|e| {
            InfiniloomError::invalid_input(format!("Failed to read {}: {e}", path.display()))
        })?
        .len();
    if file_size > MAX_DOCUMENT_SIZE {
        return Err(InfiniloomError::invalid_input(format!(
            "Document {} exceeds maximum size of {} bytes ({} bytes)",
            path.display(),
            MAX_DOCUMENT_SIZE,
            file_size,
        )));
    }

    let mut doc = if format == DocumentFormat::Docx {
        // DOCX is a binary (ZIP) container: read bytes, not UTF-8 text.
        let bytes = std::fs::read(path).map_err(|e| {
            InfiniloomError::invalid_input(format!("Failed to read {}: {e}", path.display()))
        })?;
        parsers::docx::parse(&bytes, options)?
    } else if format == DocumentFormat::Pdf {
        // PDF support is feature-gated; exactly one of the two cfg blocks
        // below is compiled in, so this `if` arm always yields or returns.
        #[cfg(feature = "document-pdf")]
        {
            let bytes = std::fs::read(path).map_err(|e| {
                InfiniloomError::invalid_input(format!("Failed to read {}: {e}", path.display()))
            })?;
            parsers::pdf::parse(&bytes, options)?
        }
        #[cfg(not(feature = "document-pdf"))]
        {
            return Err(InfiniloomError::not_supported(
                "PDF parsing requires the 'document-pdf' feature. \
                Rebuild with: cargo build --features document-pdf"
                    .to_owned(),
            ));
        }
    } else if format == DocumentFormat::Xlsx {
        // Same feature-gating pattern as PDF above.
        #[cfg(feature = "document-xlsx")]
        {
            let bytes = std::fs::read(path).map_err(|e| {
                InfiniloomError::invalid_input(format!("Failed to read {}: {e}", path.display()))
            })?;
            parsers::xlsx::parse(&bytes, options)?
        }
        #[cfg(not(feature = "document-xlsx"))]
        {
            return Err(InfiniloomError::not_supported(
                "XLSX parsing requires the 'document-xlsx' feature. \
                Rebuild with: cargo build --features document-xlsx"
                    .to_owned(),
            ));
        }
    } else {
        // Remaining formats are text-based: read UTF-8 and reuse the
        // in-memory content parser.
        let content = std::fs::read_to_string(path).map_err(|e| {
            InfiniloomError::invalid_input(format!("Failed to read {}: {e}", path.display()))
        })?;
        parse_content(&content, format, options)?
    };

    doc.source = path.to_path_buf();

    // Title fallback chain: parser-supplied title, then metadata title,
    // then the first section's title (may still end up None).
    if doc.title.is_none() {
        doc.title = doc.metadata.title.clone();
    }
    if doc.title.is_none() {
        doc.title = doc.sections.first().and_then(|s| s.title.clone());
    }

    count_document_tokens(&mut doc);

    Ok(doc)
}
127
128pub fn parse_content(
130 content: &str,
131 format: DocumentFormat,
132 options: &ParseOptions,
133) -> Result<Document, InfiniloomError> {
134 match format {
135 DocumentFormat::Markdown => parsers::markdown::parse(content, options),
136 DocumentFormat::PlainText => parsers::plaintext::parse(content, options),
137 DocumentFormat::Html => parsers::html::parse(content, options),
138 DocumentFormat::Csv => parsers::csv::parse(content, options),
139 _ => Err(InfiniloomError::not_supported(format!(
140 "Parser not yet implemented for {}",
141 format.name()
142 ))),
143 }
144}
145
/// Options controlling document parsing.
#[derive(Debug, Clone)]
pub struct ParseOptions {
    // Whether table content is extracted during parsing (default: true).
    pub extract_tables: bool,
    // Maximum structural depth honored by the parsers; defaults to 6,
    // presumably matching Markdown/HTML heading levels — TODO confirm.
    pub max_depth: u8,
    // Distillation aggressiveness applied to parsed content
    // (default: DistillationLevel::Balanced).
    pub distillation: DistillationLevel,
}
156
157impl Default for ParseOptions {
158 fn default() -> Self {
159 Self { extract_tables: true, max_depth: 6, distillation: DistillationLevel::Balanced }
160 }
161}
162
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_parse_content_markdown() {
        // Two headings => two sections.
        let md = "# Hello\n\nThis is a test.\n\n## Section 2\n\nMore text.";
        let parsed = parse_content(md, DocumentFormat::Markdown, &ParseOptions::default()).unwrap();
        assert_eq!(parsed.section_count(), 2);
    }

    #[test]
    fn test_parse_content_plaintext() {
        let text = "INTRODUCTION\n\nSome text here.\n\nCONCLUSION\n\nFinal text.";
        let parsed =
            parse_content(text, DocumentFormat::PlainText, &ParseOptions::default()).unwrap();
        // Plaintext sectioning is heuristic; at least one section must exist.
        assert!(parsed.section_count() >= 1);
    }

    #[test]
    fn test_unsupported_format() {
        // XLSX has no string-content parser, so this must fail.
        let outcome = parse_content("test", DocumentFormat::Xlsx, &ParseOptions::default());
        assert!(outcome.is_err());
    }

    #[test]
    fn test_count_document_tokens_populates_nonzero() {
        let content = "# Introduction\n\nThis is a document with enough text to generate tokens.\n\n## Details\n\nMore detailed content goes here with several words.";
        let mut parsed =
            parse_content(content, DocumentFormat::Markdown, &ParseOptions::default()).unwrap();
        // Parsing alone must not populate token counts.
        assert_eq!(parsed.token_count.claude, 0);

        count_document_tokens(&mut parsed);

        assert!(parsed.token_count.claude > 0, "Claude tokens should be non-zero");
        assert!(parsed.token_count.o200k > 0, "o200k tokens should be non-zero");
        assert!(parsed.token_count.gemini > 0, "Gemini tokens should be non-zero");
    }

    #[test]
    fn test_count_output_tokens_returns_reasonable_counts() {
        let sample = "This is a sample formatted output with several words and sentences for token counting.";
        let counts = count_output_tokens(sample);

        assert!(counts.claude > 0, "Claude tokens should be non-zero");
        assert!(counts.o200k > 0, "o200k tokens should be non-zero");
        assert!(counts.gemini > 0, "Gemini tokens should be non-zero");
        assert!(counts.claude < 100, "Token count should be reasonable");
    }
}