oxidize_pdf/ai/chunking.rs
//! Document chunking for RAG (Retrieval Augmented Generation)
//!
//! This module provides functionality to split PDF documents into smaller chunks
//! suitable for processing with Large Language Models (LLMs). LLMs have token limits,
//! so long documents need to be split into manageable pieces while preserving context.
//!
//! # Example
//!
//! ```no_run
//! use oxidize_pdf::ai::DocumentChunker;
//! use oxidize_pdf::parser::{PdfReader, PdfDocument};
//!
//! # fn main() -> oxidize_pdf::Result<()> {
//! let reader = PdfReader::open("large_document.pdf")?;
//! let pdf_doc = PdfDocument::new(reader);
//! let text_pages = pdf_doc.extract_text()?;
//!
//! let chunker = DocumentChunker::new(512, 50); // 512 tokens, 50 overlap
//! let page_texts: Vec<(usize, String)> = text_pages.iter()
//!     .enumerate()
//!     .map(|(idx, page)| (idx + 1, page.text.clone()))
//!     .collect();
//! let chunks = chunker.chunk_text_with_pages(&page_texts)?;
//!
//! println!("Created {} chunks", chunks.len());
//! for chunk in &chunks {
//!     println!("Chunk {}: {} tokens", chunk.id, chunk.tokens);
//! }
//! # Ok(())
//! # }
//! ```

use crate::{Document, Result};

/// A chunk of a PDF document suitable for LLM processing
///
/// Each chunk represents a portion of the document's text with associated metadata
/// that helps maintain context during retrieval and generation.
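///
/// # Example
///
/// A minimal sketch of reading chunk fields (the input text is illustrative):
///
/// ```
/// use oxidize_pdf::ai::DocumentChunker;
///
/// let chunker = DocumentChunker::new(8, 2);
/// let chunks = chunker
///     .chunk_text("one two three four five six seven eight nine ten")
///     .unwrap();
/// for chunk in &chunks {
///     println!("{}: {} tokens on pages {:?}", chunk.id, chunk.tokens, chunk.page_numbers);
/// }
/// ```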
#[derive(Debug, Clone)]
pub struct DocumentChunk {
    /// Unique identifier for this chunk (e.g., "chunk_0", "chunk_1")
    pub id: String,

    /// The text content of this chunk
    pub content: String,

    /// Estimated number of tokens in this chunk
    pub tokens: usize,

    /// Page numbers where this chunk's content appears (1-indexed)
    pub page_numbers: Vec<usize>,

    /// Index of this chunk in the sequence (0-indexed)
    pub chunk_index: usize,

    /// Additional metadata for this chunk
    pub metadata: ChunkMetadata,
}

/// Metadata for a document chunk
#[derive(Debug, Clone, Default)]
pub struct ChunkMetadata {
    /// Position information about where this chunk appears in the document
    pub position: ChunkPosition,

    /// Confidence score for text extraction quality (0.0-1.0)
    /// 1.0 = high confidence, 0.0 = low confidence
    pub confidence: f32,

    /// Whether this chunk respects sentence boundaries
    pub sentence_boundary_respected: bool,
}

/// Position information for a chunk within the document
#[derive(Debug, Clone, Default)]
pub struct ChunkPosition {
    /// Character offset where this chunk starts in the full document text
    pub start_char: usize,

    /// Character offset where this chunk ends in the full document text
    pub end_char: usize,

    /// First page number where this chunk appears (1-indexed)
    pub first_page: usize,

    /// Last page number where this chunk appears (1-indexed)
    pub last_page: usize,
}

/// Configurable document chunker for splitting PDFs into LLM-friendly pieces
///
/// The chunker uses a simple fixed-size strategy with overlap to ensure context
/// is preserved between consecutive chunks.
///
/// # Example
///
/// ```no_run
/// use oxidize_pdf::ai::DocumentChunker;
///
/// // Create a chunker with 512 token chunks and 50 token overlap
/// let chunker = DocumentChunker::new(512, 50);
/// ```
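///
/// With `chunk_size = 5` and `overlap = 2`, the tokens `a b c d e f g h i j`
/// become the chunks `a b c d e`, `d e f g h`, and `g h i j`: each chunk
/// repeats the last two tokens of the previous one so that context is not
/// lost at chunk boundaries.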
#[derive(Debug, Clone)]
pub struct DocumentChunker {
    /// Target size for each chunk in tokens
    chunk_size: usize,

    /// Number of tokens to overlap between consecutive chunks
    overlap: usize,
}

impl DocumentChunker {
    /// Create a new document chunker with specified chunk size and overlap
    ///
    /// # Arguments
    ///
    /// * `chunk_size` - Target number of tokens per chunk (typical: 256-1024)
    /// * `overlap` - Number of tokens to overlap between chunks (typical: 10-100)
    ///
    /// # Example
    ///
    /// ```
    /// use oxidize_pdf::ai::DocumentChunker;
    ///
    /// // For GPT-3.5/4 with 4K context, use smaller chunks
    /// let chunker = DocumentChunker::new(512, 50);
    ///
    /// // For Claude with 100K context, you can use larger chunks
    /// let chunker_large = DocumentChunker::new(2048, 200);
    /// ```
    pub fn new(chunk_size: usize, overlap: usize) -> Self {
        Self {
            chunk_size,
            overlap,
        }
    }

    /// Create a default chunker with sensible defaults for most LLMs
    ///
    /// Uses 512 token chunks with 50 token overlap, which works well with
    /// GPT-3.5, GPT-4, and similar models.
    pub fn default() -> Self {
        Self::new(512, 50)
    }

    /// Chunk a PDF document into pieces suitable for LLM processing
    ///
    /// # Arguments
    ///
    /// * `doc` - The PDF document to chunk
    ///
    /// # Returns
    ///
    /// A vector of `DocumentChunk` objects, each containing a portion of the document.
    ///
    /// # Example
    ///
    /// ```no_run
    /// use oxidize_pdf::{Document, ai::DocumentChunker};
    ///
    /// # fn main() -> oxidize_pdf::Result<()> {
    /// let doc = Document::new();
    /// // Add pages to doc...
    /// let chunker = DocumentChunker::new(512, 50);
    /// let chunks = chunker.chunk_document(&doc)?;
    ///
    /// for chunk in chunks {
    ///     println!("Processing chunk {}: {} tokens", chunk.id, chunk.tokens);
    ///     // Send to LLM for processing...
    /// }
    /// # Ok(())
    /// # }
    /// ```
    pub fn chunk_document(&self, doc: &Document) -> Result<Vec<DocumentChunk>> {
        // Extract all text from the document
        let full_text = doc.extract_text()?;

        // Chunk the text
        self.chunk_text(&full_text)
    }

    /// Chunk a text string into fixed-size pieces with overlap
    ///
    /// This is the core chunking algorithm that:
    /// 1. Tokenizes the text (simple whitespace split)
    /// 2. Creates chunks of `chunk_size` tokens
    /// 3. Applies `overlap` tokens between consecutive chunks
    /// 4. Respects sentence boundaries when possible
    ///
    /// # Arguments
    ///
    /// * `text` - The text to chunk
    ///
    /// # Returns
    ///
    /// A vector of `DocumentChunk` objects
    ///
    /// # Example
    ///
    /// ```
    /// use oxidize_pdf::ai::DocumentChunker;
    ///
    /// let chunker = DocumentChunker::new(10, 2);
    /// let text = "This is some sample text that will be chunked into smaller pieces";
    /// let chunks = chunker.chunk_text(text).unwrap();
    /// println!("Created {} chunks", chunks.len());
    /// ```
    pub fn chunk_text(&self, text: &str) -> Result<Vec<DocumentChunk>> {
        // For simple text chunking, we don't have page information
        self.chunk_text_internal(text, &[], 0)
    }

    /// Chunk text with page information for accurate page tracking
    ///
    /// # Arguments
    ///
    /// * `page_texts` - Vector of (page_number, text) tuples (1-indexed page numbers)
    ///
    /// # Returns
    ///
    /// A vector of `DocumentChunk` objects with page tracking
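    ///
    /// # Example
    ///
    /// A minimal usage sketch, assuming page text has already been extracted
    /// (the page contents below are illustrative):
    ///
    /// ```
    /// use oxidize_pdf::ai::DocumentChunker;
    ///
    /// // Illustrative (page_number, text) pairs
    /// let pages = vec![
    ///     (1, "Text from the first page.".to_string()),
    ///     (2, "Text from the second page.".to_string()),
    /// ];
    ///
    /// let chunker = DocumentChunker::new(512, 50);
    /// let chunks = chunker.chunk_text_with_pages(&pages).unwrap();
    /// assert!(!chunks.is_empty());
    /// println!("First chunk covers pages {:?}", chunks[0].page_numbers);
    /// ```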
    pub fn chunk_text_with_pages(
        &self,
        page_texts: &[(usize, String)],
    ) -> Result<Vec<DocumentChunk>> {
        // Nothing to chunk without any pages
        if page_texts.is_empty() {
            return Ok(Vec::new());
        }

        // Combine all page texts with page markers
        let mut full_text = String::new();
        let mut page_boundaries = vec![0]; // Character positions where pages start

        for (_page_num, text) in page_texts {
            if !full_text.is_empty() {
                full_text.push_str("\n\n"); // Page separator
            }
            full_text.push_str(text);
            page_boundaries.push(full_text.len());
        }

        let first_page = page_texts[0].0;

        self.chunk_text_internal(&full_text, &page_boundaries, first_page)
    }

    /// Internal chunking implementation with page tracking
    fn chunk_text_internal(
        &self,
        text: &str,
        page_boundaries: &[usize],
        first_page: usize,
    ) -> Result<Vec<DocumentChunk>> {
        if text.is_empty() {
            return Ok(Vec::new());
        }

        // Tokenize: simple whitespace split for now
        // Enhancement: Use proper tokenizer (tiktoken) for accurate token counts
        // Priority: MEDIUM - Current whitespace split provides estimates
        // Accurate tokenization would require tiktoken-rs external dependency
        // Target: v1.7.0 for LLM integration improvements
        let tokens: Vec<&str> = text.split_whitespace().collect();

        if tokens.is_empty() {
            return Ok(Vec::new());
        }

        let mut chunks = Vec::new();
        let mut start = 0;
        let mut chunk_idx = 0;
        let mut char_offset = 0;

        while start < tokens.len() {
            // Calculate end position for this chunk
            let mut end = (start + self.chunk_size).min(tokens.len());

            // Try to respect sentence boundaries
            let sentence_boundary_respected = if end < tokens.len() && end > start {
                // Look for sentence endings in the last few tokens of this chunk,
                // never searching before `start` so the chunk cannot end up empty
                let window_start = end.saturating_sub(10).max(start);
                let mut found_boundary = false;

                for i in (window_start..end).rev() {
                    let token = tokens[i];
                    if token.ends_with('.') || token.ends_with('!') || token.ends_with('?') {
                        end = i + 1; // Include the sentence-ending token
                        found_boundary = true;
                        break;
                    }
                }
                found_boundary
            } else {
                false
            };

            // Extract chunk tokens
            let chunk_tokens = &tokens[start..end];

            // Join tokens back into text
            let content = chunk_tokens.join(" ");

            // Calculate character positions
            let start_char = char_offset;
            let end_char = char_offset + content.len();
            char_offset = end_char;

            // Determine page numbers for this chunk
            let (page_nums, first_pg, last_pg) = if page_boundaries.is_empty() {
                (Vec::new(), 0, 0)
            } else {
                let mut pages = Vec::new();
                let mut first = first_page;
                let mut last = first_page;

                for (idx, &boundary) in page_boundaries.iter().enumerate().skip(1) {
                    if start_char < boundary && end_char > page_boundaries[idx - 1] {
                        let page_num = first_page + idx - 1;
                        pages.push(page_num);
                        if pages.len() == 1 {
                            first = page_num;
                        }
                        last = page_num;
                    }
                }

                if pages.is_empty() {
                    // Chunk is beyond all tracked pages
                    pages.push(first_page);
                    first = first_page;
                    last = first_page;
                }

                (pages, first, last)
            };

            // Create chunk
            let chunk = DocumentChunk {
                id: format!("chunk_{}", chunk_idx),
                content,
                tokens: chunk_tokens.len(),
                page_numbers: page_nums,
                chunk_index: chunk_idx,
                metadata: ChunkMetadata {
                    position: ChunkPosition {
                        start_char,
                        end_char,
                        first_page: first_pg,
                        last_page: last_pg,
                    },
                    confidence: 1.0, // Default high confidence for text-based chunking
                    sentence_boundary_respected,
                },
            };

            chunks.push(chunk);
            chunk_idx += 1;

            // Move start position with overlap
            if end < tokens.len() {
                // Not at end yet, apply overlap
                let next_start = end.saturating_sub(self.overlap);

                // Ensure we make progress (avoid an infinite loop when the overlap
                // reaches back to or before the current start, e.g. when
                // overlap >= chunk_size or after a sentence-boundary cut)
                start = if next_start > start { next_start } else { end };
            } else {
                // Reached the end
                break;
            }
        }

        Ok(chunks)
    }

    /// Estimate the number of tokens in a text string
    ///
    /// Uses a simple approximation: 1 token ≈ 0.75 words (or ~1.33 tokens per word).
    /// This is reasonably accurate for English text with GPT models.
    ///
    /// # Arguments
    ///
    /// * `text` - The text to estimate tokens for
    ///
    /// # Returns
    ///
    /// Estimated number of tokens
    ///
    /// # Note
    ///
    /// This is an approximation. For exact token counts, integrate with
    /// a proper tokenizer like tiktoken.
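    ///
    /// # Example
    ///
    /// A quick sketch of the approximation (the input string is illustrative):
    ///
    /// ```
    /// use oxidize_pdf::ai::DocumentChunker;
    ///
    /// // 4 words * ~1.33 ≈ 5 tokens
    /// let estimate = DocumentChunker::estimate_tokens("estimate tokens for this");
    /// assert!(estimate >= 4 && estimate <= 6);
    /// ```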
    pub fn estimate_tokens(text: &str) -> usize {
        // Simple approximation: count words
        // 1 token ≈ 0.75 words for English text
        let words = text.split_whitespace().count();
        ((words as f32) * 1.33) as usize
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_basic_chunking() {
        let chunker = DocumentChunker::new(10, 2);

        // Create text with exactly 25 words
        let text = (0..25)
            .map(|i| format!("word{}", i))
            .collect::<Vec<_>>()
            .join(" ");

        let chunks = chunker.chunk_text(&text).unwrap();

        // Should create 3 chunks:
        // Chunk 0: words 0-9 (10 tokens)
        // Chunk 1: words 8-17 (10 tokens, overlap of 2)
        // Chunk 2: words 16-24 (9 tokens, overlap of 2)
        assert_eq!(chunks.len(), 3, "Should create 3 chunks");

        // Check first chunk
        assert_eq!(chunks[0].tokens, 10);
        assert_eq!(chunks[0].chunk_index, 0);
        assert_eq!(chunks[0].id, "chunk_0");
        assert_eq!(chunks[0].metadata.position.start_char, 0);

        // Check second chunk
        assert_eq!(chunks[1].tokens, 10);
        assert_eq!(chunks[1].chunk_index, 1);

        // Check third chunk
        assert_eq!(chunks[2].tokens, 9);
        assert_eq!(chunks[2].chunk_index, 2);
    }

    #[test]
    fn test_overlap_preserves_context() {
        let chunker = DocumentChunker::new(5, 2);

        // Text: "a b c d e f g h i j"
        let text = "a b c d e f g h i j";

        let chunks = chunker.chunk_text(&text).unwrap();

        // Chunk 0: a b c d e (positions 0-4)
        // Chunk 1: d e f g h (positions 3-7, overlap of 2: d e)
        // Chunk 2: g h i j (positions 6-9, overlap of 2: g h)

        // Check overlap between chunk 0 and 1
        let chunk0_end = chunks[0]
            .content
            .split_whitespace()
            .rev()
            .take(2)
            .collect::<Vec<_>>();
        let chunk1_start = chunks[1]
            .content
            .split_whitespace()
            .take(2)
            .collect::<Vec<_>>();

        assert_eq!(chunk0_end, vec!["e", "d"]);
        assert_eq!(chunk1_start, vec!["d", "e"]);
    }

    #[test]
    fn test_empty_text() {
        let chunker = DocumentChunker::new(10, 2);
        let chunks = chunker.chunk_text("").unwrap();
        assert_eq!(chunks.len(), 0);
    }

    #[test]
    fn test_text_smaller_than_chunk_size() {
        let chunker = DocumentChunker::new(100, 10);
        let text = "just a few words";

        let chunks = chunker.chunk_text(&text).unwrap();

        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].tokens, 4);
    }

    #[test]
    fn test_token_estimation() {
        // "hello world" = 2 words ≈ 2.66 tokens
        let tokens = DocumentChunker::estimate_tokens("hello world");
        assert!(
            tokens >= 2 && tokens <= 3,
            "Expected ~2-3 tokens, got {}",
            tokens
        );

        // Empty text
        assert_eq!(DocumentChunker::estimate_tokens(""), 0);

        // Longer text: 100 words ≈ 133 tokens
        let long_text = (0..100)
            .map(|i| format!("word{}", i))
            .collect::<Vec<_>>()
            .join(" ");
        let tokens_long = DocumentChunker::estimate_tokens(&long_text);
        assert!(
            tokens_long >= 120 && tokens_long <= 140,
            "Expected ~133 tokens, got {}",
            tokens_long
        );
    }

    #[test]
    fn test_chunk_ids_are_unique() {
        let chunker = DocumentChunker::new(5, 1);
        let text = (0..20)
            .map(|i| format!("word{}", i))
            .collect::<Vec<_>>()
            .join(" ");

        let chunks = chunker.chunk_text(&text).unwrap();

        let ids: Vec<String> = chunks.iter().map(|c| c.id.clone()).collect();
        let unique_ids: std::collections::HashSet<_> = ids.iter().collect();

        assert_eq!(
            ids.len(),
            unique_ids.len(),
            "All chunk IDs should be unique"
        );
    }

    #[test]
    fn test_sentence_boundary_detection() {
        let chunker = DocumentChunker::new(10, 2);

        let text = "This is the first sentence. This is the second sentence. This is the third sentence. And here is a fourth one.";

        let chunks = chunker.chunk_text(&text).unwrap();

        // At least some chunks should respect sentence boundaries
        let has_boundary_respect = chunks
            .iter()
            .any(|c| c.metadata.sentence_boundary_respected);
        assert!(
            has_boundary_respect,
            "At least some chunks should respect sentence boundaries"
        );

        // Check that sentences aren't broken in the middle (chunks should end with punctuation or be the last chunk)
        for (i, chunk) in chunks.iter().enumerate() {
            if i < chunks.len() - 1 && chunk.metadata.sentence_boundary_respected {
                assert!(
                    chunk.content.ends_with('.')
                        || chunk.content.ends_with('!')
                        || chunk.content.ends_with('?'),
                    "Chunk {} should end with sentence punctuation",
                    i
                );
            }
        }
    }

    #[test]
    fn test_page_tracking() {
        let chunker = DocumentChunker::new(10, 2);

        let page_texts = vec![
            (1, "This is page one content.".to_string()),
            (2, "This is page two content.".to_string()),
            (3, "This is page three content.".to_string()),
        ];

        let chunks = chunker.chunk_text_with_pages(&page_texts).unwrap();

        // All chunks should have page information
        for chunk in &chunks {
            assert!(
                !chunk.page_numbers.is_empty(),
                "Chunk should have page numbers"
            );
            assert!(
                chunk.metadata.position.first_page > 0,
                "First page should be > 0"
            );
            assert!(
                chunk.metadata.position.last_page > 0,
                "Last page should be > 0"
            );
        }

        // First chunk should start at page 1
        assert_eq!(
            chunks[0].metadata.position.first_page, 1,
            "First chunk should start at page 1"
        );
    }

    #[test]
    fn test_metadata_position_tracking() {
        let chunker = DocumentChunker::new(5, 1);

        let text = "word1 word2 word3 word4 word5 word6 word7 word8 word9 word10";

        let chunks = chunker.chunk_text(&text).unwrap();

        // Check that positions are sequential and non-overlapping in character space
        for i in 0..chunks.len() - 1 {
            assert!(
                chunks[i].metadata.position.end_char
                    <= chunks[i + 1].metadata.position.start_char + 10,
                "Chunks should have reasonable character positions"
            );
        }

        // First chunk should start at position 0
        assert_eq!(chunks[0].metadata.position.start_char, 0);

        // Each chunk should have a meaningful character range
        for chunk in &chunks {
            assert!(
                chunk.metadata.position.end_char > chunk.metadata.position.start_char,
                "End char should be greater than start char"
            );
        }
    }

    #[test]
    fn test_confidence_scores() {
        let chunker = DocumentChunker::new(10, 2);

        let text = "This is a test document with multiple sentences.";

        let chunks = chunker.chunk_text(&text).unwrap();

        // All chunks should have confidence scores
        for chunk in &chunks {
            assert!(
                chunk.metadata.confidence >= 0.0 && chunk.metadata.confidence <= 1.0,
                "Confidence should be between 0.0 and 1.0"
            );
        }
    }

    #[test]
    fn test_performance_100_pages() {
        use std::time::Instant;

        let chunker = DocumentChunker::new(512, 50);

        // Generate 100 pages with ~200 words each (typical PDF page)
        let page_texts: Vec<(usize, String)> = (1..=100)
            .map(|page_num| {
                let words: Vec<String> = (0..200).map(|i| format!("word{}", i)).collect();
                (page_num, words.join(" "))
            })
            .collect();

        let start = Instant::now();
        let chunks = chunker.chunk_text_with_pages(&page_texts).unwrap();
        let duration = start.elapsed();

        tracing::debug!("Chunked 100 pages in {:?}", duration);
        tracing::debug!("Created {} chunks", chunks.len());

        // Target: < 500ms for 100 pages (relaxed for debug builds)
        // In release mode this should be well under 100ms
        assert!(
            duration.as_millis() < 500,
            "Chunking 100 pages took {:?}, should be < 500ms",
            duration
        );
    }
}