// oxidize_pdf/ai/chunking.rs

//! Document chunking for RAG (Retrieval Augmented Generation)
//!
//! This module provides functionality to split PDF documents into smaller chunks
//! suitable for processing with Large Language Models (LLMs). LLMs have token limits,
//! so long documents need to be split into manageable pieces while preserving context.
//!
//! # Example
//!
//! ```no_run
//! use oxidize_pdf::ai::DocumentChunker;
//! use oxidize_pdf::parser::{PdfReader, PdfDocument};
//!
//! # fn main() -> oxidize_pdf::Result<()> {
//! let reader = PdfReader::open("large_document.pdf")?;
//! let pdf_doc = PdfDocument::new(reader);
//! let text_pages = pdf_doc.extract_text()?;
//!
//! let chunker = DocumentChunker::new(512, 50);  // 512 tokens, 50 overlap
//! let page_texts: Vec<(usize, String)> = text_pages.iter()
//!     .enumerate()
//!     .map(|(idx, page)| (idx + 1, page.text.clone()))
//!     .collect();
//! let chunks = chunker.chunk_text_with_pages(&page_texts)?;
//!
//! println!("Created {} chunks", chunks.len());
//! for chunk in &chunks {
//!     println!("Chunk {}: {} tokens", chunk.id, chunk.tokens);
//! }
//! # Ok(())
//! # }
//! ```

use crate::{Document, Result};

/// A chunk of a PDF document suitable for LLM processing
///
/// Each chunk represents a portion of the document's text with associated metadata
/// that helps maintain context during retrieval and generation.
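///
/// # Example
///
/// A short illustration of the fields, using `DocumentChunker::chunk_text` on an
/// arbitrary string:
///
/// ```
/// use oxidize_pdf::ai::DocumentChunker;
///
/// let chunker = DocumentChunker::new(512, 50);
/// let chunks = chunker.chunk_text("Some extracted PDF text.").unwrap();
/// for chunk in &chunks {
///     println!(
///         "{}: {} tokens at chars {}..{}",
///         chunk.id,
///         chunk.tokens,
///         chunk.metadata.position.start_char,
///         chunk.metadata.position.end_char
///     );
/// }
/// ```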
#[derive(Debug, Clone)]
pub struct DocumentChunk {
    /// Unique identifier for this chunk (e.g., "chunk_0", "chunk_1")
    pub id: String,

    /// The text content of this chunk
    pub content: String,

    /// Estimated number of tokens in this chunk
    pub tokens: usize,

    /// Page numbers where this chunk's content appears (1-indexed)
    pub page_numbers: Vec<usize>,

    /// Index of this chunk in the sequence (0-indexed)
    pub chunk_index: usize,

    /// Additional metadata for this chunk
    pub metadata: ChunkMetadata,
}

/// Metadata for a document chunk
#[derive(Debug, Clone, Default)]
pub struct ChunkMetadata {
    /// Position information about where this chunk appears in the document
    pub position: ChunkPosition,

    /// Confidence score for text extraction quality (0.0-1.0)
    /// 1.0 = high confidence, 0.0 = low confidence
    pub confidence: f32,

    /// Whether this chunk respects sentence boundaries
    pub sentence_boundary_respected: bool,
}

/// Position information for a chunk within the document
#[derive(Debug, Clone, Default)]
pub struct ChunkPosition {
    /// Character offset where this chunk starts in the full document text
    pub start_char: usize,

    /// Character offset where this chunk ends in the full document text
    pub end_char: usize,

    /// First page number where this chunk appears (1-indexed)
    pub first_page: usize,

    /// Last page number where this chunk appears (1-indexed)
    pub last_page: usize,
}

/// Configurable document chunker for splitting PDFs into LLM-friendly pieces
///
/// The chunker uses a simple fixed-size strategy with overlap to ensure context
/// is preserved between consecutive chunks.
///
/// # Example
///
/// ```no_run
/// use oxidize_pdf::ai::DocumentChunker;
///
/// // Create a chunker with 512 token chunks and 50 token overlap
/// let chunker = DocumentChunker::new(512, 50);
/// ```
#[derive(Debug, Clone)]
pub struct DocumentChunker {
    /// Target size for each chunk in tokens
    chunk_size: usize,

    /// Number of tokens to overlap between consecutive chunks
    overlap: usize,
}

impl DocumentChunker {
    /// Create a new document chunker with specified chunk size and overlap
    ///
    /// # Arguments
    ///
    /// * `chunk_size` - Target number of tokens per chunk (typical: 256-1024)
    /// * `overlap` - Number of tokens to overlap between chunks (typical: 10-100)
    ///
    /// # Example
    ///
    /// ```
    /// use oxidize_pdf::ai::DocumentChunker;
    ///
    /// // For GPT-3.5/4 with 4K context, use smaller chunks
    /// let chunker = DocumentChunker::new(512, 50);
    ///
    /// // For Claude with 100K context, you can use larger chunks
    /// let chunker_large = DocumentChunker::new(2048, 200);
    /// ```
    pub fn new(chunk_size: usize, overlap: usize) -> Self {
        Self {
            chunk_size,
            overlap,
        }
    }

    /// Create a chunker with sensible defaults for most LLMs
    ///
    /// Uses 512-token chunks with a 50-token overlap, which works well with
    /// GPT-3.5, GPT-4, and similar models.
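    ///
    /// # Example
    ///
    /// Creating a chunker with the default settings:
    ///
    /// ```
    /// use oxidize_pdf::ai::DocumentChunker;
    ///
    /// // Equivalent to DocumentChunker::new(512, 50)
    /// let chunker = DocumentChunker::default();
    /// ```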
    pub fn default() -> Self {
        Self::new(512, 50)
    }

    /// Chunk a PDF document into pieces suitable for LLM processing
    ///
    /// # Arguments
    ///
    /// * `doc` - The PDF document to chunk
    ///
    /// # Returns
    ///
    /// A vector of `DocumentChunk` objects, each containing a portion of the document.
    ///
    /// # Example
    ///
    /// ```no_run
    /// use oxidize_pdf::{Document, ai::DocumentChunker};
    ///
    /// # fn main() -> oxidize_pdf::Result<()> {
    /// let doc = Document::new();
    /// // Add pages to doc...
    /// let chunker = DocumentChunker::new(512, 50);
    /// let chunks = chunker.chunk_document(&doc)?;
    ///
    /// for chunk in chunks {
    ///     println!("Processing chunk {}: {} tokens", chunk.id, chunk.tokens);
    ///     // Send to LLM for processing...
    /// }
    /// # Ok(())
    /// # }
    /// ```
    pub fn chunk_document(&self, doc: &Document) -> Result<Vec<DocumentChunk>> {
        // Extract all text from the document
        let full_text = doc.extract_text()?;

        // Chunk the text
        self.chunk_text(&full_text)
    }

    /// Chunk a text string into fixed-size pieces with overlap
    ///
    /// This is the core chunking algorithm that:
    /// 1. Tokenizes the text (simple whitespace split)
    /// 2. Creates chunks of `chunk_size` tokens
    /// 3. Applies `overlap` tokens between consecutive chunks
    /// 4. Respects sentence boundaries when possible
    ///
    /// # Arguments
    ///
    /// * `text` - The text to chunk
    ///
    /// # Returns
    ///
    /// A vector of `DocumentChunk` objects
    ///
    /// # Example
    ///
    /// ```
    /// use oxidize_pdf::ai::DocumentChunker;
    ///
    /// let chunker = DocumentChunker::new(10, 2);
    /// let text = "This is some sample text that will be chunked into smaller pieces";
    /// let chunks = chunker.chunk_text(text).unwrap();
    /// println!("Created {} chunks", chunks.len());
    /// ```
    pub fn chunk_text(&self, text: &str) -> Result<Vec<DocumentChunk>> {
        // For simple text chunking, we don't have page information
        self.chunk_text_internal(text, &[], 0)
    }

    /// Chunk text with page information for accurate page tracking
    ///
    /// # Arguments
    ///
    /// * `page_texts` - Vector of (page_number, text) tuples (1-indexed page numbers)
    ///
    /// # Returns
    ///
    /// A vector of `DocumentChunk` objects with page tracking
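    ///
    /// # Example
    ///
    /// Chunking two short pages (the sample text here is arbitrary):
    ///
    /// ```
    /// use oxidize_pdf::ai::DocumentChunker;
    ///
    /// let chunker = DocumentChunker::new(512, 50);
    /// let page_texts = vec![
    ///     (1, "Text extracted from page one.".to_string()),
    ///     (2, "Text extracted from page two.".to_string()),
    /// ];
    /// let chunks = chunker.chunk_text_with_pages(&page_texts).unwrap();
    /// for chunk in &chunks {
    ///     println!("{} covers pages {:?}", chunk.id, chunk.page_numbers);
    /// }
    /// ```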
    pub fn chunk_text_with_pages(
        &self,
        page_texts: &[(usize, String)],
    ) -> Result<Vec<DocumentChunk>> {
        // Nothing to chunk; an empty input would otherwise panic when the
        // first page number is read below.
        if page_texts.is_empty() {
            return Ok(Vec::new());
        }

        // Combine all page texts with page markers
        let mut full_text = String::new();
        let mut page_boundaries = vec![0]; // Character offsets: start of page 1, then end of each page

        for (_page_num, text) in page_texts {
            if !full_text.is_empty() {
                full_text.push_str("\n\n"); // Page separator
            }
            full_text.push_str(text);
            page_boundaries.push(full_text.len());
        }

        let page_numbers: Vec<usize> = page_texts.iter().map(|(num, _)| *num).collect();

        self.chunk_text_internal(&full_text, &page_boundaries, page_numbers[0])
    }

    /// Internal chunking implementation with page tracking
    fn chunk_text_internal(
        &self,
        text: &str,
        page_boundaries: &[usize],
        first_page: usize,
    ) -> Result<Vec<DocumentChunk>> {
        if text.is_empty() {
            return Ok(Vec::new());
        }

        // Tokenize: simple whitespace split for now
        // Enhancement: Use proper tokenizer (tiktoken) for accurate token counts
        // Priority: MEDIUM - Current whitespace split provides estimates
        // Accurate tokenization would require tiktoken-rs external dependency
        // Target: v1.7.0 for LLM integration improvements
        let tokens: Vec<&str> = text.split_whitespace().collect();

        if tokens.is_empty() {
            return Ok(Vec::new());
        }

        let mut chunks = Vec::new();
        let mut start = 0;
        let mut chunk_idx = 0;
        let mut char_offset = 0;

        while start < tokens.len() {
            // Calculate end position for this chunk
            let mut end = (start + self.chunk_size).min(tokens.len());

            // Try to respect sentence boundaries
            let sentence_boundary_respected = if end < tokens.len() && end > start {
                // Look for sentence endings in the last few tokens of the
                // chunk, never searching before the chunk's own start
                let search_window = (end.saturating_sub(10).max(start + 1)..end).rev();
                let mut found_boundary = false;

                for i in search_window {
                    let token = tokens[i];
                    if token.ends_with('.') || token.ends_with('!') || token.ends_with('?') {
                        end = i + 1; // Include the sentence-ending token
                        found_boundary = true;
                        break;
                    }
                }
                found_boundary
            } else {
                false
            };

            // Extract chunk tokens
            let chunk_tokens = &tokens[start..end];

            // Join tokens back into text
            let content = chunk_tokens.join(" ");

            // Calculate character positions (cumulative over chunk contents;
            // approximate, since overlap and original whitespace are not compensated for)
            let start_char = char_offset;
            let end_char = char_offset + content.len();
            char_offset = end_char;

            // Determine page numbers for this chunk
            let (page_nums, first_pg, last_pg) = if page_boundaries.is_empty() {
                (Vec::new(), 0, 0)
            } else {
                let mut pages = Vec::new();
                let mut first = first_page;
                let mut last = first_page;

                // A chunk is attributed to every page whose character range it
                // overlaps; page numbers are assumed to be consecutive,
                // starting at `first_page`.
                for (idx, &boundary) in page_boundaries.iter().enumerate().skip(1) {
                    if start_char < boundary && end_char > page_boundaries[idx - 1] {
                        let page_num = first_page + idx - 1;
                        pages.push(page_num);
                        if pages.len() == 1 {
                            first = page_num;
                        }
                        last = page_num;
                    }
                }

                if pages.is_empty() {
                    // Chunk is beyond all tracked pages
                    pages.push(first_page);
                    first = first_page;
                    last = first_page;
                }

                (pages, first, last)
            };

            // Create chunk
            let chunk = DocumentChunk {
                id: format!("chunk_{}", chunk_idx),
                content,
                tokens: chunk_tokens.len(),
                page_numbers: page_nums.clone(),
                chunk_index: chunk_idx,
                metadata: ChunkMetadata {
                    position: ChunkPosition {
                        start_char,
                        end_char,
                        first_page: first_pg,
                        last_page: last_pg,
                    },
                    confidence: 1.0, // Default high confidence for text-based chunking
                    sentence_boundary_respected,
                },
            };

            chunks.push(chunk);
            chunk_idx += 1;

            // Move start position with overlap
            if end < tokens.len() {
                // Not at end yet, apply overlap
                let next_start = end.saturating_sub(self.overlap);

                // Ensure we always make progress (avoids an infinite loop when
                // the overlap reaches back to or before the current start,
                // e.g. after a sentence-boundary adjustment shrinks the chunk)
                start = if next_start > start { next_start } else { end };
            } else {
                // Reached the end
                break;
            }
        }

        Ok(chunks)
    }

    /// Estimate the number of tokens in a text string
    ///
    /// Uses a simple approximation: 1 token ≈ 0.75 words (or ~1.33 tokens per word).
    /// This is reasonably accurate for English text with GPT models.
    ///
    /// # Arguments
    ///
    /// * `text` - The text to estimate tokens for
    ///
    /// # Returns
    ///
    /// Estimated number of tokens
    ///
    /// # Note
    ///
    /// This is an approximation. For exact token counts, integrate with
    /// a proper tokenizer like tiktoken.
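    ///
    /// # Example
    ///
    /// Estimating tokens for a 100-word string:
    ///
    /// ```
    /// use oxidize_pdf::ai::DocumentChunker;
    ///
    /// // 100 words estimate to roughly 133 tokens (100 * 1.33)
    /// let text = vec!["word"; 100].join(" ");
    /// let tokens = DocumentChunker::estimate_tokens(&text);
    /// assert!(tokens >= 120 && tokens <= 140);
    /// ```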
    pub fn estimate_tokens(text: &str) -> usize {
        // Simple approximation: count words
        // 1 token ≈ 0.75 words for English text
        let words = text.split_whitespace().count();
        ((words as f32) * 1.33) as usize
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_basic_chunking() {
        let chunker = DocumentChunker::new(10, 2);

        // Create text with exactly 25 words
        let text = (0..25)
            .map(|i| format!("word{}", i))
            .collect::<Vec<_>>()
            .join(" ");

        let chunks = chunker.chunk_text(&text).unwrap();

        // Should create 3 chunks:
        // Chunk 0: words 0-9 (10 tokens)
        // Chunk 1: words 8-17 (10 tokens, overlap of 2)
        // Chunk 2: words 16-24 (9 tokens, overlap of 2)
        assert_eq!(chunks.len(), 3, "Should create 3 chunks");

        // Check first chunk
        assert_eq!(chunks[0].tokens, 10);
        assert_eq!(chunks[0].chunk_index, 0);
        assert_eq!(chunks[0].id, "chunk_0");
        assert_eq!(chunks[0].metadata.position.start_char, 0);

        // Check second chunk
        assert_eq!(chunks[1].tokens, 10);
        assert_eq!(chunks[1].chunk_index, 1);

        // Check third chunk
        assert_eq!(chunks[2].tokens, 9);
        assert_eq!(chunks[2].chunk_index, 2);
    }

    #[test]
    fn test_overlap_preserves_context() {
        let chunker = DocumentChunker::new(5, 2);

        // Text: "a b c d e f g h i j"
        let text = "a b c d e f g h i j";

        let chunks = chunker.chunk_text(&text).unwrap();

        // Chunk 0: a b c d e (positions 0-4)
        // Chunk 1: d e f g h (positions 3-7, overlap of 2: d e)
        // Chunk 2: g h i j (positions 6-9, overlap of 2: g h)

        // Check overlap between chunk 0 and 1
        let chunk0_end = chunks[0]
            .content
            .split_whitespace()
            .rev()
            .take(2)
            .collect::<Vec<_>>();
        let chunk1_start = chunks[1]
            .content
            .split_whitespace()
            .take(2)
            .collect::<Vec<_>>();

        assert_eq!(chunk0_end, vec!["e", "d"]);
        assert_eq!(chunk1_start, vec!["d", "e"]);
    }

    #[test]
    fn test_empty_text() {
        let chunker = DocumentChunker::new(10, 2);
        let chunks = chunker.chunk_text("").unwrap();
        assert_eq!(chunks.len(), 0);
    }

    #[test]
    fn test_text_smaller_than_chunk_size() {
        let chunker = DocumentChunker::new(100, 10);
        let text = "just a few words";

        let chunks = chunker.chunk_text(&text).unwrap();

        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].tokens, 4);
    }

    #[test]
    fn test_token_estimation() {
        // "hello world" = 2 words ≈ 2.66 tokens
        let tokens = DocumentChunker::estimate_tokens("hello world");
        assert!(
            tokens >= 2 && tokens <= 3,
            "Expected ~2-3 tokens, got {}",
            tokens
        );

        // Empty text
        assert_eq!(DocumentChunker::estimate_tokens(""), 0);

        // Longer text: 100 words ≈ 133 tokens
        let long_text = (0..100)
            .map(|i| format!("word{}", i))
            .collect::<Vec<_>>()
            .join(" ");
        let tokens_long = DocumentChunker::estimate_tokens(&long_text);
        assert!(
            tokens_long >= 120 && tokens_long <= 140,
            "Expected ~133 tokens, got {}",
            tokens_long
        );
    }

    #[test]
    fn test_chunk_ids_are_unique() {
        let chunker = DocumentChunker::new(5, 1);
        let text = (0..20)
            .map(|i| format!("word{}", i))
            .collect::<Vec<_>>()
            .join(" ");

        let chunks = chunker.chunk_text(&text).unwrap();

        let ids: Vec<String> = chunks.iter().map(|c| c.id.clone()).collect();
        let unique_ids: std::collections::HashSet<_> = ids.iter().collect();

        assert_eq!(
            ids.len(),
            unique_ids.len(),
            "All chunk IDs should be unique"
        );
    }

    #[test]
    fn test_sentence_boundary_detection() {
        let chunker = DocumentChunker::new(10, 2);

        let text = "This is the first sentence. This is the second sentence. This is the third sentence. And here is a fourth one.";

        let chunks = chunker.chunk_text(&text).unwrap();

        // At least some chunks should respect sentence boundaries
        let has_boundary_respect = chunks
            .iter()
            .any(|c| c.metadata.sentence_boundary_respected);
        assert!(
            has_boundary_respect,
            "At least some chunks should respect sentence boundaries"
        );

        // Check that sentences aren't broken in the middle (chunks should end with punctuation or be the last chunk)
        for (i, chunk) in chunks.iter().enumerate() {
            if i < chunks.len() - 1 && chunk.metadata.sentence_boundary_respected {
                assert!(
                    chunk.content.ends_with('.')
                        || chunk.content.ends_with('!')
                        || chunk.content.ends_with('?'),
                    "Chunk {} should end with sentence punctuation",
                    i
                );
            }
        }
    }

    #[test]
    fn test_page_tracking() {
        let chunker = DocumentChunker::new(10, 2);

        let page_texts = vec![
            (1, "This is page one content.".to_string()),
            (2, "This is page two content.".to_string()),
            (3, "This is page three content.".to_string()),
        ];

        let chunks = chunker.chunk_text_with_pages(&page_texts).unwrap();

        // All chunks should have page information
        for chunk in &chunks {
            assert!(
                !chunk.page_numbers.is_empty(),
                "Chunk should have page numbers"
            );
            assert!(
                chunk.metadata.position.first_page > 0,
                "First page should be > 0"
            );
            assert!(
                chunk.metadata.position.last_page > 0,
                "Last page should be > 0"
            );
        }

        // First chunk should start at page 1
        assert_eq!(
            chunks[0].metadata.position.first_page, 1,
            "First chunk should start at page 1"
        );
    }

    #[test]
    fn test_metadata_position_tracking() {
        let chunker = DocumentChunker::new(5, 1);

        let text = "word1 word2 word3 word4 word5 word6 word7 word8 word9 word10";

        let chunks = chunker.chunk_text(&text).unwrap();

        // Check that positions are sequential and non-overlapping in character space
        for i in 0..chunks.len() - 1 {
            assert!(
                chunks[i].metadata.position.end_char
                    <= chunks[i + 1].metadata.position.start_char + 10,
                "Chunks should have reasonable character positions"
            );
        }

        // First chunk should start at position 0
        assert_eq!(chunks[0].metadata.position.start_char, 0);

        // Each chunk should have a meaningful character range
        for chunk in &chunks {
            assert!(
                chunk.metadata.position.end_char > chunk.metadata.position.start_char,
                "End char should be greater than start char"
            );
        }
    }

    #[test]
    fn test_confidence_scores() {
        let chunker = DocumentChunker::new(10, 2);

        let text = "This is a test document with multiple sentences.";

        let chunks = chunker.chunk_text(&text).unwrap();

        // All chunks should have confidence scores
        for chunk in &chunks {
            assert!(
                chunk.metadata.confidence >= 0.0 && chunk.metadata.confidence <= 1.0,
                "Confidence should be between 0.0 and 1.0"
            );
        }
    }

    #[test]
    fn test_performance_100_pages() {
        use std::time::Instant;

        let chunker = DocumentChunker::new(512, 50);

        // Generate 100 pages with ~200 words each (typical PDF page)
        let page_texts: Vec<(usize, String)> = (1..=100)
            .map(|page_num| {
                let words: Vec<String> = (0..200).map(|i| format!("word{}", i)).collect();
                (page_num, words.join(" "))
            })
            .collect();

        let start = Instant::now();
        let chunks = chunker.chunk_text_with_pages(&page_texts).unwrap();
        let duration = start.elapsed();

        tracing::debug!("Chunked 100 pages in {:?}", duration);
        tracing::debug!("Created {} chunks", chunks.len());

        // Target: < 500ms for 100 pages (relaxed for debug builds)
        // In release mode this should be well under 100ms
        assert!(
            duration.as_millis() < 500,
            "Chunking 100 pages took {:?}, should be < 500ms",
            duration
        );
    }
}