// converge_knowledge/ingest/pdf.rs
1//! PDF document ingestion and text extraction.
2//!
3//! This module provides functionality to extract text content from PDF files,
4//! chunk the content by pages or sections, and extract metadata such as title,
5//! author, and creation date.
6//!
7//! # Example
8//!
9//! ```rust,no_run
10//! use converge_knowledge::ingest::{PdfIngester, PdfDocument};
11//! use std::path::Path;
12//!
13//! let ingester = PdfIngester::new();
14//! let doc = ingester.ingest_file(Path::new("document.pdf")).unwrap();
15//!
16//! println!("Title: {:?}", doc.title);
17//! println!("Pages: {}", doc.page_count);
18//! for chunk in &doc.chunks {
19//!     println!("Page {}: {}", chunk.page_number, &chunk.content[..100.min(chunk.content.len())]);
20//! }
21//! ```
22
23use crate::error::{Error, Result};
24use serde::{Deserialize, Serialize};
25use std::collections::HashMap;
26use std::fs;
27use std::path::{Path, PathBuf};
28use tracing::{debug, info, warn};
29
/// Default maximum chunk size in characters.
///
/// Used when no custom [`PdfIngesterConfig::max_chunk_size`] is supplied.
const DEFAULT_MAX_CHUNK_SIZE: usize = 4000;

/// Minimum chunk size to avoid creating very small chunks.
///
/// Chunks shorter than this are merged into an adjacent chunk on the same
/// page during post-processing.
const MIN_CHUNK_SIZE: usize = 100;
35
/// A parsed PDF document with extracted content and metadata.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PdfDocument {
    /// Original file path the document was ingested from.
    pub path: PathBuf,

    /// Document title extracted from metadata, if present.
    pub title: Option<String>,

    /// Document author extracted from metadata, if present.
    pub author: Option<String>,

    /// Additional metadata key-value pairs (e.g. "creator", "producer",
    /// "creation_date", "modification_date").
    pub metadata: HashMap<String, String>,

    /// Extracted content chunks, in document order.
    pub chunks: Vec<PdfChunk>,

    /// Total number of pages in the document (estimated from text length
    /// when the extracted text carries no explicit page-break markers).
    pub page_count: usize,
}
57
58impl PdfDocument {
59    /// Create a new empty PDF document.
60    fn new(path: PathBuf) -> Self {
61        Self {
62            path,
63            title: None,
64            author: None,
65            metadata: HashMap::new(),
66            chunks: Vec::new(),
67            page_count: 0,
68        }
69    }
70
71    /// Get the total character count across all chunks.
72    pub fn total_chars(&self) -> usize {
73        self.chunks.iter().map(|c| c.content.len()).sum()
74    }
75
76    /// Get all content as a single string.
77    pub fn full_text(&self) -> String {
78        self.chunks
79            .iter()
80            .map(|c| c.content.as_str())
81            .collect::<Vec<_>>()
82            .join("\n\n")
83    }
84
85    /// Get chunks for a specific page.
86    pub fn chunks_for_page(&self, page: usize) -> Vec<&PdfChunk> {
87        self.chunks
88            .iter()
89            .filter(|c| c.page_number == page)
90            .collect()
91    }
92}
93
/// A chunk of content extracted from a PDF page.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PdfChunk {
    /// The extracted text content of this chunk.
    pub content: String,

    /// The page number this chunk came from (1-indexed).
    pub page_number: usize,

    /// Index of this chunk within the page (for multi-chunk pages),
    /// starting at 0.
    pub chunk_index: usize,
}
106
107impl PdfChunk {
108    /// Create a new PDF chunk.
109    fn new(content: String, page_number: usize, chunk_index: usize) -> Self {
110        Self {
111            content,
112            page_number,
113            chunk_index,
114        }
115    }
116
117    /// Check if the chunk is empty or contains only whitespace.
118    pub fn is_empty(&self) -> bool {
119        self.content.trim().is_empty()
120    }
121
122    /// Get the character count of the content.
123    pub fn len(&self) -> usize {
124        self.content.len()
125    }
126}
127
/// Configuration for PDF ingestion.
///
/// Construct via [`Default`] and customize with the `with_*` builder methods.
#[derive(Debug, Clone)]
pub struct PdfIngesterConfig {
    /// Maximum size of each chunk in characters (default: 4000).
    pub max_chunk_size: usize,

    /// Whether to preserve original line breaks (default: false).
    pub preserve_line_breaks: bool,

    /// Whether to attempt to detect and handle multi-column layouts
    /// (default: true).
    pub handle_multi_column: bool,

    /// Whether to extract metadata from the PDF (default: true).
    pub extract_metadata: bool,

    /// Minimum chunk size; smaller chunks will be merged into an adjacent
    /// chunk on the same page (default: 100).
    pub min_chunk_size: usize,
}
146
147impl Default for PdfIngesterConfig {
148    fn default() -> Self {
149        Self {
150            max_chunk_size: DEFAULT_MAX_CHUNK_SIZE,
151            preserve_line_breaks: false,
152            handle_multi_column: true,
153            extract_metadata: true,
154            min_chunk_size: MIN_CHUNK_SIZE,
155        }
156    }
157}
158
159impl PdfIngesterConfig {
160    /// Create a new configuration with custom max chunk size.
161    pub fn with_max_chunk_size(mut self, size: usize) -> Self {
162        self.max_chunk_size = size;
163        self
164    }
165
166    /// Set whether to preserve line breaks.
167    pub fn with_preserve_line_breaks(mut self, preserve: bool) -> Self {
168        self.preserve_line_breaks = preserve;
169        self
170    }
171
172    /// Set whether to handle multi-column layouts.
173    pub fn with_multi_column_handling(mut self, handle: bool) -> Self {
174        self.handle_multi_column = handle;
175        self
176    }
177}
178
/// PDF document ingester for extracting text and metadata.
///
/// Holds only its [`PdfIngesterConfig`], so it is cheap to clone.
#[derive(Debug, Clone)]
pub struct PdfIngester {
    /// Options controlling chunking, text cleanup, and metadata extraction.
    config: PdfIngesterConfig,
}
184
185impl Default for PdfIngester {
186    fn default() -> Self {
187        Self::new()
188    }
189}
190
191impl PdfIngester {
192    /// Create a new PDF ingester with default configuration.
193    pub fn new() -> Self {
194        Self {
195            config: PdfIngesterConfig::default(),
196        }
197    }
198
    /// Create a new PDF ingester with custom configuration.
    ///
    /// See [`PdfIngesterConfig`] for the available options.
    pub fn with_config(config: PdfIngesterConfig) -> Self {
        Self { config }
    }
203
204    /// Ingest a single PDF file.
205    ///
206    /// # Arguments
207    ///
208    /// * `path` - Path to the PDF file
209    ///
210    /// # Returns
211    ///
212    /// A `PdfDocument` containing the extracted content and metadata.
213    ///
214    /// # Errors
215    ///
216    /// Returns an error if the file cannot be read or parsed as a PDF.
217    pub fn ingest_file(&self, path: &Path) -> Result<PdfDocument> {
218        info!(?path, "Ingesting PDF file");
219
220        // Validate file exists and is a PDF
221        if !path.exists() {
222            return Err(Error::ingest(format!("File not found: {}", path.display())));
223        }
224
225        if path.extension().and_then(|e| e.to_str()) != Some("pdf") {
226            warn!(?path, "File does not have .pdf extension");
227        }
228
229        // Read the file
230        let data = fs::read(path).map_err(|e| {
231            Error::ingest(format!("Failed to read PDF file {}: {}", path.display(), e))
232        })?;
233
234        self.ingest_bytes(&data, path.to_path_buf())
235    }
236
237    /// Ingest PDF content from raw bytes.
238    ///
239    /// # Arguments
240    ///
241    /// * `data` - Raw PDF file bytes
242    /// * `path` - Path to associate with the document (for reference)
243    ///
244    /// # Returns
245    ///
246    /// A `PdfDocument` containing the extracted content and metadata.
247    pub fn ingest_bytes(&self, data: &[u8], path: PathBuf) -> Result<PdfDocument> {
248        let mut doc = PdfDocument::new(path.clone());
249
250        // Extract text using pdf-extract
251        let text = pdf_extract::extract_text_from_mem(data).map_err(|e| {
252            Error::ingest(format!(
253                "Failed to extract text from PDF {}: {}",
254                path.display(),
255                e
256            ))
257        })?;
258
259        // Try to extract metadata
260        if self.config.extract_metadata {
261            self.extract_metadata_from_bytes(data, &mut doc);
262        }
263
264        // Process the extracted text
265        let processed_text = self.process_text(&text);
266
267        // Estimate page count from text (pdf-extract doesn't provide page info directly)
268        // We use form feed characters or estimate based on text length
269        doc.page_count = self.estimate_page_count(&text);
270
271        // Create chunks
272        doc.chunks = self.create_chunks(&processed_text, doc.page_count);
273
274        info!(
275            path = %path.display(),
276            pages = doc.page_count,
277            chunks = doc.chunks.len(),
278            chars = doc.total_chars(),
279            "PDF ingestion complete"
280        );
281
282        Ok(doc)
283    }
284
285    /// Extract metadata from PDF bytes.
286    fn extract_metadata_from_bytes(&self, data: &[u8], doc: &mut PdfDocument) {
287        // Try to parse with lopdf for metadata (pdf-extract doesn't expose metadata)
288        // We'll do a simple scan for common metadata patterns in the PDF
289        let text = String::from_utf8_lossy(data);
290
291        // Look for common PDF metadata patterns
292        if let Some(title) = self.extract_metadata_field(&text, "Title") {
293            doc.title = Some(title);
294            doc.metadata
295                .insert("title".to_string(), doc.title.clone().unwrap_or_default());
296        }
297
298        if let Some(author) = self.extract_metadata_field(&text, "Author") {
299            doc.author = Some(author);
300            doc.metadata
301                .insert("author".to_string(), doc.author.clone().unwrap_or_default());
302        }
303
304        if let Some(creator) = self.extract_metadata_field(&text, "Creator") {
305            doc.metadata.insert("creator".to_string(), creator);
306        }
307
308        if let Some(producer) = self.extract_metadata_field(&text, "Producer") {
309            doc.metadata.insert("producer".to_string(), producer);
310        }
311
312        if let Some(creation_date) = self.extract_metadata_field(&text, "CreationDate") {
313            doc.metadata
314                .insert("creation_date".to_string(), creation_date);
315        }
316
317        if let Some(mod_date) = self.extract_metadata_field(&text, "ModDate") {
318            doc.metadata
319                .insert("modification_date".to_string(), mod_date);
320        }
321
322        debug!(
323            title = ?doc.title,
324            author = ?doc.author,
325            metadata_count = doc.metadata.len(),
326            "Extracted PDF metadata"
327        );
328    }
329
330    /// Extract a metadata field value from raw PDF text.
331    fn extract_metadata_field(&self, text: &str, field: &str) -> Option<String> {
332        // Look for patterns like /Title (Value) or /Title <hex>
333        let pattern = format!("/{field}");
334        if let Some(pos) = text.find(&pattern) {
335            let after = &text[pos + pattern.len()..];
336
337            // Handle parentheses-enclosed values
338            if let Some(start) = after.find('(') {
339                let value_start = start + 1;
340                let mut depth = 1;
341                let mut end = value_start;
342
343                for c in after[value_start..].chars() {
344                    match c {
345                        '(' => depth += 1,
346                        ')' => {
347                            depth -= 1;
348                            if depth == 0 {
349                                break;
350                            }
351                        }
352                        _ => {}
353                    }
354                    end += c.len_utf8();
355                }
356
357                if end > value_start {
358                    let value = &after[value_start..end];
359                    let cleaned = value.trim().to_string();
360                    if !cleaned.is_empty() && cleaned.len() < 500 {
361                        return Some(cleaned);
362                    }
363                }
364            }
365        }
366        None
367    }
368
    /// Process extracted text to clean up formatting issues.
    ///
    /// Optionally re-flows suspected multi-column text, then joins wrapped
    /// lines into continuous prose: a line that does not start with an
    /// uppercase letter or digit is treated as a continuation of the
    /// previous one, and a trailing hyphen is removed to undo word-wrap
    /// hyphenation. Finally collapses whitespace via `normalize_whitespace`.
    ///
    /// NOTE(review): the continuation heuristic assumes prose where new
    /// sentences/headings begin with an uppercase letter or digit; it will
    /// over-join lines in lowercase-heavy text (e.g. code listings).
    fn process_text(&self, text: &str) -> String {
        let mut result = String::with_capacity(text.len());

        // Handle multi-column layout detection and reordering
        let processed = if self.config.handle_multi_column {
            self.handle_multi_column_text(text)
        } else {
            text.to_string()
        };

        // Clean up the text
        for line in processed.lines() {
            let trimmed = line.trim();

            // Blank line: emit at most one newline so paragraph breaks
            // survive without piling up.
            if trimmed.is_empty() {
                if !result.ends_with("\n\n") {
                    result.push('\n');
                }
                continue;
            }

            // Detect if this line appears to be a continuation of the
            // previous line (lowercase/non-digit start, mid-paragraph).
            let is_continuation = !result.is_empty()
                && !result.ends_with('\n')
                && !trimmed.starts_with(char::is_uppercase)
                && !trimmed.starts_with(|c: char| c.is_ascii_digit());

            if is_continuation {
                // Check if previous line ended with hyphen (word continuation)
                if result.ends_with('-') {
                    result.pop(); // Remove hyphen so the word halves rejoin
                } else {
                    result.push(' ');
                }
            } else if !result.is_empty() && !result.ends_with('\n') {
                // New sentence/heading: separator depends on configuration.
                if self.config.preserve_line_breaks {
                    result.push('\n');
                } else {
                    result.push(' ');
                }
            }

            result.push_str(trimmed);
        }

        // Normalize whitespace
        self.normalize_whitespace(&result)
    }
418
    /// Attempt to handle multi-column layouts by detecting and reordering text.
    ///
    /// Heuristic: when the average line length is under 60 characters and
    /// there are more than 20 lines, the text is treated as potentially
    /// multi-column and re-flowed into paragraphs (joining lines, undoing
    /// trailing-hyphen word wraps, breaking on blank lines or on a new
    /// uppercase/digit-initial line after a sentence-final period).
    /// Otherwise the text is returned unchanged.
    ///
    /// NOTE(review): this does not actually reorder columns; it only
    /// re-flows short lines into paragraphs. True column detection would
    /// need positional information the plain-text extraction does not carry.
    fn handle_multi_column_text(&self, text: &str) -> String {
        // Simple heuristic: if we detect short lines with consistent lengths,
        // it might be multi-column. We'll try to identify and merge columns.

        let lines: Vec<&str> = text.lines().collect();
        if lines.is_empty() {
            return String::new();
        }

        // Calculate average line length
        let total_len: usize = lines.iter().map(|l| l.len()).sum();
        let avg_len = total_len / lines.len().max(1);

        // If average line length is very short (< 60 chars) and we have many lines,
        // this might indicate multi-column layout
        if avg_len < 60 && lines.len() > 20 {
            debug!(
                avg_len,
                lines = lines.len(),
                "Detected potential multi-column layout"
            );

            // Try to detect column boundaries by looking for consistent indentation patterns
            // This is a simplified approach - full multi-column detection would be more complex
            let mut result = String::new();
            let mut current_paragraph = String::new();

            for line in lines {
                let trimmed = line.trim();

                // Blank line ends the current paragraph.
                if trimmed.is_empty() {
                    if !current_paragraph.is_empty() {
                        result.push_str(&current_paragraph);
                        result.push_str("\n\n");
                        current_paragraph.clear();
                    }
                    continue;
                }

                // Detect paragraph breaks: lines starting with caps or numbers after whitespace
                let is_new_para = !current_paragraph.is_empty()
                    && (trimmed.starts_with(char::is_uppercase)
                        || trimmed.starts_with(|c: char| c.is_ascii_digit()));

                // Only break when the previous paragraph looks complete
                // (ends with a period); otherwise keep accumulating.
                if is_new_para && current_paragraph.ends_with('.') {
                    result.push_str(&current_paragraph);
                    result.push_str("\n\n");
                    current_paragraph.clear();
                }

                if !current_paragraph.is_empty() {
                    // Check for hyphenation: drop a trailing hyphen so the
                    // wrapped word halves rejoin; otherwise separate with a space.
                    if current_paragraph.ends_with('-') {
                        current_paragraph.pop();
                    } else {
                        current_paragraph.push(' ');
                    }
                }
                current_paragraph.push_str(trimmed);
            }

            // Flush the final paragraph (no trailing blank line).
            if !current_paragraph.is_empty() {
                result.push_str(&current_paragraph);
            }

            result
        } else {
            text.to_string()
        }
    }
490
491    /// Normalize whitespace in text.
492    fn normalize_whitespace(&self, text: &str) -> String {
493        let mut result = String::with_capacity(text.len());
494        let mut prev_was_space = false;
495        let mut prev_was_newline = false;
496
497        for c in text.chars() {
498            if c == '\n' {
499                if !prev_was_newline {
500                    result.push('\n');
501                    prev_was_newline = true;
502                }
503                prev_was_space = false;
504            } else if c.is_whitespace() {
505                if !prev_was_space && !prev_was_newline {
506                    result.push(' ');
507                    prev_was_space = true;
508                }
509            } else {
510                result.push(c);
511                prev_was_space = false;
512                prev_was_newline = false;
513            }
514        }
515
516        result.trim().to_string()
517    }
518
519    /// Estimate page count from extracted text.
520    fn estimate_page_count(&self, text: &str) -> usize {
521        // Count form feed characters (page breaks)
522        let form_feeds = text.matches('\x0c').count();
523
524        if form_feeds > 0 {
525            form_feeds + 1
526        } else {
527            // Estimate based on character count (average ~3000 chars per page)
528            let chars = text.len();
529            (chars / 3000).max(1)
530        }
531    }
532
533    /// Create chunks from processed text.
534    fn create_chunks(&self, text: &str, page_count: usize) -> Vec<PdfChunk> {
535        let mut chunks = Vec::new();
536
537        // Split by form feeds first (actual page breaks)
538        let pages: Vec<&str> = text.split('\x0c').collect();
539
540        if pages.len() > 1 {
541            // We have actual page breaks
542            for (page_idx, page_text) in pages.iter().enumerate() {
543                let page_chunks = Self::chunk_text(page_text, self.config.max_chunk_size);
544                for (chunk_idx, chunk_content) in page_chunks.into_iter().enumerate() {
545                    if !chunk_content.trim().is_empty() {
546                        chunks.push(PdfChunk::new(chunk_content, page_idx + 1, chunk_idx));
547                    }
548                }
549            }
550        } else {
551            // No page breaks - distribute chunks across estimated pages
552            let all_chunks = Self::chunk_text(text, self.config.max_chunk_size);
553            let chunks_per_page = (all_chunks.len() / page_count).max(1);
554
555            for (idx, chunk_content) in all_chunks.into_iter().enumerate() {
556                if !chunk_content.trim().is_empty() {
557                    let page_number = (idx / chunks_per_page).min(page_count - 1) + 1;
558                    let chunk_index = idx % chunks_per_page;
559                    chunks.push(PdfChunk::new(chunk_content, page_number, chunk_index));
560                }
561            }
562        }
563
564        // Merge very small chunks with adjacent ones
565        self.merge_small_chunks(chunks)
566    }
567
568    /// Merge chunks that are too small.
569    fn merge_small_chunks(&self, chunks: Vec<PdfChunk>) -> Vec<PdfChunk> {
570        if chunks.is_empty() {
571            return chunks;
572        }
573
574        let mut result: Vec<PdfChunk> = Vec::new();
575
576        for chunk in chunks {
577            if chunk.content.len() < self.config.min_chunk_size {
578                // Try to merge with previous chunk if on same page
579                if let Some(last) = result.last_mut() {
580                    if last.page_number == chunk.page_number
581                        && last.content.len() + chunk.content.len() < self.config.max_chunk_size
582                    {
583                        last.content.push_str("\n\n");
584                        last.content.push_str(&chunk.content);
585                        continue;
586                    }
587                }
588            }
589            result.push(chunk);
590        }
591
592        result
593    }
594
595    /// Chunk text into segments of maximum size, breaking at paragraph or sentence boundaries.
596    ///
597    /// # Arguments
598    ///
599    /// * `text` - The text to chunk
600    /// * `max_chunk_size` - Maximum size of each chunk in characters
601    ///
602    /// # Returns
603    ///
604    /// A vector of text chunks.
605    pub fn chunk_text(text: &str, max_chunk_size: usize) -> Vec<String> {
606        if text.is_empty() {
607            return Vec::new();
608        }
609
610        if text.len() <= max_chunk_size {
611            return vec![text.to_string()];
612        }
613
614        let mut chunks = Vec::new();
615        let mut current_chunk = String::new();
616
617        // Split by paragraphs first (double newlines)
618        let paragraphs: Vec<&str> = text.split("\n\n").collect();
619
620        for para in paragraphs {
621            let para_trimmed = para.trim();
622            if para_trimmed.is_empty() {
623                continue;
624            }
625
626            // If adding this paragraph would exceed max size
627            if !current_chunk.is_empty()
628                && current_chunk.len() + para_trimmed.len() + 2 > max_chunk_size
629            {
630                // Save current chunk and start new one
631                chunks.push(current_chunk.trim().to_string());
632                current_chunk = String::new();
633            }
634
635            // If single paragraph is too large, split by sentences
636            if para_trimmed.len() > max_chunk_size {
637                if !current_chunk.is_empty() {
638                    chunks.push(current_chunk.trim().to_string());
639                    current_chunk = String::new();
640                }
641
642                let sentence_chunks = Self::chunk_by_sentences(para_trimmed, max_chunk_size);
643                chunks.extend(sentence_chunks);
644            } else {
645                if !current_chunk.is_empty() {
646                    current_chunk.push_str("\n\n");
647                }
648                current_chunk.push_str(para_trimmed);
649            }
650        }
651
652        if !current_chunk.trim().is_empty() {
653            chunks.push(current_chunk.trim().to_string());
654        }
655
656        chunks
657    }
658
659    /// Chunk text by sentence boundaries when paragraphs are too large.
660    fn chunk_by_sentences(text: &str, max_chunk_size: usize) -> Vec<String> {
661        let mut chunks = Vec::new();
662        let mut current_chunk = String::new();
663
664        // Simple sentence splitting on . ! ? followed by space or end
665        let sentence_endings = [". ", "! ", "? ", ".\n", "!\n", "?\n"];
666
667        let mut remaining = text;
668        while !remaining.is_empty() {
669            // Find the next sentence boundary
670            let mut best_split = remaining.len();
671
672            for ending in &sentence_endings {
673                if let Some(pos) = remaining.find(ending) {
674                    let split_pos = pos + ending.len();
675                    if split_pos < best_split {
676                        best_split = split_pos;
677                    }
678                }
679            }
680
681            let sentence = &remaining[..best_split];
682            remaining = &remaining[best_split..];
683
684            // Check if adding this sentence would exceed max size
685            if !current_chunk.is_empty() && current_chunk.len() + sentence.len() > max_chunk_size {
686                chunks.push(current_chunk.trim().to_string());
687                current_chunk = String::new();
688            }
689
690            // If single sentence is too large, force split at max_chunk_size
691            if sentence.len() > max_chunk_size {
692                if !current_chunk.is_empty() {
693                    chunks.push(current_chunk.trim().to_string());
694                    current_chunk = String::new();
695                }
696
697                let mut sent_remaining = sentence;
698                while !sent_remaining.is_empty() {
699                    let split_at = max_chunk_size.min(sent_remaining.len());
700                    // Try to split at word boundary
701                    let split_pos = if split_at < sent_remaining.len() {
702                        sent_remaining[..split_at]
703                            .rfind(' ')
704                            .map(|p| p + 1)
705                            .unwrap_or(split_at)
706                    } else {
707                        split_at
708                    };
709
710                    chunks.push(sent_remaining[..split_pos].trim().to_string());
711                    sent_remaining = &sent_remaining[split_pos..];
712                }
713            } else {
714                current_chunk.push_str(sentence);
715            }
716        }
717
718        if !current_chunk.trim().is_empty() {
719            chunks.push(current_chunk.trim().to_string());
720        }
721
722        chunks
723    }
724
725    /// Ingest all PDF files in a directory.
726    ///
727    /// # Arguments
728    ///
729    /// * `dir` - Directory path to search
730    /// * `recursive` - Whether to search subdirectories
731    ///
732    /// # Returns
733    ///
734    /// A vector of successfully parsed PDF documents.
735    ///
736    /// # Errors
737    ///
738    /// Returns an error if the directory cannot be read.
739    pub fn ingest_directory(&self, dir: &Path, recursive: bool) -> Result<Vec<PdfDocument>> {
740        info!(?dir, recursive, "Ingesting PDF files from directory");
741
742        if !dir.exists() {
743            return Err(Error::ingest(format!(
744                "Directory not found: {}",
745                dir.display()
746            )));
747        }
748
749        if !dir.is_dir() {
750            return Err(Error::ingest(format!(
751                "Path is not a directory: {}",
752                dir.display()
753            )));
754        }
755
756        let mut documents = Vec::new();
757        self.ingest_directory_recursive(dir, recursive, &mut documents)?;
758
759        info!(
760            dir = %dir.display(),
761            count = documents.len(),
762            "Directory ingestion complete"
763        );
764
765        Ok(documents)
766    }
767
768    /// Recursive helper for directory ingestion.
769    fn ingest_directory_recursive(
770        &self,
771        dir: &Path,
772        recursive: bool,
773        documents: &mut Vec<PdfDocument>,
774    ) -> Result<()> {
775        let entries = fs::read_dir(dir).map_err(|e| {
776            Error::ingest(format!("Failed to read directory {}: {}", dir.display(), e))
777        })?;
778
779        for entry in entries {
780            let entry = entry.map_err(|e| Error::ingest(format!("Failed to read entry: {}", e)))?;
781            let path = entry.path();
782
783            if path.is_dir() {
784                if recursive {
785                    self.ingest_directory_recursive(&path, recursive, documents)?;
786                }
787            } else if path.extension().and_then(|e| e.to_str()) == Some("pdf") {
788                match self.ingest_file(&path) {
789                    Ok(doc) => documents.push(doc),
790                    Err(e) => {
791                        warn!(path = %path.display(), error = %e, "Failed to ingest PDF");
792                    }
793                }
794            }
795        }
796
797        Ok(())
798    }
799}
800
801#[cfg(test)]
802mod tests {
803    use super::*;
804
    // Construction and chunking tests.

    #[test]
    fn test_pdf_ingester_creation() {
        // A fresh ingester picks up the module-level default chunk size.
        let ingester = PdfIngester::new();
        assert_eq!(ingester.config.max_chunk_size, DEFAULT_MAX_CHUNK_SIZE);
    }

    #[test]
    fn test_pdf_ingester_with_config() {
        // Builder methods should override the defaults they touch.
        let config = PdfIngesterConfig::default()
            .with_max_chunk_size(2000)
            .with_preserve_line_breaks(true);

        let ingester = PdfIngester::with_config(config);
        assert_eq!(ingester.config.max_chunk_size, 2000);
        assert!(ingester.config.preserve_line_breaks);
    }

    #[test]
    fn test_chunk_text_small() {
        // Text within the limit is returned as a single, unchanged chunk.
        let text = "This is a small text.";
        let chunks = PdfIngester::chunk_text(text, 4000);

        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0], text);
    }

    #[test]
    fn test_chunk_text_paragraphs() {
        // Paragraph boundaries (blank lines) are the preferred split points.
        let text = "First paragraph.\n\nSecond paragraph.\n\nThird paragraph.";
        let chunks = PdfIngester::chunk_text(text, 30);

        assert!(chunks.len() >= 2);
        assert!(chunks[0].contains("First"));
    }

    #[test]
    fn test_chunk_text_long_paragraph() {
        // A single over-long paragraph falls back to sentence-level splits.
        let text = "This is sentence one. This is sentence two. This is sentence three. This is sentence four. This is sentence five.";
        let chunks = PdfIngester::chunk_text(text, 50);

        assert!(chunks.len() > 1);
        for chunk in &chunks {
            assert!(chunk.len() <= 100); // Allow some overflow for sentence completion
        }
    }

    #[test]
    fn test_chunk_text_empty() {
        // Empty input produces no chunks at all.
        let chunks = PdfIngester::chunk_text("", 4000);
        assert!(chunks.is_empty());
    }
856
    // PdfDocument and PdfChunk accessor tests.

    #[test]
    fn test_pdf_document_new() {
        // A freshly constructed document is empty apart from its path.
        let doc = PdfDocument::new(PathBuf::from("test.pdf"));

        assert_eq!(doc.path, PathBuf::from("test.pdf"));
        assert!(doc.title.is_none());
        assert!(doc.author.is_none());
        assert!(doc.chunks.is_empty());
        assert_eq!(doc.page_count, 0);
    }

    #[test]
    fn test_pdf_document_full_text() {
        // full_text() should include every chunk's content.
        let mut doc = PdfDocument::new(PathBuf::from("test.pdf"));
        doc.chunks
            .push(PdfChunk::new("First chunk.".to_string(), 1, 0));
        doc.chunks
            .push(PdfChunk::new("Second chunk.".to_string(), 1, 1));

        let full = doc.full_text();
        assert!(full.contains("First chunk."));
        assert!(full.contains("Second chunk."));
    }

    #[test]
    fn test_pdf_document_total_chars() {
        // Character counts sum across chunks (5 + 5).
        let mut doc = PdfDocument::new(PathBuf::from("test.pdf"));
        doc.chunks.push(PdfChunk::new("Hello".to_string(), 1, 0));
        doc.chunks.push(PdfChunk::new("World".to_string(), 2, 0));

        assert_eq!(doc.total_chars(), 10);
    }

    #[test]
    fn test_pdf_document_chunks_for_page() {
        // Page filtering returns only the chunks on the requested page.
        let mut doc = PdfDocument::new(PathBuf::from("test.pdf"));
        doc.chunks
            .push(PdfChunk::new("Page 1 chunk 1".to_string(), 1, 0));
        doc.chunks
            .push(PdfChunk::new("Page 1 chunk 2".to_string(), 1, 1));
        doc.chunks
            .push(PdfChunk::new("Page 2 chunk 1".to_string(), 2, 0));

        let page1_chunks = doc.chunks_for_page(1);
        assert_eq!(page1_chunks.len(), 2);

        let page2_chunks = doc.chunks_for_page(2);
        assert_eq!(page2_chunks.len(), 1);
    }

    #[test]
    fn test_pdf_chunk_creation() {
        // Constructor stores all three fields verbatim.
        let chunk = PdfChunk::new("Content".to_string(), 5, 2);

        assert_eq!(chunk.content, "Content");
        assert_eq!(chunk.page_number, 5);
        assert_eq!(chunk.chunk_index, 2);
    }

    #[test]
    fn test_pdf_chunk_is_empty() {
        // Whitespace-only content counts as empty.
        let empty_chunk = PdfChunk::new("   \n\t  ".to_string(), 1, 0);
        assert!(empty_chunk.is_empty());

        let non_empty_chunk = PdfChunk::new("Content".to_string(), 1, 0);
        assert!(!non_empty_chunk.is_empty());
    }

    #[test]
    fn test_pdf_chunk_len() {
        let chunk = PdfChunk::new("Hello World".to_string(), 1, 0);
        assert_eq!(chunk.len(), 11);
    }
930
931    #[test]
932    fn test_normalize_whitespace() {
933        let ingester = PdfIngester::new();
934
935        let text = "Hello    world\n\n\n\nTest";
936        let normalized = ingester.normalize_whitespace(text);
937
938        assert!(!normalized.contains("    "));
939        assert!(!normalized.contains("\n\n\n"));
940    }
941
942    #[test]
943    fn test_estimate_page_count_with_form_feeds() {
944        let ingester = PdfIngester::new();
945
946        let text = "Page 1\x0cPage 2\x0cPage 3";
947        assert_eq!(ingester.estimate_page_count(text), 3);
948    }
949
950    #[test]
951    fn test_estimate_page_count_without_form_feeds() {
952        let ingester = PdfIngester::new();
953
954        // ~6000 chars should be ~2 pages
955        let text = "a".repeat(6000);
956        assert_eq!(ingester.estimate_page_count(&text), 2);
957    }
958
959    #[test]
960    fn test_extract_metadata_field() {
961        let ingester = PdfIngester::new();
962
963        let pdf_content = "/Title (Test Document)";
964        let title = ingester.extract_metadata_field(pdf_content, "Title");
965        assert_eq!(title, Some("Test Document".to_string()));
966    }
967
968    #[test]
969    fn test_extract_metadata_field_with_nested_parens() {
970        let ingester = PdfIngester::new();
971
972        let pdf_content = "/Author (John (Jack) Doe)";
973        let author = ingester.extract_metadata_field(pdf_content, "Author");
974        assert_eq!(author, Some("John (Jack) Doe".to_string()));
975    }
976
977    #[test]
978    fn test_extract_metadata_field_not_found() {
979        let ingester = PdfIngester::new();
980
981        let pdf_content = "/Title (Test)";
982        let author = ingester.extract_metadata_field(pdf_content, "Author");
983        assert!(author.is_none());
984    }
985
986    #[test]
987    fn test_ingest_file_not_found() {
988        let ingester = PdfIngester::new();
989        let result = ingester.ingest_file(Path::new("/nonexistent/path/file.pdf"));
990
991        assert!(result.is_err());
992        let err_msg = result.unwrap_err().to_string();
993        assert!(err_msg.contains("not found") || err_msg.contains("Ingest"));
994    }
995
996    #[test]
997    fn test_ingest_directory_not_found() {
998        let ingester = PdfIngester::new();
999        let result = ingester.ingest_directory(Path::new("/nonexistent/directory"), false);
1000
1001        assert!(result.is_err());
1002    }
1003
1004    #[test]
1005    fn test_config_builder() {
1006        let config = PdfIngesterConfig::default()
1007            .with_max_chunk_size(5000)
1008            .with_preserve_line_breaks(true)
1009            .with_multi_column_handling(false);
1010
1011        assert_eq!(config.max_chunk_size, 5000);
1012        assert!(config.preserve_line_breaks);
1013        assert!(!config.handle_multi_column);
1014    }
1015
1016    #[test]
1017    fn test_create_chunks_with_form_feeds() {
1018        let ingester = PdfIngester::new();
1019
1020        let text = "Page one content.\x0cPage two content.\x0cPage three content.";
1021        let chunks = ingester.create_chunks(text, 3);
1022
1023        assert_eq!(chunks.len(), 3);
1024        assert_eq!(chunks[0].page_number, 1);
1025        assert_eq!(chunks[1].page_number, 2);
1026        assert_eq!(chunks[2].page_number, 3);
1027    }
1028
1029    #[test]
1030    fn test_handle_multi_column_short_lines() {
1031        let ingester = PdfIngester::new();
1032
1033        // Simulate multi-column text with short lines
1034        let lines: Vec<String> = (0..30).map(|i| format!("Line {}", i)).collect();
1035        let text = lines.join("\n");
1036
1037        let processed = ingester.handle_multi_column_text(&text);
1038        // Should consolidate short lines
1039        assert!(processed.lines().count() < 30 || processed.contains(' '));
1040    }
1041
1042    #[test]
1043    fn test_merge_small_chunks() {
1044        let ingester = PdfIngester::new();
1045
1046        let chunks = vec![
1047            PdfChunk::new("Small".to_string(), 1, 0),
1048            PdfChunk::new("Also small".to_string(), 1, 1),
1049            PdfChunk::new(
1050                "A much longer chunk that should not be merged.".to_string(),
1051                1,
1052                2,
1053            ),
1054        ];
1055
1056        let merged = ingester.merge_small_chunks(chunks);
1057        // First two small chunks should be merged
1058        assert!(merged.len() <= 2);
1059    }
1060}