1use crate::error::{Error, Result};
24use serde::{Deserialize, Serialize};
25use std::collections::HashMap;
26use std::fs;
27use std::path::{Path, PathBuf};
28use tracing::{debug, info, warn};
29
/// Default upper bound, in bytes, for a single text chunk.
const DEFAULT_MAX_CHUNK_SIZE: usize = 4000;

/// Chunks smaller than this (in bytes) are merged into the preceding chunk.
const MIN_CHUNK_SIZE: usize = 100;
35
/// A parsed PDF document: extracted metadata plus its text split into chunks.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PdfDocument {
    /// Filesystem path this document was loaded from (or associated with).
    pub path: PathBuf,

    /// Document title from PDF metadata, if found.
    pub title: Option<String>,

    /// Document author from PDF metadata, if found.
    pub author: Option<String>,

    /// Raw metadata key/value pairs (title, author, creator, producer, dates).
    pub metadata: HashMap<String, String>,

    /// Extracted text split into page-tagged chunks.
    pub chunks: Vec<PdfChunk>,

    /// Page count — exact when form feeds delimit pages, otherwise estimated.
    pub page_count: usize,
}
57
58impl PdfDocument {
59 fn new(path: PathBuf) -> Self {
61 Self {
62 path,
63 title: None,
64 author: None,
65 metadata: HashMap::new(),
66 chunks: Vec::new(),
67 page_count: 0,
68 }
69 }
70
71 pub fn total_chars(&self) -> usize {
73 self.chunks.iter().map(|c| c.content.len()).sum()
74 }
75
76 pub fn full_text(&self) -> String {
78 self.chunks
79 .iter()
80 .map(|c| c.content.as_str())
81 .collect::<Vec<_>>()
82 .join("\n\n")
83 }
84
85 pub fn chunks_for_page(&self, page: usize) -> Vec<&PdfChunk> {
87 self.chunks
88 .iter()
89 .filter(|c| c.page_number == page)
90 .collect()
91 }
92}
93
/// A contiguous piece of extracted PDF text, tagged with its source page.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PdfChunk {
    /// The chunk's text content.
    pub content: String,

    /// 1-based page number this chunk belongs to.
    pub page_number: usize,

    /// 0-based index of this chunk within its page.
    pub chunk_index: usize,
}
106
107impl PdfChunk {
108 fn new(content: String, page_number: usize, chunk_index: usize) -> Self {
110 Self {
111 content,
112 page_number,
113 chunk_index,
114 }
115 }
116
117 pub fn is_empty(&self) -> bool {
119 self.content.trim().is_empty()
120 }
121
122 pub fn len(&self) -> usize {
124 self.content.len()
125 }
126}
127
/// Tuning knobs for PDF ingestion.
#[derive(Debug, Clone)]
pub struct PdfIngesterConfig {
    /// Maximum size of a single chunk, in bytes.
    pub max_chunk_size: usize,

    /// Keep line breaks between re-flowed lines instead of joining with spaces.
    pub preserve_line_breaks: bool,

    /// Attempt to detect and re-flow multi-column layouts.
    pub handle_multi_column: bool,

    /// Scan the raw bytes for PDF metadata fields (title, author, dates, ...).
    pub extract_metadata: bool,

    /// Chunks smaller than this (bytes) are merged into the previous chunk.
    pub min_chunk_size: usize,
}
146
147impl Default for PdfIngesterConfig {
148 fn default() -> Self {
149 Self {
150 max_chunk_size: DEFAULT_MAX_CHUNK_SIZE,
151 preserve_line_breaks: false,
152 handle_multi_column: true,
153 extract_metadata: true,
154 min_chunk_size: MIN_CHUNK_SIZE,
155 }
156 }
157}
158
159impl PdfIngesterConfig {
160 pub fn with_max_chunk_size(mut self, size: usize) -> Self {
162 self.max_chunk_size = size;
163 self
164 }
165
166 pub fn with_preserve_line_breaks(mut self, preserve: bool) -> Self {
168 self.preserve_line_breaks = preserve;
169 self
170 }
171
172 pub fn with_multi_column_handling(mut self, handle: bool) -> Self {
174 self.handle_multi_column = handle;
175 self
176 }
177}
178
/// Turns PDF files or byte buffers into [`PdfDocument`]s: extracts text and
/// metadata, re-flows the text, and splits it into chunks.
#[derive(Debug, Clone)]
pub struct PdfIngester {
    // Chunking / re-flow settings; private, set via `new` or `with_config`.
    config: PdfIngesterConfig,
}
184
impl Default for PdfIngester {
    /// Equivalent to [`PdfIngester::new`].
    fn default() -> Self {
        Self::new()
    }
}
190
191impl PdfIngester {
    /// Creates an ingester with the default configuration.
    pub fn new() -> Self {
        Self {
            config: PdfIngesterConfig::default(),
        }
    }

    /// Creates an ingester with an explicit configuration.
    pub fn with_config(config: PdfIngesterConfig) -> Self {
        Self { config }
    }
203
    /// Ingests a single PDF file from disk.
    ///
    /// Returns an ingest error when the file does not exist or cannot be read.
    /// A non-`.pdf` extension is only warned about, not rejected — the content
    /// decides, not the name.
    pub fn ingest_file(&self, path: &Path) -> Result<PdfDocument> {
        info!(?path, "Ingesting PDF file");

        if !path.exists() {
            return Err(Error::ingest(format!("File not found: {}", path.display())));
        }

        // Advisory check only; ingestion proceeds regardless.
        if path.extension().and_then(|e| e.to_str()) != Some("pdf") {
            warn!(?path, "File does not have .pdf extension");
        }

        let data = fs::read(path).map_err(|e| {
            Error::ingest(format!("Failed to read PDF file {}: {}", path.display(), e))
        })?;

        self.ingest_bytes(&data, path.to_path_buf())
    }
236
    /// Ingests a PDF from an in-memory byte buffer.
    ///
    /// `path` is only recorded on the resulting document and used in error/log
    /// messages — nothing is read from disk here. Text extraction is delegated
    /// to `pdf_extract`; extraction failures surface as ingest errors.
    pub fn ingest_bytes(&self, data: &[u8], path: PathBuf) -> Result<PdfDocument> {
        let mut doc = PdfDocument::new(path.clone());

        let text = pdf_extract::extract_text_from_mem(data).map_err(|e| {
            Error::ingest(format!(
                "Failed to extract text from PDF {}: {}",
                path.display(),
                e
            ))
        })?;

        if self.config.extract_metadata {
            self.extract_metadata_from_bytes(data, &mut doc);
        }

        let processed_text = self.process_text(&text);

        // Estimate from the raw extracted text: processing trims whitespace,
        // which can strip the form-feed page separators.
        doc.page_count = self.estimate_page_count(&text);

        doc.chunks = self.create_chunks(&processed_text, doc.page_count);

        info!(
            path = %path.display(),
            pages = doc.page_count,
            chunks = doc.chunks.len(),
            chars = doc.total_chars(),
            "PDF ingestion complete"
        );

        Ok(doc)
    }
284
285 fn extract_metadata_from_bytes(&self, data: &[u8], doc: &mut PdfDocument) {
287 let text = String::from_utf8_lossy(data);
290
291 if let Some(title) = self.extract_metadata_field(&text, "Title") {
293 doc.title = Some(title);
294 doc.metadata
295 .insert("title".to_string(), doc.title.clone().unwrap_or_default());
296 }
297
298 if let Some(author) = self.extract_metadata_field(&text, "Author") {
299 doc.author = Some(author);
300 doc.metadata
301 .insert("author".to_string(), doc.author.clone().unwrap_or_default());
302 }
303
304 if let Some(creator) = self.extract_metadata_field(&text, "Creator") {
305 doc.metadata.insert("creator".to_string(), creator);
306 }
307
308 if let Some(producer) = self.extract_metadata_field(&text, "Producer") {
309 doc.metadata.insert("producer".to_string(), producer);
310 }
311
312 if let Some(creation_date) = self.extract_metadata_field(&text, "CreationDate") {
313 doc.metadata
314 .insert("creation_date".to_string(), creation_date);
315 }
316
317 if let Some(mod_date) = self.extract_metadata_field(&text, "ModDate") {
318 doc.metadata
319 .insert("modification_date".to_string(), mod_date);
320 }
321
322 debug!(
323 title = ?doc.title,
324 author = ?doc.author,
325 metadata_count = doc.metadata.len(),
326 "Extracted PDF metadata"
327 );
328 }
329
330 fn extract_metadata_field(&self, text: &str, field: &str) -> Option<String> {
332 let pattern = format!("/{field}");
334 if let Some(pos) = text.find(&pattern) {
335 let after = &text[pos + pattern.len()..];
336
337 if let Some(start) = after.find('(') {
339 let value_start = start + 1;
340 let mut depth = 1;
341 let mut end = value_start;
342
343 for c in after[value_start..].chars() {
344 match c {
345 '(' => depth += 1,
346 ')' => {
347 depth -= 1;
348 if depth == 0 {
349 break;
350 }
351 }
352 _ => {}
353 }
354 end += c.len_utf8();
355 }
356
357 if end > value_start {
358 let value = &after[value_start..end];
359 let cleaned = value.trim().to_string();
360 if !cleaned.is_empty() && cleaned.len() < 500 {
361 return Some(cleaned);
362 }
363 }
364 }
365 }
366 None
367 }
368
    /// Re-flows raw extracted text: joins hard-wrapped lines, repairs words
    /// hyphenated across line breaks, and normalizes whitespace. Multi-column
    /// handling runs first when enabled in the config.
    fn process_text(&self, text: &str) -> String {
        let mut result = String::with_capacity(text.len());

        let processed = if self.config.handle_multi_column {
            self.handle_multi_column_text(text)
        } else {
            text.to_string()
        };

        for line in processed.lines() {
            let trimmed = line.trim();

            // Blank line: emit at most one newline (normalize_whitespace later
            // collapses newline runs anyway).
            if trimmed.is_empty() {
                if !result.ends_with("\n\n") {
                    result.push('\n');
                }
                continue;
            }

            // Heuristic: a line that does not start with an uppercase letter or
            // a digit continues the previous sentence.
            let is_continuation = !result.is_empty()
                && !result.ends_with('\n')
                && !trimmed.starts_with(char::is_uppercase)
                && !trimmed.starts_with(|c: char| c.is_ascii_digit());

            if is_continuation {
                if result.ends_with('-') {
                    // Re-join a word hyphenated across the line break.
                    result.pop();
                } else {
                    result.push(' ');
                }
            } else if !result.is_empty() && !result.ends_with('\n') {
                if self.config.preserve_line_breaks {
                    result.push('\n');
                } else {
                    result.push(' ');
                }
            }

            result.push_str(trimmed);
        }

        self.normalize_whitespace(&result)
    }
418
419 fn handle_multi_column_text(&self, text: &str) -> String {
421 let lines: Vec<&str> = text.lines().collect();
425 if lines.is_empty() {
426 return String::new();
427 }
428
429 let total_len: usize = lines.iter().map(|l| l.len()).sum();
431 let avg_len = total_len / lines.len().max(1);
432
433 if avg_len < 60 && lines.len() > 20 {
436 debug!(
437 avg_len,
438 lines = lines.len(),
439 "Detected potential multi-column layout"
440 );
441
442 let mut result = String::new();
445 let mut current_paragraph = String::new();
446
447 for line in lines {
448 let trimmed = line.trim();
449
450 if trimmed.is_empty() {
451 if !current_paragraph.is_empty() {
452 result.push_str(¤t_paragraph);
453 result.push_str("\n\n");
454 current_paragraph.clear();
455 }
456 continue;
457 }
458
459 let is_new_para = !current_paragraph.is_empty()
461 && (trimmed.starts_with(char::is_uppercase)
462 || trimmed.starts_with(|c: char| c.is_ascii_digit()));
463
464 if is_new_para && current_paragraph.ends_with('.') {
465 result.push_str(¤t_paragraph);
466 result.push_str("\n\n");
467 current_paragraph.clear();
468 }
469
470 if !current_paragraph.is_empty() {
471 if current_paragraph.ends_with('-') {
473 current_paragraph.pop();
474 } else {
475 current_paragraph.push(' ');
476 }
477 }
478 current_paragraph.push_str(trimmed);
479 }
480
481 if !current_paragraph.is_empty() {
482 result.push_str(¤t_paragraph);
483 }
484
485 result
486 } else {
487 text.to_string()
488 }
489 }
490
491 fn normalize_whitespace(&self, text: &str) -> String {
493 let mut result = String::with_capacity(text.len());
494 let mut prev_was_space = false;
495 let mut prev_was_newline = false;
496
497 for c in text.chars() {
498 if c == '\n' {
499 if !prev_was_newline {
500 result.push('\n');
501 prev_was_newline = true;
502 }
503 prev_was_space = false;
504 } else if c.is_whitespace() {
505 if !prev_was_space && !prev_was_newline {
506 result.push(' ');
507 prev_was_space = true;
508 }
509 } else {
510 result.push(c);
511 prev_was_space = false;
512 prev_was_newline = false;
513 }
514 }
515
516 result.trim().to_string()
517 }
518
519 fn estimate_page_count(&self, text: &str) -> usize {
521 let form_feeds = text.matches('\x0c').count();
523
524 if form_feeds > 0 {
525 form_feeds + 1
526 } else {
527 let chars = text.len();
529 (chars / 3000).max(1)
530 }
531 }
532
533 fn create_chunks(&self, text: &str, page_count: usize) -> Vec<PdfChunk> {
535 let mut chunks = Vec::new();
536
537 let pages: Vec<&str> = text.split('\x0c').collect();
539
540 if pages.len() > 1 {
541 for (page_idx, page_text) in pages.iter().enumerate() {
543 let page_chunks = Self::chunk_text(page_text, self.config.max_chunk_size);
544 for (chunk_idx, chunk_content) in page_chunks.into_iter().enumerate() {
545 if !chunk_content.trim().is_empty() {
546 chunks.push(PdfChunk::new(chunk_content, page_idx + 1, chunk_idx));
547 }
548 }
549 }
550 } else {
551 let all_chunks = Self::chunk_text(text, self.config.max_chunk_size);
553 let chunks_per_page = (all_chunks.len() / page_count).max(1);
554
555 for (idx, chunk_content) in all_chunks.into_iter().enumerate() {
556 if !chunk_content.trim().is_empty() {
557 let page_number = (idx / chunks_per_page).min(page_count - 1) + 1;
558 let chunk_index = idx % chunks_per_page;
559 chunks.push(PdfChunk::new(chunk_content, page_number, chunk_index));
560 }
561 }
562 }
563
564 self.merge_small_chunks(chunks)
566 }
567
568 fn merge_small_chunks(&self, chunks: Vec<PdfChunk>) -> Vec<PdfChunk> {
570 if chunks.is_empty() {
571 return chunks;
572 }
573
574 let mut result: Vec<PdfChunk> = Vec::new();
575
576 for chunk in chunks {
577 if chunk.content.len() < self.config.min_chunk_size {
578 if let Some(last) = result.last_mut() {
580 if last.page_number == chunk.page_number
581 && last.content.len() + chunk.content.len() < self.config.max_chunk_size
582 {
583 last.content.push_str("\n\n");
584 last.content.push_str(&chunk.content);
585 continue;
586 }
587 }
588 }
589 result.push(chunk);
590 }
591
592 result
593 }
594
595 pub fn chunk_text(text: &str, max_chunk_size: usize) -> Vec<String> {
606 if text.is_empty() {
607 return Vec::new();
608 }
609
610 if text.len() <= max_chunk_size {
611 return vec![text.to_string()];
612 }
613
614 let mut chunks = Vec::new();
615 let mut current_chunk = String::new();
616
617 let paragraphs: Vec<&str> = text.split("\n\n").collect();
619
620 for para in paragraphs {
621 let para_trimmed = para.trim();
622 if para_trimmed.is_empty() {
623 continue;
624 }
625
626 if !current_chunk.is_empty()
628 && current_chunk.len() + para_trimmed.len() + 2 > max_chunk_size
629 {
630 chunks.push(current_chunk.trim().to_string());
632 current_chunk = String::new();
633 }
634
635 if para_trimmed.len() > max_chunk_size {
637 if !current_chunk.is_empty() {
638 chunks.push(current_chunk.trim().to_string());
639 current_chunk = String::new();
640 }
641
642 let sentence_chunks = Self::chunk_by_sentences(para_trimmed, max_chunk_size);
643 chunks.extend(sentence_chunks);
644 } else {
645 if !current_chunk.is_empty() {
646 current_chunk.push_str("\n\n");
647 }
648 current_chunk.push_str(para_trimmed);
649 }
650 }
651
652 if !current_chunk.trim().is_empty() {
653 chunks.push(current_chunk.trim().to_string());
654 }
655
656 chunks
657 }
658
659 fn chunk_by_sentences(text: &str, max_chunk_size: usize) -> Vec<String> {
661 let mut chunks = Vec::new();
662 let mut current_chunk = String::new();
663
664 let sentence_endings = [". ", "! ", "? ", ".\n", "!\n", "?\n"];
666
667 let mut remaining = text;
668 while !remaining.is_empty() {
669 let mut best_split = remaining.len();
671
672 for ending in &sentence_endings {
673 if let Some(pos) = remaining.find(ending) {
674 let split_pos = pos + ending.len();
675 if split_pos < best_split {
676 best_split = split_pos;
677 }
678 }
679 }
680
681 let sentence = &remaining[..best_split];
682 remaining = &remaining[best_split..];
683
684 if !current_chunk.is_empty() && current_chunk.len() + sentence.len() > max_chunk_size {
686 chunks.push(current_chunk.trim().to_string());
687 current_chunk = String::new();
688 }
689
690 if sentence.len() > max_chunk_size {
692 if !current_chunk.is_empty() {
693 chunks.push(current_chunk.trim().to_string());
694 current_chunk = String::new();
695 }
696
697 let mut sent_remaining = sentence;
698 while !sent_remaining.is_empty() {
699 let split_at = max_chunk_size.min(sent_remaining.len());
700 let split_pos = if split_at < sent_remaining.len() {
702 sent_remaining[..split_at]
703 .rfind(' ')
704 .map(|p| p + 1)
705 .unwrap_or(split_at)
706 } else {
707 split_at
708 };
709
710 chunks.push(sent_remaining[..split_pos].trim().to_string());
711 sent_remaining = &sent_remaining[split_pos..];
712 }
713 } else {
714 current_chunk.push_str(sentence);
715 }
716 }
717
718 if !current_chunk.trim().is_empty() {
719 chunks.push(current_chunk.trim().to_string());
720 }
721
722 chunks
723 }
724
    /// Ingests every `.pdf` file in `dir`, optionally recursing into
    /// subdirectories.
    ///
    /// Errors only when `dir` is missing, not a directory, or unreadable;
    /// individual files that fail to ingest are logged and skipped.
    pub fn ingest_directory(&self, dir: &Path, recursive: bool) -> Result<Vec<PdfDocument>> {
        info!(?dir, recursive, "Ingesting PDF files from directory");

        if !dir.exists() {
            return Err(Error::ingest(format!(
                "Directory not found: {}",
                dir.display()
            )));
        }

        if !dir.is_dir() {
            return Err(Error::ingest(format!(
                "Path is not a directory: {}",
                dir.display()
            )));
        }

        let mut documents = Vec::new();
        self.ingest_directory_recursive(dir, recursive, &mut documents)?;

        info!(
            dir = %dir.display(),
            count = documents.len(),
            "Directory ingestion complete"
        );

        Ok(documents)
    }
767
    /// Walks `dir`, appending each successfully ingested `.pdf` file to
    /// `documents`; descends into subdirectories only when `recursive` is set.
    ///
    /// Per-file ingest failures are logged with `warn!` and skipped so one bad
    /// PDF cannot abort a whole directory scan; directory/entry read errors
    /// propagate as ingest errors.
    fn ingest_directory_recursive(
        &self,
        dir: &Path,
        recursive: bool,
        documents: &mut Vec<PdfDocument>,
    ) -> Result<()> {
        let entries = fs::read_dir(dir).map_err(|e| {
            Error::ingest(format!("Failed to read directory {}: {}", dir.display(), e))
        })?;

        for entry in entries {
            let entry = entry.map_err(|e| Error::ingest(format!("Failed to read entry: {}", e)))?;
            let path = entry.path();

            if path.is_dir() {
                if recursive {
                    self.ingest_directory_recursive(&path, recursive, documents)?;
                }
            } else if path.extension().and_then(|e| e.to_str()) == Some("pdf") {
                match self.ingest_file(&path) {
                    Ok(doc) => documents.push(doc),
                    Err(e) => {
                        // Best-effort: skip unreadable/corrupt PDFs.
                        warn!(path = %path.display(), error = %e, "Failed to ingest PDF");
                    }
                }
            }
        }

        Ok(())
    }
799}
800
#[cfg(test)]
mod tests {
    //! Unit tests covering chunking, metadata-field parsing, whitespace
    //! normalization, and the page-count / multi-column heuristics. No real
    //! PDF files are needed: everything operates on plain strings.
    use super::*;

    #[test]
    fn test_pdf_ingester_creation() {
        let ingester = PdfIngester::new();
        assert_eq!(ingester.config.max_chunk_size, DEFAULT_MAX_CHUNK_SIZE);
    }

    #[test]
    fn test_pdf_ingester_with_config() {
        let config = PdfIngesterConfig::default()
            .with_max_chunk_size(2000)
            .with_preserve_line_breaks(true);

        let ingester = PdfIngester::with_config(config);
        assert_eq!(ingester.config.max_chunk_size, 2000);
        assert!(ingester.config.preserve_line_breaks);
    }

    #[test]
    fn test_chunk_text_small() {
        // Text under the limit comes back as a single, unmodified chunk.
        let text = "This is a small text.";
        let chunks = PdfIngester::chunk_text(text, 4000);

        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0], text);
    }

    #[test]
    fn test_chunk_text_paragraphs() {
        let text = "First paragraph.\n\nSecond paragraph.\n\nThird paragraph.";
        let chunks = PdfIngester::chunk_text(text, 30);

        assert!(chunks.len() >= 2);
        assert!(chunks[0].contains("First"));
    }

    #[test]
    fn test_chunk_text_long_paragraph() {
        let text = "This is sentence one. This is sentence two. This is sentence three. This is sentence four. This is sentence five.";
        let chunks = PdfIngester::chunk_text(text, 50);

        assert!(chunks.len() > 1);
        for chunk in &chunks {
            // Generous bound; chunks should stay near the 50-byte target.
            assert!(chunk.len() <= 100);
        }
    }

    #[test]
    fn test_chunk_text_empty() {
        let chunks = PdfIngester::chunk_text("", 4000);
        assert!(chunks.is_empty());
    }

    #[test]
    fn test_pdf_document_new() {
        let doc = PdfDocument::new(PathBuf::from("test.pdf"));

        assert_eq!(doc.path, PathBuf::from("test.pdf"));
        assert!(doc.title.is_none());
        assert!(doc.author.is_none());
        assert!(doc.chunks.is_empty());
        assert_eq!(doc.page_count, 0);
    }

    #[test]
    fn test_pdf_document_full_text() {
        let mut doc = PdfDocument::new(PathBuf::from("test.pdf"));
        doc.chunks
            .push(PdfChunk::new("First chunk.".to_string(), 1, 0));
        doc.chunks
            .push(PdfChunk::new("Second chunk.".to_string(), 1, 1));

        let full = doc.full_text();
        assert!(full.contains("First chunk."));
        assert!(full.contains("Second chunk."));
    }

    #[test]
    fn test_pdf_document_total_chars() {
        let mut doc = PdfDocument::new(PathBuf::from("test.pdf"));
        doc.chunks.push(PdfChunk::new("Hello".to_string(), 1, 0));
        doc.chunks.push(PdfChunk::new("World".to_string(), 2, 0));

        assert_eq!(doc.total_chars(), 10);
    }

    #[test]
    fn test_pdf_document_chunks_for_page() {
        let mut doc = PdfDocument::new(PathBuf::from("test.pdf"));
        doc.chunks
            .push(PdfChunk::new("Page 1 chunk 1".to_string(), 1, 0));
        doc.chunks
            .push(PdfChunk::new("Page 1 chunk 2".to_string(), 1, 1));
        doc.chunks
            .push(PdfChunk::new("Page 2 chunk 1".to_string(), 2, 0));

        let page1_chunks = doc.chunks_for_page(1);
        assert_eq!(page1_chunks.len(), 2);

        let page2_chunks = doc.chunks_for_page(2);
        assert_eq!(page2_chunks.len(), 1);
    }

    #[test]
    fn test_pdf_chunk_creation() {
        let chunk = PdfChunk::new("Content".to_string(), 5, 2);

        assert_eq!(chunk.content, "Content");
        assert_eq!(chunk.page_number, 5);
        assert_eq!(chunk.chunk_index, 2);
    }

    #[test]
    fn test_pdf_chunk_is_empty() {
        // Whitespace-only content counts as empty.
        let empty_chunk = PdfChunk::new("   \n\t  ".to_string(), 1, 0);
        assert!(empty_chunk.is_empty());

        let non_empty_chunk = PdfChunk::new("Content".to_string(), 1, 0);
        assert!(!non_empty_chunk.is_empty());
    }

    #[test]
    fn test_pdf_chunk_len() {
        let chunk = PdfChunk::new("Hello World".to_string(), 1, 0);
        assert_eq!(chunk.len(), 11);
    }

    #[test]
    fn test_normalize_whitespace() {
        let ingester = PdfIngester::new();

        let text = "Hello    world\n\n\n\nTest";
        let normalized = ingester.normalize_whitespace(text);

        assert!(!normalized.contains("  "));
        assert!(!normalized.contains("\n\n\n"));
    }

    #[test]
    fn test_estimate_page_count_with_form_feeds() {
        let ingester = PdfIngester::new();

        // Two form feeds separate three pages.
        let text = "Page 1\x0cPage 2\x0cPage 3";
        assert_eq!(ingester.estimate_page_count(text), 3);
    }

    #[test]
    fn test_estimate_page_count_without_form_feeds() {
        let ingester = PdfIngester::new();

        // 6000 bytes at ~3000 bytes/page => 2 pages.
        let text = "a".repeat(6000);
        assert_eq!(ingester.estimate_page_count(&text), 2);
    }

    #[test]
    fn test_extract_metadata_field() {
        let ingester = PdfIngester::new();

        let pdf_content = "/Title (Test Document)";
        let title = ingester.extract_metadata_field(pdf_content, "Title");
        assert_eq!(title, Some("Test Document".to_string()));
    }

    #[test]
    fn test_extract_metadata_field_with_nested_parens() {
        let ingester = PdfIngester::new();

        let pdf_content = "/Author (John (Jack) Doe)";
        let author = ingester.extract_metadata_field(pdf_content, "Author");
        assert_eq!(author, Some("John (Jack) Doe".to_string()));
    }

    #[test]
    fn test_extract_metadata_field_not_found() {
        let ingester = PdfIngester::new();

        let pdf_content = "/Title (Test)";
        let author = ingester.extract_metadata_field(pdf_content, "Author");
        assert!(author.is_none());
    }

    #[test]
    fn test_ingest_file_not_found() {
        let ingester = PdfIngester::new();
        let result = ingester.ingest_file(Path::new("/nonexistent/path/file.pdf"));

        assert!(result.is_err());
        let err_msg = result.unwrap_err().to_string();
        assert!(err_msg.contains("not found") || err_msg.contains("Ingest"));
    }

    #[test]
    fn test_ingest_directory_not_found() {
        let ingester = PdfIngester::new();
        let result = ingester.ingest_directory(Path::new("/nonexistent/directory"), false);

        assert!(result.is_err());
    }

    #[test]
    fn test_config_builder() {
        let config = PdfIngesterConfig::default()
            .with_max_chunk_size(5000)
            .with_preserve_line_breaks(true)
            .with_multi_column_handling(false);

        assert_eq!(config.max_chunk_size, 5000);
        assert!(config.preserve_line_breaks);
        assert!(!config.handle_multi_column);
    }

    #[test]
    fn test_create_chunks_with_form_feeds() {
        let ingester = PdfIngester::new();

        // Form feeds delimit real pages, so page numbers are exact.
        let text = "Page one content.\x0cPage two content.\x0cPage three content.";
        let chunks = ingester.create_chunks(text, 3);

        assert_eq!(chunks.len(), 3);
        assert_eq!(chunks[0].page_number, 1);
        assert_eq!(chunks[1].page_number, 2);
        assert_eq!(chunks[2].page_number, 3);
    }

    #[test]
    fn test_handle_multi_column_short_lines() {
        let ingester = PdfIngester::new();

        // 30 short lines trips the multi-column heuristic (avg < 60, > 20 lines).
        let lines: Vec<String> = (0..30).map(|i| format!("Line {}", i)).collect();
        let text = lines.join("\n");

        let processed = ingester.handle_multi_column_text(&text);
        assert!(processed.lines().count() < 30 || processed.contains(' '));
    }

    #[test]
    fn test_merge_small_chunks() {
        let ingester = PdfIngester::new();

        let chunks = vec![
            PdfChunk::new("Small".to_string(), 1, 0),
            PdfChunk::new("Also small".to_string(), 1, 1),
            PdfChunk::new(
                "A much longer chunk that should not be merged.".to_string(),
                1,
                2,
            ),
        ];

        let merged = ingester.merge_small_chunks(chunks);
        assert!(merged.len() <= 2);
    }
}