1use super::{
6 ColumnLayout, DocumentLayout, DocumentSection, DocumentType, LayoutAnalysisConfig,
7 LayoutAnalyzer, SectionType,
8};
9use crate::{RragError, RragResult};
10use serde::{Deserialize, Serialize};
11use std::collections::HashMap;
12use std::path::Path;
13
/// Layout analyzer that pairs a configuration with one helper component per
/// analysis stage (structure, sections, reading order, columns, pages).
pub struct DefaultLayoutAnalyzer {
    /// Analysis configuration. Its `detect_structure`, `identify_sections`,
    /// `extract_reading_order` and `detect_columns` flags gate the stages run
    /// by `analyze_layout_comprehensive`.
    config: LayoutAnalysisConfig,

    /// Produces the `DocumentStructure` for extracted content.
    structure_detector: StructureDetector,

    /// Splits extracted content into `DocumentSection`s.
    section_identifier: SectionIdentifier,

    /// Orders the identified sections for reading.
    reading_order_analyzer: ReadingOrderAnalyzer,

    /// Decides whether the content uses a multi-column layout.
    column_detector: ColumnDetector,

    /// Produces page-level information; its page count becomes `DocumentLayout::pages`.
    page_analyzer: PageAnalyzer,
}
34
/// Component that reports the structure of a document.
///
/// All three fields are initialized empty by `StructureDetector::new` and are
/// not read by the code visible in this file.
pub struct StructureDetector {
    /// Registered hierarchy patterns.
    hierarchy_patterns: Vec<HierarchyPattern>,

    /// Classifiers keyed by document type.
    type_classifiers: HashMap<DocumentType, TypeClassifier>,

    /// Registered layout rules.
    layout_rules: Vec<LayoutRule>,
}
46
/// Component that turns document content into a list of sections.
///
/// The fields are initialized empty by `SectionIdentifier::new` and are not
/// read by the code visible in this file.
pub struct SectionIdentifier {
    /// Section patterns keyed by document type.
    section_patterns: HashMap<DocumentType, Vec<SectionPattern>>,

    /// Registered header rules.
    header_rules: Vec<HeaderRule>,

    /// Classifier for section content.
    content_classifier: ContentClassifier,
}
58
/// Component that determines the order in which sections should be read.
///
/// The fields are initialized empty by `ReadingOrderAnalyzer::new` and are not
/// read by the code visible in this file.
pub struct ReadingOrderAnalyzer {
    /// Reading strategies keyed by layout type.
    strategies: HashMap<LayoutType, ReadingStrategy>,

    /// Flow detector helper.
    flow_detector: FlowDetector,

    /// Region analyzer helper.
    region_analyzer: RegionAnalyzer,
}
70
/// Component that detects multi-column layouts.
///
/// The fields are initialized empty by `ColumnDetector::new` and are not read
/// by the code visible in this file; `detect_columns` currently relies only on
/// page width and text length.
pub struct ColumnDetector {
    /// Registered column-detection algorithms.
    algorithms: Vec<ColumnDetectionAlgorithm>,

    /// Layout classifier helper.
    layout_classifier: LayoutClassifier,

    /// Spacing analyzer helper.
    spacing_analyzer: SpacingAnalyzer,
}
82
/// Component that produces page-level analysis (`PageAnalysis`).
///
/// The fields are initialized empty by `PageAnalyzer::new` and are not read by
/// the code visible in this file.
pub struct PageAnalyzer {
    /// Page classifiers keyed by document type.
    classifiers: HashMap<DocumentType, PageClassifier>,

    /// Content-distribution analyzer helper.
    distribution_analyzer: ContentDistributionAnalyzer,

    /// Margin detector helper.
    margin_detector: MarginDetector,
}
94
/// Outcome of `DefaultLayoutAnalyzer::analyze_layout_comprehensive`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LayoutAnalysisResult {
    /// The assembled document layout (pages, sections, reading order, columns, type).
    pub layout: DocumentLayout,

    /// Overall confidence as computed by `calculate_analysis_confidence`; capped at 1.0.
    pub confidence: f32,

    /// Wall-clock duration of the analysis call, in milliseconds.
    pub processing_time_ms: u64,

    /// Quantitative layout metrics.
    pub metrics: LayoutMetrics,

    /// Qualitative layout features.
    pub features: LayoutFeatures,

    /// Warning messages; the analysis visible in this file always leaves this empty.
    pub warnings: Vec<String>,
}
116
/// Numeric metrics produced by `calculate_layout_metrics`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LayoutMetrics {
    /// Text length (in bytes) divided by the page area (`page_width * page_height`).
    pub text_density: f32,

    /// One minus the estimated text coverage, where coverage is
    /// `text_len * 0.01 / page_area` capped at 1.0.
    pub white_space_ratio: f32,

    /// Column balance score; 1.0 when no column layout was detected.
    pub column_balance: f32,

    /// Reading-flow score in `[0, 1]`; 0.5 when the reading order and the
    /// section list differ in length.
    pub reading_flow_score: f32,

    /// Organization score, at most 1.0; 0.0 when there are no sections.
    pub organization_score: f32,
}
135
/// Qualitative features produced by `extract_layout_features`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LayoutFeatures {
    /// True when the content reports headers or footers (either one suffices).
    pub has_headers_footers: bool,

    /// True when a column layout was detected.
    pub has_columns: bool,

    /// True when at least one section has a level greater than 1.
    pub has_nested_sections: bool,

    /// True when both fonts and spacing are reported as consistent.
    pub consistent_formatting: bool,

    /// Text/visual balance classification derived from the text length.
    pub content_balance: ContentBalance,

    /// Complexity bucket derived from sections, columns, nesting depth and text length.
    pub complexity_level: ComplexityLevel,
}
157
/// Description of a pattern used for hierarchy detection.
///
/// NOTE(review): no code visible in this file constructs or reads this type,
/// so the field semantics below are inferred from names only — confirm
/// against the real detector.
#[derive(Debug, Clone)]
pub struct HierarchyPattern {
    /// Identifier of the pattern.
    pub id: String,

    /// Pattern text (presumably a regex or literal — TODO confirm).
    pub pattern: String,

    /// Hierarchy level associated with the pattern.
    pub level: usize,

    /// Weight of the pattern (scale not shown in this file).
    pub weight: f32,

    /// Document types the pattern is meant for.
    pub applicable_types: Vec<DocumentType>,
}
176
/// Per-document-type classifier stored in `StructureDetector::type_classifiers`.
///
/// Never constructed or read by the code visible in this file.
pub struct TypeClassifier {
    /// Classification rules.
    rules: Vec<ClassificationRule>,

    /// Feature extractors.
    feature_extractors: Vec<FeatureExtractor>,

    /// Confidence threshold (how it is applied is not shown in this file).
    confidence_threshold: f32,
}
188
/// A named condition/action pair stored in `StructureDetector::layout_rules`.
///
/// No rule evaluation is visible in this file.
#[derive(Debug, Clone)]
pub struct LayoutRule {
    /// Rule name.
    pub name: String,

    /// Condition attached to the rule.
    pub condition: RuleCondition,

    /// Action attached to the rule.
    pub action: RuleAction,

    /// Rule priority (ordering direction is not shown in this file — TODO confirm).
    pub priority: u32,
}
204
/// Pattern set associated with one section type; stored per document type in
/// `SectionIdentifier::section_patterns`.
///
/// Not constructed or read by the code visible in this file.
#[derive(Debug, Clone)]
pub struct SectionPattern {
    /// Section type the patterns belong to.
    pub section_type: SectionType,

    /// Pattern strings (matching semantics not shown in this file).
    pub patterns: Vec<String>,

    /// Context requirements attached to the pattern.
    pub context_requirements: Vec<ContextRequirement>,

    /// Confidence value attached to the pattern.
    pub confidence: f32,
}
220
/// Header rule stored in `SectionIdentifier::header_rules`.
///
/// Not constructed or read by the code visible in this file.
#[derive(Debug, Clone)]
pub struct HeaderRule {
    /// Kind of header rule.
    pub rule_type: HeaderRuleType,

    /// Criteria text (format not shown in this file).
    pub criteria: String,

    /// Minimum confidence attached to the rule.
    pub min_confidence: f32,
}
233
/// Content classifier owned by `SectionIdentifier`.
///
/// Both fields are initialized empty by `ContentClassifier::new` and are not
/// read by the code visible in this file.
pub struct ContentClassifier {
    /// Classification models keyed by name.
    models: HashMap<String, ClassificationModel>,

    /// Text feature extractors.
    feature_extractors: Vec<TextFeatureExtractor>,
}
242
/// Layout categories; used as the key type of `ReadingOrderAnalyzer::strategies`
/// (hence the `Hash`/`Eq` derives).
#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)]
pub enum LayoutType {
    SingleColumn,
    MultiColumn,
    Magazine,
    Newspaper,
    Academic,
    Technical,
    Web,
}
254
/// Reading strategy stored per `LayoutType` in `ReadingOrderAnalyzer::strategies`.
///
/// Never constructed or read by the code visible in this file.
pub struct ReadingStrategy {
    /// Strategy name.
    name: String,

    /// Flow patterns of the strategy.
    flow_patterns: Vec<FlowPattern>,

    /// Priority rules of the strategy.
    priority_rules: Vec<PriorityRule>,
}
266
/// Flow detector owned by `ReadingOrderAnalyzer`.
///
/// Both fields are initialized empty by `FlowDetector::new` and are not read
/// by the code visible in this file.
pub struct FlowDetector {
    /// Flow-detection algorithms.
    algorithms: Vec<FlowDetectionAlgorithm>,

    /// Flow pattern matchers.
    pattern_matchers: Vec<FlowPatternMatcher>,
}
275
/// Region analyzer owned by `ReadingOrderAnalyzer`.
///
/// Both fields are initialized empty by `RegionAnalyzer::new` and are not read
/// by the code visible in this file.
pub struct RegionAnalyzer {
    /// Region classifiers.
    classifiers: Vec<RegionClassifier>,

    /// Relationship detectors.
    relationship_detectors: Vec<RelationshipDetector>,
}
284
/// Configuration record for one column-detection algorithm; stored in
/// `ColumnDetector::algorithms`.
///
/// Not constructed or read by the code visible in this file.
#[derive(Debug, Clone)]
pub struct ColumnDetectionAlgorithm {
    /// Algorithm name.
    pub name: String,

    /// Which family of algorithm this entry selects.
    pub algorithm_type: ColumnAlgorithmType,

    /// Named numeric parameters (recognized keys are not shown in this file).
    pub parameters: HashMap<String, f32>,
}
297
/// Layout classifier owned by `ColumnDetector`.
///
/// Both fields are initialized empty by `LayoutClassifier::new` and are not
/// read by the code visible in this file.
pub struct LayoutClassifier {
    /// Layout features.
    features: Vec<LayoutFeature>,

    /// Decision trees.
    decision_trees: Vec<DecisionTree>,
}
306
/// Spacing analyzer owned by `ColumnDetector`.
///
/// Initialized by `SpacingAnalyzer::new` with no metrics; the fields are not
/// read by the code visible in this file.
pub struct SpacingAnalyzer {
    /// Spacing metrics.
    metrics: Vec<SpacingMetric>,

    /// Threshold calculator helper.
    threshold_calculator: ThresholdCalculator,
}
315
/// Page classifier stored per document type in `PageAnalyzer::classifiers`.
///
/// Never constructed or read by the code visible in this file.
pub struct PageClassifier {
    /// Page patterns.
    patterns: Vec<PagePattern>,

    /// Feature weights keyed by feature name.
    feature_weights: HashMap<String, f32>,
}
324
/// Content-distribution analyzer owned by `PageAnalyzer`.
///
/// Both fields are initialized empty by `ContentDistributionAnalyzer::new` and
/// are not read by the code visible in this file.
pub struct ContentDistributionAnalyzer {
    /// Distribution metrics.
    metrics: Vec<DistributionMetric>,

    /// Balance calculators.
    balance_calculators: Vec<BalanceCalculator>,
}
333
/// Margin detector owned by `PageAnalyzer`.
///
/// Initialized by `MarginDetector::new` with no methods; the fields are not
/// read by the code visible in this file.
pub struct MarginDetector {
    /// Margin-detection methods.
    methods: Vec<MarginDetectionMethod>,

    /// Consistency checker helper.
    consistency_checker: ConsistencyChecker,
}
342
343impl DefaultLayoutAnalyzer {
344 pub fn new(config: LayoutAnalysisConfig) -> RragResult<Self> {
346 let structure_detector = StructureDetector::new()?;
347 let section_identifier = SectionIdentifier::new()?;
348 let reading_order_analyzer = ReadingOrderAnalyzer::new()?;
349 let column_detector = ColumnDetector::new()?;
350 let page_analyzer = PageAnalyzer::new()?;
351
352 Ok(Self {
353 config,
354 structure_detector,
355 section_identifier,
356 reading_order_analyzer,
357 column_detector,
358 page_analyzer,
359 })
360 }
361
362 pub async fn analyze_layout_comprehensive(
364 &self,
365 document_path: &Path,
366 ) -> RragResult<LayoutAnalysisResult> {
367 let start_time = std::time::Instant::now();
368
369 let content = self.extract_document_content(document_path).await?;
371
372 let structure = if self.config.detect_structure {
374 self.structure_detector.detect_structure(&content).await?
375 } else {
376 DocumentStructure::default()
377 };
378
379 let sections = if self.config.identify_sections {
381 self.section_identifier
382 .identify_sections(&content, &structure)
383 .await?
384 } else {
385 vec![]
386 };
387
388 let reading_order = if self.config.extract_reading_order {
390 self.reading_order_analyzer
391 .analyze_reading_order(&content, §ions)
392 .await?
393 } else {
394 (0..sections.len()).map(|i| i.to_string()).collect()
395 };
396
397 let columns = if self.config.detect_columns {
399 self.column_detector.detect_columns(&content).await?
400 } else {
401 None
402 };
403
404 let page_analysis = self.page_analyzer.analyze_pages(&content).await?;
406
407 let layout = DocumentLayout {
409 pages: page_analysis.page_count,
410 sections,
411 reading_order,
412 columns,
413 document_type: content.document_type,
414 };
415
416 let metrics = self.calculate_layout_metrics(&content, &layout)?;
418
419 let features = self.extract_layout_features(&content, &layout)?;
421
422 let confidence = self.calculate_analysis_confidence(&structure, &metrics, &features)?;
424
425 let processing_time = start_time.elapsed().as_millis() as u64;
426
427 Ok(LayoutAnalysisResult {
428 layout,
429 confidence,
430 processing_time_ms: processing_time,
431 metrics,
432 features,
433 warnings: vec![],
434 })
435 }
436
437 async fn extract_document_content(&self, document_path: &Path) -> RragResult<DocumentContent> {
439 let doc_type = self.detect_document_type(document_path)?;
441
442 match doc_type {
444 DocumentType::PDF => self.extract_pdf_content(document_path).await,
445 DocumentType::Word => self.extract_word_content(document_path).await,
446 DocumentType::HTML => self.extract_html_content(document_path).await,
447 DocumentType::Markdown => self.extract_markdown_content(document_path).await,
448 DocumentType::PlainText => self.extract_text_content(document_path).await,
449 _ => self.extract_generic_content(document_path).await,
450 }
451 }
452
453 fn calculate_layout_metrics(
455 &self,
456 content: &DocumentContent,
457 layout: &DocumentLayout,
458 ) -> RragResult<LayoutMetrics> {
459 let total_chars = content.text.len() as f32;
460 let total_area = content.page_width * content.page_height;
461
462 let text_density = total_chars / total_area;
464
465 let text_area = total_chars * 0.01; let white_space_ratio = 1.0 - (text_area / total_area).min(1.0);
468
469 let column_balance = if let Some(ref columns) = layout.columns {
471 self.calculate_column_balance(columns, &layout.sections)?
472 } else {
473 1.0
474 };
475
476 let reading_flow_score =
478 self.calculate_reading_flow_score(&layout.reading_order, &layout.sections)?;
479
480 let organization_score = self.calculate_organization_score(&layout.sections)?;
482
483 Ok(LayoutMetrics {
484 text_density,
485 white_space_ratio,
486 column_balance,
487 reading_flow_score,
488 organization_score,
489 })
490 }
491
492 fn extract_layout_features(
494 &self,
495 content: &DocumentContent,
496 layout: &DocumentLayout,
497 ) -> RragResult<LayoutFeatures> {
498 let has_headers_footers = content.has_headers || content.has_footers;
499 let has_columns = layout.columns.is_some();
500 let has_nested_sections = self.has_nested_sections(&layout.sections);
501 let consistent_formatting = self.check_formatting_consistency(content)?;
502 let content_balance = self.analyze_content_balance(content)?;
503 let complexity_level = self.assess_complexity_level(layout, content)?;
504
505 Ok(LayoutFeatures {
506 has_headers_footers,
507 has_columns,
508 has_nested_sections,
509 consistent_formatting,
510 content_balance,
511 complexity_level,
512 })
513 }
514
515 fn calculate_analysis_confidence(
517 &self,
518 structure: &DocumentStructure,
519 metrics: &LayoutMetrics,
520 features: &LayoutFeatures,
521 ) -> RragResult<f32> {
522 let mut confidence = 0.8; confidence *= structure.detection_confidence;
526
527 if metrics.organization_score > 0.8 {
529 confidence += 0.1;
530 }
531 if metrics.reading_flow_score > 0.8 {
532 confidence += 0.05;
533 }
534
535 if features.consistent_formatting {
537 confidence += 0.05;
538 }
539
540 Ok(confidence.min(1.0))
541 }
542
543 async fn extract_pdf_content(&self, _path: &Path) -> RragResult<DocumentContent> {
545 Ok(DocumentContent {
547 text: "PDF content".to_string(),
548 document_type: DocumentType::PDF,
549 page_count: 3,
550 page_width: 8.5,
551 page_height: 11.0,
552 has_headers: true,
553 has_footers: true,
554 formatting_info: FormattingInfo::default(),
555 })
556 }
557
558 async fn extract_word_content(&self, _path: &Path) -> RragResult<DocumentContent> {
559 Ok(DocumentContent {
560 text: "Word document content".to_string(),
561 document_type: DocumentType::Word,
562 page_count: 2,
563 page_width: 8.5,
564 page_height: 11.0,
565 has_headers: false,
566 has_footers: false,
567 formatting_info: FormattingInfo::default(),
568 })
569 }
570
571 async fn extract_html_content(&self, path: &Path) -> RragResult<DocumentContent> {
572 let html_content =
573 std::fs::read_to_string(path).map_err(|e| RragError::io_error(e.to_string()))?;
574
575 Ok(DocumentContent {
576 text: html_content,
577 document_type: DocumentType::HTML,
578 page_count: 1,
579 page_width: 12.0,
580 page_height: 16.0,
581 has_headers: false,
582 has_footers: false,
583 formatting_info: FormattingInfo::default(),
584 })
585 }
586
587 async fn extract_markdown_content(&self, path: &Path) -> RragResult<DocumentContent> {
588 let md_content =
589 std::fs::read_to_string(path).map_err(|e| RragError::io_error(e.to_string()))?;
590
591 Ok(DocumentContent {
592 text: md_content,
593 document_type: DocumentType::Markdown,
594 page_count: 1,
595 page_width: 10.0,
596 page_height: 12.0,
597 has_headers: false,
598 has_footers: false,
599 formatting_info: FormattingInfo::default(),
600 })
601 }
602
603 async fn extract_text_content(&self, path: &Path) -> RragResult<DocumentContent> {
604 let text_content =
605 std::fs::read_to_string(path).map_err(|e| RragError::io_error(e.to_string()))?;
606
607 Ok(DocumentContent {
608 text: text_content,
609 document_type: DocumentType::PlainText,
610 page_count: 1,
611 page_width: 8.0,
612 page_height: 10.0,
613 has_headers: false,
614 has_footers: false,
615 formatting_info: FormattingInfo::default(),
616 })
617 }
618
619 async fn extract_generic_content(&self, path: &Path) -> RragResult<DocumentContent> {
620 self.extract_text_content(path).await
621 }
622
623 fn detect_document_type(&self, file_path: &Path) -> RragResult<DocumentType> {
625 let extension = file_path
626 .extension()
627 .and_then(|ext| ext.to_str())
628 .unwrap_or("")
629 .to_lowercase();
630
631 match extension.as_str() {
632 "pdf" => Ok(DocumentType::PDF),
633 "doc" | "docx" => Ok(DocumentType::Word),
634 "ppt" | "pptx" => Ok(DocumentType::PowerPoint),
635 "html" | "htm" => Ok(DocumentType::HTML),
636 "md" => Ok(DocumentType::Markdown),
637 "txt" => Ok(DocumentType::PlainText),
638 _ => Ok(DocumentType::Mixed),
639 }
640 }
641
642 fn calculate_column_balance(
643 &self,
644 columns: &ColumnLayout,
645 sections: &[DocumentSection],
646 ) -> RragResult<f32> {
647 if columns.column_count <= 1 {
648 return Ok(1.0);
649 }
650
651 let mut column_content_lengths = vec![0; columns.column_count];
653
654 for section in sections {
655 let content_per_column = section.content.len() / columns.column_count;
657 for i in 0..columns.column_count {
658 column_content_lengths[i] += content_per_column;
659 }
660 }
661
662 let total_content: usize = column_content_lengths.iter().sum();
664 let mean_content = total_content as f32 / columns.column_count as f32;
665
666 let variance = column_content_lengths
667 .iter()
668 .map(|&len| (len as f32 - mean_content).powi(2))
669 .sum::<f32>()
670 / columns.column_count as f32;
671
672 let balance = 1.0 / (1.0 + variance / (mean_content * mean_content));
673 Ok(balance)
674 }
675
676 fn calculate_reading_flow_score(
677 &self,
678 reading_order: &[String],
679 sections: &[DocumentSection],
680 ) -> RragResult<f32> {
681 if reading_order.len() != sections.len() {
682 return Ok(0.5); }
684
685 let mut flow_score: f32 = 1.0;
687 let mut has_title = false;
688 let mut _has_abstract = false;
689 let mut has_intro = false;
690 let mut has_conclusion = false;
691
692 for section_id in reading_order {
693 if let Some(section) = sections.iter().find(|s| s.id == *section_id) {
694 match section.section_type {
695 SectionType::Title => has_title = true,
696 SectionType::Abstract => {
697 if !has_title {
698 flow_score -= 0.1; }
700 _has_abstract = true;
701 }
702 SectionType::Introduction => {
703 if has_conclusion {
704 flow_score -= 0.2; }
706 has_intro = true;
707 }
708 SectionType::Conclusion => has_conclusion = true,
709 _ => {}
710 }
711 }
712 }
713
714 if has_title {
716 flow_score += 0.1;
717 }
718 if has_intro {
719 flow_score += 0.1;
720 }
721 if has_conclusion {
722 flow_score += 0.1;
723 }
724
725 Ok(flow_score.max(0.0).min(1.0))
726 }
727
728 fn calculate_organization_score(&self, sections: &[DocumentSection]) -> RragResult<f32> {
729 if sections.is_empty() {
730 return Ok(0.0);
731 }
732
733 let mut score = 0.8; let has_hierarchy = sections.iter().any(|s| s.level > 1);
737 if has_hierarchy {
738 score += 0.1;
739 }
740
741 let section_types: std::collections::HashSet<SectionType> =
743 sections.iter().map(|s| s.section_type).collect();
744
745 let type_diversity = section_types.len() as f32 / 6.0; score += type_diversity * 0.1;
747
748 Ok(score.min(1.0))
749 }
750
751 fn has_nested_sections(&self, sections: &[DocumentSection]) -> bool {
752 sections.iter().any(|s| s.level > 1)
753 }
754
755 fn check_formatting_consistency(&self, content: &DocumentContent) -> RragResult<bool> {
756 Ok(content.formatting_info.has_consistent_fonts
758 && content.formatting_info.has_consistent_spacing)
759 }
760
761 fn analyze_content_balance(&self, content: &DocumentContent) -> RragResult<ContentBalance> {
762 let text_length = content.text.len();
763
764 if text_length > 10000 {
766 Ok(ContentBalance::TextHeavy)
767 } else if text_length < 1000 {
768 Ok(ContentBalance::VisualHeavy)
769 } else {
770 Ok(ContentBalance::Balanced)
771 }
772 }
773
774 fn assess_complexity_level(
775 &self,
776 layout: &DocumentLayout,
777 content: &DocumentContent,
778 ) -> RragResult<ComplexityLevel> {
779 let mut complexity_score = 0;
780
781 complexity_score += layout.sections.len();
783
784 if let Some(ref columns) = layout.columns {
786 complexity_score += columns.column_count * 2;
787 }
788
789 let max_level = layout.sections.iter().map(|s| s.level).max().unwrap_or(1);
791 complexity_score += max_level * 2;
792
793 complexity_score += (content.text.len() / 1000).min(10);
795
796 match complexity_score {
797 0..=5 => Ok(ComplexityLevel::Simple),
798 6..=15 => Ok(ComplexityLevel::Moderate),
799 16..=25 => Ok(ComplexityLevel::Complex),
800 _ => Ok(ComplexityLevel::VeryComplex),
801 }
802 }
803}
804
805impl LayoutAnalyzer for DefaultLayoutAnalyzer {
806 fn analyze_layout(&self, document_path: &Path) -> RragResult<DocumentLayout> {
807 let content = DocumentContent {
809 text: "Sample content".to_string(),
810 document_type: self.detect_document_type(document_path)?,
811 page_count: 1,
812 page_width: 8.5,
813 page_height: 11.0,
814 has_headers: false,
815 has_footers: false,
816 formatting_info: FormattingInfo::default(),
817 };
818
819 let sections = vec![DocumentSection {
820 id: "section_0".to_string(),
821 title: Some("Main Content".to_string()),
822 content: content.text.clone(),
823 section_type: SectionType::Body,
824 level: 1,
825 page_range: (1, 1),
826 }];
827
828 Ok(DocumentLayout {
829 pages: content.page_count,
830 sections,
831 reading_order: vec!["section_0".to_string()],
832 columns: None,
833 document_type: content.document_type,
834 })
835 }
836
837 fn detect_sections(&self, content: &str) -> RragResult<Vec<DocumentSection>> {
838 let sections = vec![DocumentSection {
840 id: "section_0".to_string(),
841 title: None,
842 content: content.to_string(),
843 section_type: SectionType::Body,
844 level: 1,
845 page_range: (1, 1),
846 }];
847
848 Ok(sections)
849 }
850
851 fn extract_reading_order(&self, layout: &DocumentLayout) -> RragResult<Vec<String>> {
852 Ok(layout.sections.iter().map(|s| s.id.clone()).collect())
853 }
854}
855
/// Content extracted from a document, as consumed by the analysis stages.
#[derive(Debug, Clone)]
pub struct DocumentContent {
    /// Document text. The PDF/Word extractors in this file fill in placeholder
    /// strings; the HTML/Markdown/plain-text extractors store the raw file contents.
    pub text: String,
    /// Type the content was extracted as.
    pub document_type: DocumentType,
    /// Number of pages.
    pub page_count: usize,
    /// Page width (unit is not stated in this file — TODO confirm).
    pub page_width: f32,
    /// Page height (same unit as `page_width`).
    pub page_height: f32,
    /// Whether the document has headers.
    pub has_headers: bool,
    /// Whether the document has footers.
    pub has_footers: bool,
    /// Formatting summary used for the consistency check.
    pub formatting_info: FormattingInfo,
}
868
/// Structure detected for a document.
#[derive(Debug, Clone)]
pub struct DocumentStructure {
    /// Confidence of the detection; scales the base value in
    /// `calculate_analysis_confidence` (0.8 from the detector, 0.5 by default).
    pub detection_confidence: f32,
    /// Hierarchy levels; always empty in the code visible in this file.
    pub hierarchy_levels: Vec<HierarchyLevel>,
    /// Structural elements; always empty in the code visible in this file.
    pub structural_elements: Vec<StructuralElement>,
}
875
/// One level of a document hierarchy (not constructed in the code visible here).
#[derive(Debug, Clone)]
pub struct HierarchyLevel {
    /// Level number.
    pub level: usize,
    /// Elements at this level (what the strings identify is not shown here — TODO confirm).
    pub elements: Vec<String>,
}
881
/// A positioned structural element (not constructed in the code visible here).
#[derive(Debug, Clone)]
pub struct StructuralElement {
    /// Element type name (set of valid values not shown in this file).
    pub element_type: String,
    /// Where the element sits.
    pub position: ElementPosition,
    /// Free-form string properties.
    pub properties: HashMap<String, String>,
}
888
/// Page number plus a rectangle for an element.
///
/// NOTE(review): units, origin and page-number base are not shown in this
/// file — confirm before relying on them.
#[derive(Debug, Clone)]
pub struct ElementPosition {
    /// Page the element is on.
    pub page: usize,
    /// Horizontal position.
    pub x: f32,
    /// Vertical position.
    pub y: f32,
    /// Width of the element.
    pub width: f32,
    /// Height of the element.
    pub height: f32,
}
897
/// Formatting summary attached to `DocumentContent`.
#[derive(Debug, Clone)]
pub struct FormattingInfo {
    /// Fonts are consistent; read by `check_formatting_consistency`.
    pub has_consistent_fonts: bool,
    /// Spacing is consistent; read by `check_formatting_consistency`.
    pub has_consistent_spacing: bool,
    /// Colors are consistent; not read by the code visible in this file.
    pub has_consistent_colors: bool,
    /// Font family names (defaults to a single "Arial" entry).
    pub font_families: Vec<String>,
    /// Font sizes (defaults to a single 12.0 entry).
    pub font_sizes: Vec<f32>,
}
906
/// Result of `PageAnalyzer::analyze_pages`.
#[derive(Debug, Clone)]
pub struct PageAnalysis {
    /// Number of pages; copied into `DocumentLayout::pages`.
    pub page_count: usize,
    /// One entry per page; the analyzer in this file marks every page `ContentPage`.
    pub page_types: Vec<PageType>,
    /// Share of each content kind.
    pub content_distribution: ContentDistribution,
}
913
/// Share of each content kind on the analyzed pages.
///
/// Despite the `_percentage` names, the analyzer in this file fills these with
/// fractions (0.8 / 0.1 / 0.05 / 0.05, summing to 1.0).
#[derive(Debug, Clone)]
pub struct ContentDistribution {
    /// Share taken by text.
    pub text_percentage: f32,
    /// Share taken by images.
    pub image_percentage: f32,
    /// Share taken by tables.
    pub table_percentage: f32,
    /// Share taken by whitespace.
    pub whitespace_percentage: f32,
}
921
/// Text/visual balance as classified by `analyze_content_balance`.
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
pub enum ContentBalance {
    /// Text longer than 10000 bytes.
    TextHeavy,
    /// Text shorter than 1000 bytes.
    VisualHeavy,
    /// Text between 1000 and 10000 bytes (inclusive).
    Balanced,
}
929
/// Complexity bucket as assigned by `assess_complexity_level`.
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
pub enum ComplexityLevel {
    /// Complexity score 0 to 5.
    Simple,
    /// Complexity score 6 to 15.
    Moderate,
    /// Complexity score 16 to 25.
    Complex,
    /// Complexity score above 25.
    VeryComplex,
}
937
/// Kind of page. The page analyzer in this file only ever produces `ContentPage`.
#[derive(Debug, Clone, Copy)]
pub enum PageType {
    TitlePage,
    ContentPage,
    /// Table-of-contents page.
    TOCPage,
    IndexPage,
    AppendixPage,
}
946
/// Kind of criterion a `HeaderRule` is based on (no rule evaluation is visible
/// in this file).
#[derive(Debug, Clone, Copy)]
pub enum HeaderRuleType {
    FontSize,
    FontWeight,
    Capitalization,
    Positioning,
    Numbering,
}
955
/// Family of algorithm selected by a `ColumnDetectionAlgorithm` entry (none of
/// them is implemented in the code visible in this file).
#[derive(Debug, Clone, Copy)]
pub enum ColumnAlgorithmType {
    WhitespaceAnalysis,
    TextBlockAlignment,
    StatisticalClustering,
    GeometricDetection,
}
963
/// Condition half of a `LayoutRule`.
///
/// NOTE(review): how each payload is compared (exact match, threshold, ...)
/// is not shown in this file.
#[derive(Debug, Clone)]
pub enum RuleCondition {
    /// Condition on a text pattern.
    TextPattern(String),
    /// Condition on a font size.
    FontSize(f32),
    /// Condition on a position given as two coordinates.
    Position(f32, f32),
    /// Condition on a content length.
    ContentLength(usize),
}
971
/// Action half of a `LayoutRule` (no code applying these actions is visible in
/// this file).
#[derive(Debug, Clone)]
pub enum RuleAction {
    /// Classify the matched content as the given section type.
    ClassifyAsSection(SectionType),
    /// Assign the given hierarchy level.
    SetHierarchyLevel(usize),
    /// Mark the matched content as a header.
    MarkAsHeader,
    /// Mark the matched content as a footer.
    MarkAsFooter,
}
979
980impl StructureDetector {
982 pub fn new() -> RragResult<Self> {
983 Ok(Self {
984 hierarchy_patterns: vec![],
985 type_classifiers: HashMap::new(),
986 layout_rules: vec![],
987 })
988 }
989
990 pub async fn detect_structure(
991 &self,
992 _content: &DocumentContent,
993 ) -> RragResult<DocumentStructure> {
994 Ok(DocumentStructure {
995 detection_confidence: 0.8,
996 hierarchy_levels: vec![],
997 structural_elements: vec![],
998 })
999 }
1000}
1001
1002impl SectionIdentifier {
1003 pub fn new() -> RragResult<Self> {
1004 Ok(Self {
1005 section_patterns: HashMap::new(),
1006 header_rules: vec![],
1007 content_classifier: ContentClassifier::new(),
1008 })
1009 }
1010
1011 pub async fn identify_sections(
1012 &self,
1013 content: &DocumentContent,
1014 _structure: &DocumentStructure,
1015 ) -> RragResult<Vec<DocumentSection>> {
1016 Ok(vec![DocumentSection {
1017 id: "section_0".to_string(),
1018 title: Some("Main Content".to_string()),
1019 content: content.text.clone(),
1020 section_type: SectionType::Body,
1021 level: 1,
1022 page_range: (1, content.page_count),
1023 }])
1024 }
1025}
1026
1027impl ReadingOrderAnalyzer {
1028 pub fn new() -> RragResult<Self> {
1029 Ok(Self {
1030 strategies: HashMap::new(),
1031 flow_detector: FlowDetector::new(),
1032 region_analyzer: RegionAnalyzer::new(),
1033 })
1034 }
1035
1036 pub async fn analyze_reading_order(
1037 &self,
1038 _content: &DocumentContent,
1039 sections: &[DocumentSection],
1040 ) -> RragResult<Vec<String>> {
1041 Ok(sections.iter().map(|s| s.id.clone()).collect())
1042 }
1043}
1044
1045impl ColumnDetector {
1046 pub fn new() -> RragResult<Self> {
1047 Ok(Self {
1048 algorithms: vec![],
1049 layout_classifier: LayoutClassifier::new(),
1050 spacing_analyzer: SpacingAnalyzer::new(),
1051 })
1052 }
1053
1054 pub async fn detect_columns(
1055 &self,
1056 content: &DocumentContent,
1057 ) -> RragResult<Option<ColumnLayout>> {
1058 if content.page_width > 10.0 && content.text.len() > 5000 {
1060 Ok(Some(ColumnLayout {
1061 column_count: 2,
1062 column_widths: vec![0.48, 0.48],
1063 gutter_width: 0.04,
1064 }))
1065 } else {
1066 Ok(None)
1067 }
1068 }
1069}
1070
1071impl PageAnalyzer {
1072 pub fn new() -> RragResult<Self> {
1073 Ok(Self {
1074 classifiers: HashMap::new(),
1075 distribution_analyzer: ContentDistributionAnalyzer::new(),
1076 margin_detector: MarginDetector::new(),
1077 })
1078 }
1079
1080 pub async fn analyze_pages(&self, content: &DocumentContent) -> RragResult<PageAnalysis> {
1081 Ok(PageAnalysis {
1082 page_count: content.page_count,
1083 page_types: vec![PageType::ContentPage; content.page_count],
1084 content_distribution: ContentDistribution {
1085 text_percentage: 0.8,
1086 image_percentage: 0.1,
1087 table_percentage: 0.05,
1088 whitespace_percentage: 0.05,
1089 },
1090 })
1091 }
1092}
1093
1094impl Default for DocumentStructure {
1096 fn default() -> Self {
1097 Self {
1098 detection_confidence: 0.5,
1099 hierarchy_levels: vec![],
1100 structural_elements: vec![],
1101 }
1102 }
1103}
1104
1105impl Default for FormattingInfo {
1106 fn default() -> Self {
1107 Self {
1108 has_consistent_fonts: true,
1109 has_consistent_spacing: true,
1110 has_consistent_colors: true,
1111 font_families: vec!["Arial".to_string()],
1112 font_sizes: vec![12.0],
1113 }
1114 }
1115}
1116
1117impl ContentClassifier {
1119 pub fn new() -> Self {
1120 Self {
1121 models: HashMap::new(),
1122 feature_extractors: Vec::new(),
1123 }
1124 }
1125}
1126
1127impl FlowDetector {
1128 pub fn new() -> Self {
1129 Self {
1130 algorithms: Vec::new(),
1131 pattern_matchers: Vec::new(),
1132 }
1133 }
1134}
1135
1136impl RegionAnalyzer {
1137 pub fn new() -> Self {
1138 Self {
1139 classifiers: Vec::new(),
1140 relationship_detectors: Vec::new(),
1141 }
1142 }
1143}
1144
1145impl LayoutClassifier {
1146 pub fn new() -> Self {
1147 Self {
1148 features: Vec::new(),
1149 decision_trees: Vec::new(),
1150 }
1151 }
1152}
1153
1154impl SpacingAnalyzer {
1155 pub fn new() -> Self {
1156 Self {
1157 metrics: Vec::new(),
1158 threshold_calculator: ThresholdCalculator::new(),
1159 }
1160 }
1161}
1162
1163impl ContentDistributionAnalyzer {
1164 pub fn new() -> Self {
1165 Self {
1166 metrics: Vec::new(),
1167 balance_calculators: Vec::new(),
1168 }
1169 }
1170}
1171
1172impl MarginDetector {
1173 pub fn new() -> Self {
1174 Self {
1175 methods: Vec::new(),
1176 consistency_checker: ConsistencyChecker::new(),
1177 }
1178 }
1179}
1180
// Field-less placeholder types. Each one only serves as the element or field
// type of a component declared earlier in this file; none carries data or
// behavior in the code visible here.

/// Element type of `TypeClassifier::rules`.
pub struct ClassificationRule;
/// Element type of `TypeClassifier::feature_extractors`.
pub struct FeatureExtractor;
/// Value type of `ContentClassifier::models`.
pub struct ClassificationModel;
/// Element type of `ContentClassifier::feature_extractors`.
pub struct TextFeatureExtractor;
/// Element type of `ReadingStrategy::flow_patterns`.
pub struct FlowPattern;
/// Element type of `ReadingStrategy::priority_rules`.
pub struct PriorityRule;
/// Element type of `FlowDetector::algorithms`.
pub struct FlowDetectionAlgorithm;
/// Element type of `FlowDetector::pattern_matchers`.
pub struct FlowPatternMatcher;
/// Element type of `RegionAnalyzer::classifiers`.
pub struct RegionClassifier;
/// Element type of `RegionAnalyzer::relationship_detectors`.
pub struct RelationshipDetector;
/// Element type of `LayoutClassifier::features`.
pub struct LayoutFeature;
/// Element type of `LayoutClassifier::decision_trees`.
pub struct DecisionTree;
/// Element type of `SpacingAnalyzer::metrics`.
pub struct SpacingMetric;
/// Type of `SpacingAnalyzer::threshold_calculator`.
pub struct ThresholdCalculator;
/// Element type of `PageClassifier::patterns`.
pub struct PagePattern;
/// Element type of `ContentDistributionAnalyzer::metrics`.
pub struct DistributionMetric;
/// Element type of `ContentDistributionAnalyzer::balance_calculators`.
pub struct BalanceCalculator;
/// Element type of `MarginDetector::methods`.
pub struct MarginDetectionMethod;
/// Type of `MarginDetector::consistency_checker`.
pub struct ConsistencyChecker;
1201
1202impl ThresholdCalculator {
1203 pub fn new() -> Self {
1204 Self
1205 }
1206}
1207
1208impl ConsistencyChecker {
1209 pub fn new() -> Self {
1210 Self
1211 }
1212}
/// Field-less placeholder; element type of `SectionPattern::context_requirements`.
#[derive(Debug, Clone)]
pub struct ContextRequirement;
1215
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::NamedTempFile;

    /// Analyzer built from the default configuration.
    fn default_analyzer() -> DefaultLayoutAnalyzer {
        DefaultLayoutAnalyzer::new(LayoutAnalysisConfig::default()).unwrap()
    }

    /// Single-page, letter-sized plain-text content holding `text`.
    fn plain_text_content(text: &str) -> DocumentContent {
        DocumentContent {
            text: text.to_string(),
            document_type: DocumentType::PlainText,
            page_count: 1,
            page_width: 8.5,
            page_height: 11.0,
            has_headers: false,
            has_footers: false,
            formatting_info: FormattingInfo::default(),
        }
    }

    /// The default configuration enables structure and section detection.
    #[test]
    fn test_layout_analyzer_creation() {
        let analyzer = default_analyzer();

        assert!(analyzer.config.detect_structure);
        assert!(analyzer.config.identify_sections);
    }

    /// File extensions map to the expected document types.
    #[test]
    fn test_document_type_detection() {
        let analyzer = default_analyzer();

        let detected_pdf = analyzer
            .detect_document_type(std::path::Path::new("test.pdf"))
            .unwrap();
        assert_eq!(detected_pdf, DocumentType::PDF);

        let detected_md = analyzer
            .detect_document_type(std::path::Path::new("test.md"))
            .unwrap();
        assert_eq!(detected_md, DocumentType::Markdown);
    }

    /// Very short text is classified as visual-heavy.
    #[test]
    fn test_content_balance_analysis() {
        let analyzer = default_analyzer();
        let short_content = plain_text_content("Short");

        let balance = analyzer.analyze_content_balance(&short_content).unwrap();
        assert!(matches!(balance, ContentBalance::VisualHeavy));
    }

    /// An empty layout with a short text is classified as simple.
    #[test]
    fn test_complexity_assessment() {
        let analyzer = default_analyzer();

        let empty_layout = DocumentLayout {
            pages: 1,
            sections: vec![],
            reading_order: vec![],
            columns: None,
            document_type: DocumentType::PlainText,
        };
        let simple_content = plain_text_content("Simple content");

        let complexity = analyzer
            .assess_complexity_level(&empty_layout, &simple_content)
            .unwrap();
        assert!(matches!(complexity, ComplexityLevel::Simple));
    }
}