Skip to main content

oxidize_pdf/text/structured/
detector.rs

1//! Main detection engine for structured data extraction.
2
3use super::keyvalue;
4use super::layout;
5use super::table;
6use super::types::{StructuredDataConfig, StructuredDataResult};
7use crate::text::extraction::TextFragment;
8
9/// Main detector for structured data patterns in PDF text.
10///
11/// This detector analyzes text fragments to identify:
12/// - Tables (using spatial clustering)
13/// - Key-value pairs (using pattern matching)
14/// - Multi-column layouts (using gap analysis)
15///
16/// # Examples
17///
18/// ```rust,no_run
19/// use oxidize_pdf::text::structured::{StructuredDataDetector, StructuredDataConfig};
20/// use oxidize_pdf::text::extraction::TextFragment;
21///
22/// let config = StructuredDataConfig::default();
23/// let detector = StructuredDataDetector::new(config);
24///
25/// let fragments: Vec<TextFragment> = vec![]; // from PDF extraction
26/// let result = detector.detect(&fragments)?;
27///
28/// for table in &result.tables {
29///     println!("Table: {}x{} rows (confidence: {:.2})",
30///         table.row_count(), table.column_count(), table.confidence);
31/// }
32/// # Ok::<(), Box<dyn std::error::Error>>(())
33/// ```
34#[derive(Debug, Clone)]
35pub struct StructuredDataDetector {
36    config: StructuredDataConfig,
37}
38
39impl StructuredDataDetector {
40    /// Creates a new detector with the given configuration.
41    pub fn new(config: StructuredDataConfig) -> Self {
42        Self { config }
43    }
44
45    /// Creates a new detector with default configuration.
46    pub fn default() -> Self {
47        Self::new(StructuredDataConfig::default())
48    }
49
50    /// Detects structured data patterns in the given text fragments.
51    ///
52    /// This is the main entry point for structured data extraction.
53    /// It analyzes the text fragments and returns all detected patterns.
54    ///
55    /// # Arguments
56    ///
57    /// * `fragments` - Text fragments extracted from a PDF page
58    ///
59    /// # Returns
60    ///
61    /// A `StructuredDataResult` containing all detected patterns.
62    ///
63    /// # Errors
64    ///
65    /// Returns an error if the detection algorithms fail (currently infallible).
66    pub fn detect(&self, fragments: &[TextFragment]) -> Result<StructuredDataResult, String> {
67        let mut result = StructuredDataResult::new();
68
69        // Skip empty input
70        if fragments.is_empty() {
71            return Ok(result);
72        }
73
74        // Detect tables
75        if self.config.detect_tables {
76            result.tables = table::detect_tables(fragments, &self.config);
77        }
78
79        // Detect key-value pairs
80        if self.config.detect_key_value {
81            result.key_value_pairs = keyvalue::detect_key_value_pairs(fragments, &self.config);
82        }
83
84        // Detect multi-column layouts
85        if self.config.detect_multi_column {
86            result.column_sections = layout::detect_column_layout(fragments, &self.config);
87        }
88
89        Ok(result)
90    }
91
92    /// Gets the current configuration.
93    pub fn config(&self) -> &StructuredDataConfig {
94        &self.config
95    }
96
97    /// Updates the configuration.
98    pub fn set_config(&mut self, config: StructuredDataConfig) {
99        self.config = config;
100    }
101}
102
103impl Default for StructuredDataDetector {
104    fn default() -> Self {
105        Self::new(StructuredDataConfig::default())
106    }
107}
108
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds the lone "Name: John" fragment fed to the selective-detection test.
    fn name_fragment() -> TextFragment {
        TextFragment {
            text: "Name: John".to_string(),
            x: 100.0,
            y: 700.0,
            width: 50.0,
            height: 12.0,
            font_size: 12.0,
            font_name: None,
            is_bold: false,
            is_italic: false,
            color: None,
        }
    }

    /// A default-constructed detector has all three detection flags switched on.
    #[test]
    fn test_detector_creation() {
        let detector = StructuredDataDetector::default();
        let cfg = detector.config();

        assert!(cfg.detect_tables);
        assert!(cfg.detect_key_value);
        assert!(cfg.detect_multi_column);
    }

    /// An empty fragment slice yields a result with nothing in any category.
    #[test]
    fn test_detector_empty_input() {
        let outcome = StructuredDataDetector::default()
            .detect(&[])
            .expect("detector should handle empty input");

        assert_eq!(outcome.tables.len(), 0);
        assert_eq!(outcome.key_value_pairs.len(), 0);
        assert_eq!(outcome.column_sections.len(), 0);
    }

    /// `set_config` replaces the configuration that `config()` reports.
    #[test]
    fn test_detector_config_update() {
        let mut replacement = StructuredDataConfig::default();
        replacement.detect_tables = false;

        let mut detector = StructuredDataDetector::default();
        detector.set_config(replacement);

        assert!(!detector.config().detect_tables);
    }

    /// Detectors whose flag is off leave their part of the result empty.
    #[test]
    fn test_detector_selective_detection() {
        let only_key_value = StructuredDataConfig::default()
            .with_table_detection(false)
            .with_key_value_detection(true)
            .with_multi_column_detection(false);
        let detector = StructuredDataDetector::new(only_key_value);

        let fragments = [name_fragment()];
        let outcome = detector
            .detect(&fragments)
            .expect("detect should succeed with valid input");

        // Table detection is off, so no tables may be reported.
        assert_eq!(outcome.tables.len(), 0);
        // Key-value detection is on and may or may not match this fragment;
        // the matching itself is covered by the keyvalue module's own tests.
        // Multi-column detection is off as well.
        assert_eq!(outcome.column_sections.len(), 0);
    }
}