Skip to main content

memvid_core/table/
layout.rs

1//! PDF layout extraction utilities.
2//!
3//! This module extracts text boxes and line segments from PDFs,
4//! providing the raw layout data needed for table detection.
5
6use crate::error::{MemvidError, Result};
7
/// A positioned run of text taken from a PDF page.
///
/// The box is anchored at its lower-left corner (`x`, `y`) and extends
/// `width` to the right and `height` upwards, following PDF coordinates.
#[derive(Debug, Clone)]
pub struct TextBox {
    /// The text carried by this box.
    pub text: String,
    /// X coordinate of the left edge, in points.
    pub x: f32,
    /// Y coordinate of the bottom edge, in points (PDF coordinates).
    pub y: f32,
    /// Horizontal extent, in points.
    pub width: f32,
    /// Vertical extent, in points.
    pub height: f32,
    /// Font size (if available).
    pub font_size: f32,
    /// 1-indexed number of the page this box belongs to.
    pub page: u32,
}

impl TextBox {
    /// X coordinate of the right edge (`x + width`).
    #[must_use]
    pub fn right(&self) -> f32 {
        let Self { x, width, .. } = self;
        x + width
    }

    /// Y coordinate of the top edge (`y + height`).
    #[must_use]
    pub fn top(&self) -> f32 {
        let Self { y, height, .. } = self;
        y + height
    }

    /// X coordinate of the horizontal midpoint.
    #[must_use]
    pub fn center_x(&self) -> f32 {
        let half_width = self.width / 2.0;
        self.x + half_width
    }

    /// Y coordinate of the vertical midpoint.
    #[must_use]
    pub fn center_y(&self) -> f32 {
        let half_height = self.height / 2.0;
        self.y + half_height
    }

    /// Returns `true` when the two rectangles share interior area.
    ///
    /// All comparisons are strict, so boxes that merely touch along an edge
    /// do not count as overlapping.
    #[must_use]
    pub fn overlaps(&self, other: &Self) -> bool {
        let x_ranges_intersect = self.x < other.right() && other.x < self.right();
        let y_ranges_intersect = self.y < other.top() && other.y < self.top();
        x_ranges_intersect && y_ranges_intersect
    }
}
61
/// A straight stroke taken from a PDF path object (input to Lattice detection).
#[derive(Debug, Clone)]
pub struct LineSegment {
    /// X coordinate of the start point.
    pub x1: f32,
    /// Y coordinate of the start point.
    pub y1: f32,
    /// X coordinate of the end point.
    pub x2: f32,
    /// Y coordinate of the end point.
    pub y2: f32,
    /// 1-indexed number of the page this segment belongs to.
    pub page: u32,
}

impl LineSegment {
    /// Signed extent of the segment along each axis, as `(dx, dy)`.
    fn deltas(&self) -> (f32, f32) {
        (self.x2 - self.x1, self.y2 - self.y1)
    }

    /// Returns `true` when both endpoints lie within `tolerance` of each
    /// other vertically, i.e. the segment is (nearly) horizontal.
    #[must_use]
    pub fn is_horizontal(&self, tolerance: f32) -> bool {
        let (_, dy) = self.deltas();
        dy.abs() <= tolerance
    }

    /// Returns `true` when both endpoints lie within `tolerance` of each
    /// other horizontally, i.e. the segment is (nearly) vertical.
    #[must_use]
    pub fn is_vertical(&self, tolerance: f32) -> bool {
        let (dx, _) = self.deltas();
        dx.abs() <= tolerance
    }

    /// Euclidean distance between the two endpoints.
    #[must_use]
    pub fn length(&self) -> f32 {
        let (dx, dy) = self.deltas();
        let squared = dx.powi(2) + dy.powi(2);
        squared.sqrt()
    }

    /// Representative Y coordinate (mean of both endpoints); meaningful for
    /// horizontal lines.
    #[must_use]
    pub fn y_coord(&self) -> f32 {
        self.y1.midpoint(self.y2)
    }

    /// Representative X coordinate (mean of both endpoints); meaningful for
    /// vertical lines.
    #[must_use]
    pub fn x_coord(&self) -> f32 {
        self.x1.midpoint(self.x2)
    }
}
108
109/// Layout information for a single page.
110#[derive(Debug, Clone)]
111pub struct PageLayout {
112    /// Page number (1-indexed)
113    pub page_number: u32,
114    /// Page width in points
115    pub width: f32,
116    /// Page height in points
117    pub height: f32,
118    /// Text boxes on this page
119    pub text_boxes: Vec<TextBox>,
120    /// Line segments on this page (for Lattice detection)
121    pub lines: Vec<LineSegment>,
122}
123
124impl PageLayout {
125    /// Create an empty page layout.
126    #[must_use]
127    pub fn new(page_number: u32, width: f32, height: f32) -> Self {
128        Self {
129            page_number,
130            width,
131            height,
132            text_boxes: Vec::new(),
133            lines: Vec::new(),
134        }
135    }
136
137    /// Check if the page has any content.
138    #[must_use]
139    pub fn is_empty(&self) -> bool {
140        self.text_boxes.is_empty() && self.lines.is_empty()
141    }
142
143    /// Get horizontal lines (filtered by tolerance).
144    #[must_use]
145    pub fn horizontal_lines(&self, tolerance: f32) -> Vec<&LineSegment> {
146        self.lines
147            .iter()
148            .filter(|l| l.is_horizontal(tolerance))
149            .collect()
150    }
151
152    /// Get vertical lines (filtered by tolerance).
153    #[must_use]
154    pub fn vertical_lines(&self, tolerance: f32) -> Vec<&LineSegment> {
155        self.lines
156            .iter()
157            .filter(|l| l.is_vertical(tolerance))
158            .collect()
159    }
160
161    /// Check if this page likely has ruled tables (significant line count).
162    #[must_use]
163    pub fn has_ruled_structure(&self, min_lines: usize, tolerance: f32) -> bool {
164        let h_count = self.horizontal_lines(tolerance).len();
165        let v_count = self.vertical_lines(tolerance).len();
166        h_count >= min_lines && v_count >= min_lines
167    }
168}
169
/// Build a [`PageLayout`] for each page of a PDF using pdfium.
///
/// Compiled only when the `pdfium` feature is enabled, where it is the
/// primary extraction path.
///
/// `max_pages` caps how many pages are read, starting from the first; a value
/// of `0` means "all pages". For every page the text objects that carry
/// non-blank text become [`TextBox`]es and path objects are scanned for
/// straight [`LineSegment`]s.
///
/// # Errors
///
/// Returns [`MemvidError::TableExtraction`] when the document cannot be
/// loaded or a page cannot be fetched.
// NOTE(review): `Pdfium::default()` is used as-is; whether it can panic when
// no Pdfium library is available depends on the pdfium-render version in
// use — confirm.
#[cfg(feature = "pdfium")]
pub fn extract_pdf_layout(bytes: &[u8], max_pages: usize) -> Result<Vec<PageLayout>> {
    use pdfium_render::prelude::*;

    let pdfium = Pdfium::default();
    let document = pdfium
        .load_pdf_from_byte_slice(bytes, None)
        .map_err(|e| MemvidError::TableExtraction {
            reason: format!("failed to load PDF: {e}"),
        })?;

    // Zero means "no limit"; anything else caps the page count.
    let total_pages = document.pages().len() as usize;
    let pages_to_process = match max_pages {
        0 => total_pages,
        limit => total_pages.min(limit),
    };

    let mut layouts = Vec::with_capacity(pages_to_process);

    for page_idx in 0..pages_to_process {
        let page = document.pages().get(page_idx as u16).map_err(|e| {
            MemvidError::TableExtraction {
                reason: format!("failed to get page {}: {e}", page_idx + 1),
            }
        })?;

        let page_number = (page_idx + 1) as u32;
        let page_width = page.width().value;
        let page_height = page.height().value;
        let mut layout = PageLayout::new(page_number, page_width, page_height);

        for object in page.objects().iter() {
            // Text objects: keep those with usable bounds and non-blank text.
            let positioned_text = object
                .as_text_object()
                .and_then(|text_obj| object.bounds().ok().map(|bounds| (text_obj, bounds)));

            if let Some((text_obj, bounds)) = positioned_text {
                let text = text_obj.text();
                if !text.trim().is_empty() {
                    let left = bounds.left().value;
                    let bottom = bounds.bottom().value;
                    layout.text_boxes.push(TextBox {
                        text,
                        x: left,
                        y: bottom,
                        width: bounds.right().value - left,
                        height: bounds.top().value - bottom,
                        font_size: text_obj.unscaled_font_size().value,
                        page: page_number,
                    });
                }
            }

            // Path objects: harvest straight strokes for line detection.
            if let Some(path_obj) = object.as_path_object() {
                extract_lines_from_path(&path_obj, page_number, &mut layout.lines);
            }
        }

        layouts.push(layout);
    }

    Ok(layouts)
}
240
/// Extract straight line segments from a PDF path object.
///
/// Walks the path's segments while tracking the current point and the start
/// of the current sub-path, appending to `lines`:
///
/// * every `LineTo` edge, and
/// * the implicit edge back to the sub-path start when a segment carries the
///   close flag (this is how the final side of a closed shape such as a
///   rectangle is expressed),
///
/// provided the edge is longer than 5 points. Bezier segments only advance
/// the current point; segments of unknown type are ignored.
///
/// `page` is copied into each emitted [`LineSegment`].
#[cfg(feature = "pdfium")]
fn extract_lines_from_path(
    path: &pdfium_render::prelude::PdfPagePathObject,
    page: u32,
    lines: &mut Vec<LineSegment>,
) {
    use pdfium_render::prelude::*;

    /// Edges at or below this length (in points) are treated as noise.
    const MIN_LINE_LENGTH: f32 = 5.0;

    /// Append the edge `from -> to` if it is long enough to matter.
    fn push_if_significant(
        from: (f32, f32),
        to: (f32, f32),
        page: u32,
        lines: &mut Vec<LineSegment>,
    ) {
        let length = ((to.0 - from.0).powi(2) + (to.1 - from.1).powi(2)).sqrt();
        if length > MIN_LINE_LENGTH {
            lines.push(LineSegment {
                x1: from.0,
                y1: from.1,
                x2: to.0,
                y2: to.1,
                page,
            });
        }
    }

    let mut subpath_start = (0.0f32, 0.0f32);
    let mut current = subpath_start;

    for segment in path.segments().iter() {
        match segment.segment_type() {
            PdfPathSegmentType::MoveTo => {
                // A move starts a new sub-path; remember where it began so a
                // later close can connect back to it.
                current = (segment.x().value, segment.y().value);
                subpath_start = current;
            }
            PdfPathSegmentType::LineTo => {
                let next = (segment.x().value, segment.y().value);
                push_if_significant(current, next, page, lines);
                current = next;
            }
            PdfPathSegmentType::BezierTo => {
                // Curves are rarely used for table borders: just advance to
                // the curve's endpoint.
                current = (segment.x().value, segment.y().value);
            }
            _ => {
                // Unknown segment type: nothing to track.
                continue;
            }
        }

        // Closing a sub-path draws an implicit straight edge back to its
        // starting point. If the path already returned there explicitly the
        // edge has zero length and is filtered out.
        if segment.is_close() {
            push_if_significant(current, subpath_start, page, lines);
            current = subpath_start;
        }
    }
}
299
300/// Fallback layout extraction using lopdf when pdfium is not available.
301///
302/// This provides basic text extraction with whitespace-based column detection.
303/// While not as accurate as pdfium's native text positioning, it can still
304/// detect tables with consistent column alignment.
305#[cfg(not(feature = "pdfium"))]
306pub fn extract_pdf_layout(bytes: &[u8], max_pages: usize) -> Result<Vec<PageLayout>> {
307    use lopdf::Document;
308
309    let document = Document::load_mem(bytes).map_err(|e| MemvidError::TableExtraction {
310        reason: format!("failed to load PDF with lopdf: {e}"),
311    })?;
312
313    let page_count = document.get_pages().len();
314    let pages_to_process = if max_pages > 0 {
315        page_count.min(max_pages)
316    } else {
317        page_count
318    };
319
320    let mut layouts = Vec::with_capacity(pages_to_process);
321
322    for page_idx in 0..pages_to_process {
323        let page_number = u32::try_from(page_idx + 1).unwrap_or(0);
324
325        // Get page dimensions (default to standard US Letter if not available)
326        let (width, height) = get_page_dimensions(&document, page_idx).unwrap_or((612.0, 792.0));
327
328        let mut layout = PageLayout::new(page_number, width, height);
329
330        // Extract text and parse whitespace-delimited columns
331        if let Ok(text) = document.extract_text(&[page_number]) {
332            let lines: Vec<&str> = text.lines().collect();
333            let line_height = if lines.is_empty() {
334                12.0
335            } else {
336                (height - 144.0) / lines.len() as f32 // Leave margins
337            };
338
339            for (line_idx, line) in lines.iter().enumerate() {
340                if line.trim().is_empty() {
341                    continue;
342                }
343
344                // Parse the line into whitespace-separated columns
345                // Use multiple spaces as column delimiter (common in PDF text extraction)
346                let text_boxes = parse_line_into_columns(
347                    line,
348                    line_idx,
349                    page_number,
350                    width,
351                    height,
352                    line_height,
353                );
354
355                layout.text_boxes.extend(text_boxes);
356            }
357        }
358
359        // lopdf doesn't easily expose path objects for line detection
360        layout.lines = Vec::new();
361
362        layouts.push(layout);
363    }
364
365    Ok(layouts)
366}
367
368/// Parse a line into separate text boxes based on whitespace patterns.
369///
370/// This uses multiple spaces (2+) as column delimiters, which is common
371/// in PDF text extraction output for tabular data.
372#[cfg(not(feature = "pdfium"))]
373fn parse_line_into_columns(
374    line: &str,
375    line_idx: usize,
376    page: u32,
377    page_width: f32,
378    page_height: f32,
379    line_height: f32,
380) -> Vec<TextBox> {
381    let mut boxes = Vec::new();
382    let y = page_height - 72.0 - (line_idx as f32 * line_height);
383
384    // Split on 2+ spaces to detect column boundaries
385    let re_split: Vec<&str> = line.split("  ").collect();
386
387    if re_split.len() > 1 {
388        // Multiple columns detected - assign positions based on split points
389        let usable_width = page_width - 144.0; // Leave 1-inch margins on each side
390        let col_width = usable_width / re_split.len() as f32;
391
392        for (col_idx, col_text) in re_split.iter().enumerate() {
393            let trimmed = col_text.trim();
394            if !trimmed.is_empty() {
395                let x = 72.0 + (col_idx as f32 * col_width);
396                boxes.push(TextBox {
397                    text: trimmed.to_string(),
398                    x,
399                    y,
400                    width: col_width * 0.9, // Slightly smaller than slot
401                    height: line_height,
402                    font_size: 12.0,
403                    page,
404                });
405            }
406        }
407    } else {
408        // Single column - try splitting on tabs or check for number patterns
409        let tab_split: Vec<&str> = line.split('\t').collect();
410
411        if tab_split.len() > 1 {
412            // Tab-separated
413            let usable_width = page_width - 144.0;
414            let col_width = usable_width / tab_split.len() as f32;
415
416            for (col_idx, col_text) in tab_split.iter().enumerate() {
417                let trimmed = col_text.trim();
418                if !trimmed.is_empty() {
419                    let x = 72.0 + (col_idx as f32 * col_width);
420                    boxes.push(TextBox {
421                        text: trimmed.to_string(),
422                        x,
423                        y,
424                        width: col_width * 0.9,
425                        height: line_height,
426                        font_size: 12.0,
427                        page,
428                    });
429                }
430            }
431        } else {
432            // Single text span - place at left margin
433            let trimmed = line.trim();
434            if !trimmed.is_empty() {
435                boxes.push(TextBox {
436                    text: trimmed.to_string(),
437                    x: 72.0,
438                    y,
439                    width: page_width - 144.0,
440                    height: line_height,
441                    font_size: 12.0,
442                    page,
443                });
444            }
445        }
446    }
447
448    boxes
449}
450
451/// Get page dimensions from a lopdf document.
452#[cfg(not(feature = "pdfium"))]
453fn get_page_dimensions(document: &lopdf::Document, page_idx: usize) -> Option<(f32, f32)> {
454    let pages = document.get_pages();
455    let page_id = *pages.get(&u32::try_from(page_idx + 1).unwrap_or(0))?;
456
457    if let Ok(page) = document.get_dictionary(page_id) {
458        if let Ok(media_box) = page.get(b"MediaBox") {
459            if let lopdf::Object::Array(arr) = media_box {
460                if arr.len() >= 4 {
461                    let width = match &arr[2] {
462                        lopdf::Object::Integer(n) => *n as f32,
463                        lopdf::Object::Real(n) => *n,
464                        _ => return None,
465                    };
466                    let height = match &arr[3] {
467                        lopdf::Object::Integer(n) => *n as f32,
468                        lopdf::Object::Real(n) => *n,
469                        _ => return None,
470                    };
471                    return Some((width, height));
472                }
473            }
474        }
475    }
476    None
477}
478
/// Cluster values using a simple distance-based algorithm.
///
/// The values are sorted and then swept once: a value joins the current
/// cluster when it lies within `threshold` of the previous (sorted) value,
/// otherwise it starts a new cluster. Because each value is compared with its
/// immediate neighbour, a cluster may span more than `threshold` overall.
///
/// Returns the arithmetic mean of every cluster, in ascending order. NaN
/// inputs carry no position and are ignored; an empty (or all-NaN) input
/// yields an empty vector.
#[must_use]
pub fn cluster_values(values: &[f32], threshold: f32) -> Vec<f32> {
    // NaN has no place in a total order and would poison a centroid, so it is
    // dropped up front; `total_cmp` then gives the sort a total order.
    let mut sorted: Vec<f32> = values.iter().copied().filter(|v| !v.is_nan()).collect();
    sorted.sort_unstable_by(f32::total_cmp);

    let mut centroids = Vec::new();
    let mut cluster_start = 0;

    // `end` is the exclusive end of the cluster being grown; the cluster is
    // finished when the data runs out or the next gap exceeds the threshold.
    for end in 1..=sorted.len() {
        let continues = end < sorted.len() && sorted[end] - sorted[end - 1] <= threshold;
        if !continues {
            let cluster = &sorted[cluster_start..end];
            centroids.push(cluster.iter().sum::<f32>() / cluster.len() as f32);
            cluster_start = end;
        }
    }

    centroids
}
515
/// Keep only the candidates that are backed by enough reference values.
///
/// A candidate survives when at least `min_occurrences` entries of
/// `reference_values` lie within `threshold` of it (inclusive). Survivors are
/// returned in their original order.
#[allow(dead_code)]
pub fn filter_consistent_values(
    candidates: &[f32],
    reference_values: &[f32],
    threshold: f32,
    min_occurrences: usize,
) -> Vec<f32> {
    // How many reference values sit close enough to `candidate`?
    let support_for = |candidate: f32| {
        reference_values
            .iter()
            .filter(|reference| (**reference - candidate).abs() <= threshold)
            .count()
    };

    let mut kept = Vec::new();
    for &candidate in candidates {
        if support_for(candidate) >= min_occurrences {
            kept.push(candidate);
        }
    }
    kept
}
536
#[cfg(test)]
mod tests {
    use super::*;

    /// Absolute tolerance for exact geometry checks.
    const EPS: f32 = 0.001;

    /// Returns `true` when `actual` is within `tolerance` of `expected`.
    fn near(actual: f32, expected: f32, tolerance: f32) -> bool {
        (actual - expected).abs() < tolerance
    }

    /// Build a line segment on page 1.
    fn segment(x1: f32, y1: f32, x2: f32, y2: f32) -> LineSegment {
        LineSegment {
            x1,
            y1,
            x2,
            y2,
            page: 1,
        }
    }

    #[test]
    fn test_text_box_geometry() {
        let tbox = TextBox {
            text: "Hello".to_string(),
            x: 100.0,
            y: 200.0,
            width: 50.0,
            height: 20.0,
            font_size: 12.0,
            page: 1,
        };

        assert!(near(tbox.right(), 150.0, EPS));
        assert!(near(tbox.top(), 220.0, EPS));
        assert!(near(tbox.center_x(), 125.0, EPS));
        assert!(near(tbox.center_y(), 210.0, EPS));
    }

    #[test]
    fn test_line_segment_orientation() {
        let h_line = segment(0.0, 100.0, 200.0, 100.0);
        assert!(h_line.is_horizontal(1.0));
        assert!(!h_line.is_vertical(1.0));

        let v_line = segment(100.0, 0.0, 100.0, 200.0);
        assert!(!v_line.is_horizontal(1.0));
        assert!(v_line.is_vertical(1.0));
    }

    #[test]
    fn test_cluster_values() {
        let values = [10.0, 11.0, 12.0, 50.0, 51.0, 100.0];
        let clusters = cluster_values(&values, 5.0);

        assert_eq!(clusters.len(), 3);
        // Centroids near 11, 50.5 and 100 respectively.
        assert!(near(clusters[0], 11.0, 1.0));
        assert!(near(clusters[1], 50.5, 1.0));
        assert!(near(clusters[2], 100.0, 1.0));
    }

    #[test]
    fn test_page_layout_line_filtering() {
        let mut layout = PageLayout::new(1, 612.0, 792.0);
        layout.lines.extend([
            segment(0.0, 100.0, 200.0, 100.0), // horizontal
            segment(100.0, 0.0, 100.0, 200.0), // vertical
            segment(0.0, 0.0, 200.0, 200.0),   // diagonal
        ]);

        assert_eq!(layout.horizontal_lines(2.0).len(), 1);
        assert_eq!(layout.vertical_lines(2.0).len(), 1);
    }
}