1use crate::graphics::extraction::{ExtractedGraphics, LineOrientation, VectorLine};
47use crate::text::extraction::TextFragment;
48use thiserror::Error;
49
50#[derive(Debug, Error)]
52pub enum TableDetectionError {
53 #[error("Invalid coordinate value: expected valid f64, found NaN or Infinity")]
55 InvalidCoordinate,
56
57 #[error("Invalid grid: {0}")]
59 InvalidGrid(String),
60
61 #[error("Internal error: {0}")]
63 InternalError(String),
64}
65
66#[derive(Debug, Clone)]
68pub struct TableDetectionConfig {
69 pub min_rows: usize,
71 pub min_columns: usize,
73 pub alignment_tolerance: f64,
75 pub min_table_area: f64,
77 pub detect_borderless: bool,
79}
80
81impl Default for TableDetectionConfig {
82 fn default() -> Self {
83 Self {
84 min_rows: 2,
85 min_columns: 2,
86 alignment_tolerance: 2.0, min_table_area: 1000.0, detect_borderless: false, }
90 }
91}
92
93#[derive(Debug, Clone)]
95pub struct DetectedTable {
96 pub bbox: BoundingBox,
98 pub cells: Vec<TableCell>,
100 pub rows: usize,
102 pub columns: usize,
104 pub confidence: f64,
106}
107
108impl DetectedTable {
109 pub fn new(bbox: BoundingBox, cells: Vec<TableCell>, rows: usize, columns: usize) -> Self {
111 let confidence = Self::calculate_confidence(&cells, rows, columns);
112 Self {
113 bbox,
114 cells,
115 rows,
116 columns,
117 confidence,
118 }
119 }
120
121 pub fn row_count(&self) -> usize {
123 self.rows
124 }
125
126 pub fn column_count(&self) -> usize {
128 self.columns
129 }
130
131 pub fn get_cell(&self, row: usize, col: usize) -> Option<&TableCell> {
133 if row >= self.rows || col >= self.columns {
134 return None;
135 }
136 let index = row * self.columns + col;
137 self.cells.get(index)
138 }
139
140 fn calculate_confidence(cells: &[TableCell], rows: usize, columns: usize) -> f64 {
142 if rows == 0 || columns == 0 {
143 return 0.0;
144 }
145
146 let total_cells = rows * columns;
147 let populated_cells = cells.iter().filter(|c| !c.text.is_empty()).count();
148
149 let population_ratio = populated_cells as f64 / total_cells as f64;
151
152 let size_bonus = ((rows + columns) as f64 / 10.0).min(0.2);
154
155 (population_ratio + size_bonus).min(1.0)
156 }
157}
158
159#[derive(Debug, Clone)]
161pub struct TableCell {
162 pub row: usize,
164 pub column: usize,
166 pub bbox: BoundingBox,
168 pub text: String,
170 pub has_borders: bool,
172}
173
174impl TableCell {
175 pub fn new(row: usize, column: usize, bbox: BoundingBox) -> Self {
177 Self {
178 row,
179 column,
180 bbox,
181 text: String::new(),
182 has_borders: false,
183 }
184 }
185
186 pub fn set_text(&mut self, text: String) {
188 self.text = text;
189 }
190
191 pub fn is_empty(&self) -> bool {
193 self.text.is_empty()
194 }
195}
196
197#[derive(Debug, Clone, Copy)]
199pub struct BoundingBox {
200 pub x: f64,
202 pub y: f64,
204 pub width: f64,
206 pub height: f64,
208}
209
210impl BoundingBox {
211 pub fn new(x: f64, y: f64, width: f64, height: f64) -> Self {
213 Self {
214 x,
215 y,
216 width,
217 height,
218 }
219 }
220
221 pub fn right(&self) -> f64 {
223 self.x + self.width
224 }
225
226 pub fn top(&self) -> f64 {
228 self.y + self.height
229 }
230
231 pub fn contains_point(&self, px: f64, py: f64) -> bool {
233 px >= self.x && px <= self.right() && py >= self.y && py <= self.top()
234 }
235
236 pub fn area(&self) -> f64 {
238 self.width * self.height
239 }
240}
241
242pub struct TableDetector {
244 config: TableDetectionConfig,
245}
246
247impl TableDetector {
248 pub fn new(config: TableDetectionConfig) -> Self {
250 Self { config }
251 }
252
253 pub fn default() -> Self {
255 Self::new(TableDetectionConfig::default())
256 }
257
258 pub fn detect(
269 &self,
270 graphics: &ExtractedGraphics,
271 text_fragments: &[TextFragment],
272 ) -> Result<Vec<DetectedTable>, TableDetectionError> {
273 let mut tables = Vec::new();
274
275 if !graphics.has_table_structure() {
277 return Ok(tables);
278 }
279
280 if let Some(table) = self.detect_bordered_table(graphics, text_fragments)? {
282 tables.push(table);
283 }
284
285 if self.config.detect_borderless {
287 }
292
293 tables.sort_by(|a, b| {
295 b.confidence
296 .partial_cmp(&a.confidence)
297 .unwrap_or(std::cmp::Ordering::Equal)
298 });
299
300 Ok(tables)
301 }
302
303 fn detect_bordered_table(
305 &self,
306 graphics: &ExtractedGraphics,
307 text_fragments: &[TextFragment],
308 ) -> Result<Option<DetectedTable>, TableDetectionError> {
309 let h_lines: Vec<&VectorLine> = graphics.horizontal_lines().collect();
311 let v_lines: Vec<&VectorLine> = graphics.vertical_lines().collect();
312
313 let grid = self.detect_grid_pattern(&h_lines, &v_lines)?;
315
316 if grid.rows.len() < self.config.min_rows || grid.columns.len() < self.config.min_columns {
317 return Ok(None);
318 }
319
320 let cells = self.create_cells_from_grid(&grid);
322
323 let cells_with_text = self.assign_text_to_cells(cells, text_fragments);
325
326 let bbox = self.calculate_table_bbox(&grid)?;
328
329 if bbox.area() < self.config.min_table_area {
331 return Ok(None);
332 }
333
334 let num_rows = grid.rows.len().saturating_sub(1);
336 let num_cols = grid.columns.len().saturating_sub(1);
337
338 let table = DetectedTable::new(bbox, cells_with_text, num_rows, num_cols);
339
340 Ok(Some(table))
341 }
342
343 fn detect_grid_pattern(
345 &self,
346 h_lines: &[&VectorLine],
347 v_lines: &[&VectorLine],
348 ) -> Result<GridPattern, TableDetectionError> {
349 let mut rows = self.cluster_lines_by_position(h_lines, LineOrientation::Horizontal)?;
351
352 let columns = self.cluster_lines_by_position(v_lines, LineOrientation::Vertical)?;
354
355 rows.reverse();
357
358 Ok(GridPattern { rows, columns })
359 }
360
361 fn cluster_lines_by_position(
363 &self,
364 lines: &[&VectorLine],
365 orientation: LineOrientation,
366 ) -> Result<Vec<f64>, TableDetectionError> {
367 if lines.is_empty() {
368 return Ok(vec![]);
369 }
370
371 let mut positions: Vec<f64> = lines
373 .iter()
374 .map(|line| match orientation {
375 LineOrientation::Horizontal => line.y1, LineOrientation::Vertical => line.x1, _ => 0.0,
378 })
379 .collect();
380
381 positions.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
383
384 if positions.iter().any(|p| !p.is_finite()) {
386 return Err(TableDetectionError::InvalidCoordinate);
387 }
388
389 let mut clusters: Vec<Vec<f64>> = vec![vec![positions[0]]];
391
392 for &pos in &positions[1..] {
393 let last_cluster = clusters.last_mut().ok_or_else(|| {
394 TableDetectionError::InternalError("cluster list unexpectedly empty".to_string())
395 })?;
396 let cluster_mean = last_cluster.iter().sum::<f64>() / last_cluster.len() as f64;
397
398 if (pos - cluster_mean).abs() <= self.config.alignment_tolerance {
399 last_cluster.push(pos);
401 } else {
402 clusters.push(vec![pos]);
404 }
405 }
406
407 Ok(clusters
409 .iter()
410 .map(|cluster| cluster.iter().sum::<f64>() / cluster.len() as f64)
411 .collect())
412 }
413
414 fn create_cells_from_grid(&self, grid: &GridPattern) -> Vec<TableCell> {
416 let mut cells = Vec::new();
417
418 let num_rows = grid.rows.len().saturating_sub(1);
420 let num_cols = grid.columns.len().saturating_sub(1);
421
422 if num_rows == 0 || num_cols == 0 {
423 return cells;
424 }
425
426 for row_idx in 0..num_rows {
428 let y1 = grid.rows[row_idx];
429 let y2 = grid.rows[row_idx + 1];
430
431 let row_y = y1.min(y2);
433 let row_height = (y2 - y1).abs();
434
435 for col_idx in 0..num_cols {
436 let col_x = grid.columns[col_idx];
437 let col_width = (grid.columns[col_idx + 1] - col_x).abs();
438
439 let bbox = BoundingBox::new(col_x, row_y, col_width, row_height);
440 let mut cell = TableCell::new(row_idx, col_idx, bbox);
441 cell.has_borders = true;
442
443 cells.push(cell);
444 }
445 }
446
447 cells
448 }
449
450 fn assign_text_to_cells(
457 &self,
458 mut cells: Vec<TableCell>,
459 text_fragments: &[TextFragment],
460 ) -> Vec<TableCell> {
461 if text_fragments.is_empty() || cells.is_empty() {
462 return cells;
463 }
464
465 let normalized_fragments = normalize_coordinates_if_needed(&cells, text_fragments);
467
468 for cell in &mut cells {
469 let mut cell_texts = Vec::new();
470
471 for fragment in &normalized_fragments {
472 let center_x = fragment.x + fragment.width / 2.0;
474 let center_y = fragment.y + fragment.height / 2.0;
475
476 if cell.bbox.contains_point(center_x, center_y) {
477 cell_texts.push(fragment.text.clone());
478 }
479 }
480
481 if !cell_texts.is_empty() {
482 cell.text = cell_texts.join(" ");
483 }
484 }
485
486 cells
487 }
488
489 fn calculate_table_bbox(&self, grid: &GridPattern) -> Result<BoundingBox, TableDetectionError> {
491 let min_x = *grid
492 .columns
493 .first()
494 .ok_or_else(|| TableDetectionError::InvalidGrid("no columns".to_string()))?;
495 let max_x = *grid
496 .columns
497 .last()
498 .ok_or_else(|| TableDetectionError::InvalidGrid("no columns".to_string()))?;
499
500 let first_y = *grid
502 .rows
503 .first()
504 .ok_or_else(|| TableDetectionError::InvalidGrid("no rows".to_string()))?;
505 let last_y = *grid
506 .rows
507 .last()
508 .ok_or_else(|| TableDetectionError::InvalidGrid("no rows".to_string()))?;
509 let min_y = first_y.min(last_y);
510 let max_y = first_y.max(last_y);
511
512 Ok(BoundingBox::new(min_x, min_y, max_x - min_x, max_y - min_y))
513 }
514}
515
516struct GridPattern {
518 rows: Vec<f64>,
520 columns: Vec<f64>,
522}
523
524impl Default for TableDetector {
525 fn default() -> Self {
526 Self::new(TableDetectionConfig::default())
527 }
528}
529
530#[cfg(test)]
531mod tests {
532 use super::*;
533
534 #[test]
535 fn test_bounding_box_contains_point() {
536 let bbox = BoundingBox::new(100.0, 100.0, 100.0, 50.0);
537
538 assert!(bbox.contains_point(150.0, 125.0)); assert!(bbox.contains_point(100.0, 100.0)); assert!(bbox.contains_point(200.0, 150.0)); assert!(!bbox.contains_point(50.0, 125.0)); assert!(!bbox.contains_point(250.0, 125.0)); assert!(!bbox.contains_point(150.0, 50.0)); assert!(!bbox.contains_point(150.0, 200.0)); }
546
547 #[test]
548 fn test_bounding_box_area() {
549 let bbox = BoundingBox::new(0.0, 0.0, 100.0, 50.0);
550 assert!((bbox.area() - 5000.0).abs() < 0.01);
551 }
552
553 #[test]
554 fn test_table_cell_new() {
555 let bbox = BoundingBox::new(0.0, 0.0, 50.0, 25.0);
556 let cell = TableCell::new(1, 2, bbox);
557
558 assert_eq!(cell.row, 1);
559 assert_eq!(cell.column, 2);
560 assert!(cell.is_empty());
561 assert!(!cell.has_borders);
562 }
563
564 #[test]
565 fn test_table_cell_set_text() {
566 let bbox = BoundingBox::new(0.0, 0.0, 50.0, 25.0);
567 let mut cell = TableCell::new(0, 0, bbox);
568
569 cell.set_text("Test".to_string());
570 assert_eq!(cell.text, "Test");
571 assert!(!cell.is_empty());
572 }
573
574 #[test]
575 fn test_detected_table_get_cell() {
576 let bbox = BoundingBox::new(0.0, 0.0, 200.0, 100.0);
577 let cells = vec![
578 TableCell::new(0, 0, BoundingBox::new(0.0, 0.0, 100.0, 50.0)),
579 TableCell::new(0, 1, BoundingBox::new(100.0, 0.0, 100.0, 50.0)),
580 TableCell::new(1, 0, BoundingBox::new(0.0, 50.0, 100.0, 50.0)),
581 TableCell::new(1, 1, BoundingBox::new(100.0, 50.0, 100.0, 50.0)),
582 ];
583
584 let table = DetectedTable::new(bbox, cells, 2, 2);
585
586 assert_eq!(table.row_count(), 2);
587 assert_eq!(table.column_count(), 2);
588
589 let cell = table.get_cell(0, 0).expect("cell (0,0) should exist");
590 assert_eq!(cell.row, 0);
591 assert_eq!(cell.column, 0);
592
593 assert!(table.get_cell(2, 0).is_none()); assert!(table.get_cell(0, 2).is_none()); }
596
597 #[test]
598 fn test_table_detection_config_default() {
599 let config = TableDetectionConfig::default();
600 assert_eq!(config.min_rows, 2);
601 assert_eq!(config.min_columns, 2);
602 assert_eq!(config.alignment_tolerance, 2.0);
603 assert!(!config.detect_borderless);
604 }
605}
606
607fn normalize_coordinates_if_needed(
620 cells: &[TableCell],
621 text_fragments: &[TextFragment],
622) -> Vec<TextFragment> {
623 let cell_bbox = calculate_combined_bbox_cells(cells);
625 let text_bbox = calculate_combined_bbox_fragments(text_fragments);
626
627 let x_overlap = text_bbox.0 < cell_bbox.2 && text_bbox.2 > cell_bbox.0;
629 let y_overlap = text_bbox.1 < cell_bbox.3 && text_bbox.3 > cell_bbox.1;
630
631 if x_overlap && y_overlap {
633 return text_fragments.to_vec();
634 }
635
636 let text_width = text_bbox.2 - text_bbox.0;
638 let text_height = text_bbox.3 - text_bbox.1;
639 let cell_width = cell_bbox.2 - cell_bbox.0;
640 let cell_height = cell_bbox.3 - cell_bbox.1;
641
642 let scale_x = if text_width > 0.0 {
643 cell_width / text_width
644 } else {
645 1.0
646 };
647 let scale_y = if text_height > 0.0 {
648 cell_height / text_height
649 } else {
650 1.0
651 };
652
653 let translate_x = cell_bbox.0 - (text_bbox.0 * scale_x);
654 let translate_y = cell_bbox.1 - (text_bbox.1 * scale_y);
655
656 text_fragments
658 .iter()
659 .map(|frag| TextFragment {
660 text: frag.text.clone(),
661 x: frag.x * scale_x + translate_x,
662 y: frag.y * scale_y + translate_y,
663 width: frag.width * scale_x,
664 height: frag.height * scale_y,
665 font_size: frag.font_size,
666 font_name: frag.font_name.clone(),
667 is_bold: frag.is_bold,
668 is_italic: frag.is_italic,
669 color: frag.color,
670 })
671 .collect()
672}
673
674fn calculate_combined_bbox_cells(cells: &[TableCell]) -> (f64, f64, f64, f64) {
676 let min_x = cells.iter().map(|c| c.bbox.x).fold(f64::INFINITY, f64::min);
677 let max_x = cells
678 .iter()
679 .map(|c| c.bbox.right())
680 .fold(f64::NEG_INFINITY, f64::max);
681 let min_y = cells.iter().map(|c| c.bbox.y).fold(f64::INFINITY, f64::min);
682 let max_y = cells
683 .iter()
684 .map(|c| c.bbox.top())
685 .fold(f64::NEG_INFINITY, f64::max);
686 (min_x, min_y, max_x, max_y)
687}
688
689fn calculate_combined_bbox_fragments(fragments: &[TextFragment]) -> (f64, f64, f64, f64) {
691 let min_x = fragments.iter().map(|f| f.x).fold(f64::INFINITY, f64::min);
692 let max_x = fragments
693 .iter()
694 .map(|f| f.x + f.width)
695 .fold(f64::NEG_INFINITY, f64::max);
696 let min_y = fragments.iter().map(|f| f.y).fold(f64::INFINITY, f64::min);
697 let max_y = fragments
698 .iter()
699 .map(|f| f.y + f.height)
700 .fold(f64::NEG_INFINITY, f64::max);
701 (min_x, min_y, max_x, max_y)
702}