1use crate::graphics::extraction::{ExtractedGraphics, LineOrientation, VectorLine};
47use crate::text::extraction::TextFragment;
48use thiserror::Error;
49
50#[derive(Debug, Error)]
52pub enum TableDetectionError {
53 #[error("Invalid coordinate value: expected valid f64, found NaN or Infinity")]
55 InvalidCoordinate,
56
57 #[error("Invalid grid: {0}")]
59 InvalidGrid(String),
60
61 #[error("Internal error: {0}")]
63 InternalError(String),
64}
65
/// Tunable thresholds and switches for table detection.
#[derive(Debug, Clone)]
pub struct TableDetectionConfig {
    /// Lower bound applied to the row dimension of a detected grid before it
    /// is accepted as a table.
    pub min_rows: usize,
    /// Lower bound applied to the column dimension of a detected grid before
    /// it is accepted as a table.
    pub min_columns: usize,
    /// Largest distance between a line position and the mean of a cluster for
    /// the line to be merged into that cluster.
    pub alignment_tolerance: f64,
    /// Candidate tables whose bounding-box area is below this value are dropped.
    pub min_table_area: f64,
    /// Requests detection of tables without ruling lines.
    pub detect_borderless: bool,
}

impl Default for TableDetectionConfig {
    /// Returns the default thresholds: at least 2 rows and 2 columns, an
    /// alignment tolerance of 2.0, a minimum area of 1000.0, and borderless
    /// detection switched off.
    fn default() -> Self {
        TableDetectionConfig {
            min_rows: 2,
            min_columns: 2,
            alignment_tolerance: 2.0,
            min_table_area: 1000.0,
            detect_borderless: false,
        }
    }
}
92
93#[derive(Debug, Clone)]
95pub struct DetectedTable {
96 pub bbox: BoundingBox,
98 pub cells: Vec<TableCell>,
100 pub rows: usize,
102 pub columns: usize,
104 pub confidence: f64,
106}
107
108impl DetectedTable {
109 pub fn new(bbox: BoundingBox, cells: Vec<TableCell>, rows: usize, columns: usize) -> Self {
111 let confidence = Self::calculate_confidence(&cells, rows, columns);
112 Self {
113 bbox,
114 cells,
115 rows,
116 columns,
117 confidence,
118 }
119 }
120
121 pub fn row_count(&self) -> usize {
123 self.rows
124 }
125
126 pub fn column_count(&self) -> usize {
128 self.columns
129 }
130
131 pub fn get_cell(&self, row: usize, col: usize) -> Option<&TableCell> {
133 if row >= self.rows || col >= self.columns {
134 return None;
135 }
136 let index = row * self.columns + col;
137 self.cells.get(index)
138 }
139
140 fn calculate_confidence(cells: &[TableCell], rows: usize, columns: usize) -> f64 {
142 if rows == 0 || columns == 0 {
143 return 0.0;
144 }
145
146 let total_cells = rows * columns;
147 let populated_cells = cells.iter().filter(|c| !c.text.is_empty()).count();
148
149 let population_ratio = populated_cells as f64 / total_cells as f64;
151
152 let size_bonus = ((rows + columns) as f64 / 10.0).min(0.2);
154
155 (population_ratio + size_bonus).min(1.0)
156 }
157}
158
159#[derive(Debug, Clone)]
161pub struct TableCell {
162 pub row: usize,
164 pub column: usize,
166 pub bbox: BoundingBox,
168 pub text: String,
170 pub has_borders: bool,
172}
173
174impl TableCell {
175 pub fn new(row: usize, column: usize, bbox: BoundingBox) -> Self {
177 Self {
178 row,
179 column,
180 bbox,
181 text: String::new(),
182 has_borders: false,
183 }
184 }
185
186 pub fn set_text(&mut self, text: String) {
188 self.text = text;
189 }
190
191 pub fn is_empty(&self) -> bool {
193 self.text.is_empty()
194 }
195}
196
/// Axis-aligned rectangle described by an origin corner and a size.
#[derive(Debug, Clone, Copy)]
pub struct BoundingBox {
    /// X coordinate of the origin corner.
    pub x: f64,
    /// Y coordinate of the origin corner.
    pub y: f64,
    /// Extent along the x axis.
    pub width: f64,
    /// Extent along the y axis.
    pub height: f64,
}

impl BoundingBox {
    /// Creates a box from its origin corner and size.
    pub fn new(x: f64, y: f64, width: f64, height: f64) -> Self {
        BoundingBox { x, y, width, height }
    }

    /// X coordinate of the edge opposite the origin (`x + width`).
    pub fn right(&self) -> f64 {
        self.x + self.width
    }

    /// Y coordinate of the edge opposite the origin (`y + height`).
    pub fn top(&self) -> f64 {
        self.y + self.height
    }

    /// Returns `true` when (`px`, `py`) lies inside the box or on its edge.
    pub fn contains_point(&self, px: f64, py: f64) -> bool {
        let within_x = (self.x..=self.right()).contains(&px);
        let within_y = (self.y..=self.top()).contains(&py);
        within_x && within_y
    }

    /// Area of the box (`width * height`).
    pub fn area(&self) -> f64 {
        self.width * self.height
    }
}
241
242pub struct TableDetector {
244 config: TableDetectionConfig,
245}
246
247impl TableDetector {
248 pub fn new(config: TableDetectionConfig) -> Self {
250 Self { config }
251 }
252
253 pub fn default() -> Self {
255 Self::new(TableDetectionConfig::default())
256 }
257
258 pub fn detect(
269 &self,
270 graphics: &ExtractedGraphics,
271 text_fragments: &[TextFragment],
272 ) -> Result<Vec<DetectedTable>, TableDetectionError> {
273 let mut tables = Vec::new();
274
275 if !graphics.has_table_structure() {
277 return Ok(tables);
278 }
279
280 if let Some(table) = self.detect_bordered_table(graphics, text_fragments)? {
282 tables.push(table);
283 }
284
285 if self.config.detect_borderless {
287 }
292
293 tables.sort_by(|a, b| b.confidence.total_cmp(&a.confidence));
296
297 Ok(tables)
298 }
299
300 fn detect_bordered_table(
302 &self,
303 graphics: &ExtractedGraphics,
304 text_fragments: &[TextFragment],
305 ) -> Result<Option<DetectedTable>, TableDetectionError> {
306 let h_lines: Vec<&VectorLine> = graphics.horizontal_lines().collect();
308 let v_lines: Vec<&VectorLine> = graphics.vertical_lines().collect();
309
310 let grid = self.detect_grid_pattern(&h_lines, &v_lines)?;
312
313 if grid.rows.len() < self.config.min_rows || grid.columns.len() < self.config.min_columns {
314 return Ok(None);
315 }
316
317 let cells = self.create_cells_from_grid(&grid);
319
320 let cells_with_text = self.assign_text_to_cells(cells, text_fragments);
322
323 let bbox = self.calculate_table_bbox(&grid)?;
325
326 if bbox.area() < self.config.min_table_area {
328 return Ok(None);
329 }
330
331 let num_rows = grid.rows.len().saturating_sub(1);
333 let num_cols = grid.columns.len().saturating_sub(1);
334
335 let table = DetectedTable::new(bbox, cells_with_text, num_rows, num_cols);
336
337 Ok(Some(table))
338 }
339
340 fn detect_grid_pattern(
342 &self,
343 h_lines: &[&VectorLine],
344 v_lines: &[&VectorLine],
345 ) -> Result<GridPattern, TableDetectionError> {
346 let mut rows = self.cluster_lines_by_position(h_lines, LineOrientation::Horizontal)?;
348
349 let columns = self.cluster_lines_by_position(v_lines, LineOrientation::Vertical)?;
351
352 rows.reverse();
354
355 Ok(GridPattern { rows, columns })
356 }
357
358 fn cluster_lines_by_position(
360 &self,
361 lines: &[&VectorLine],
362 orientation: LineOrientation,
363 ) -> Result<Vec<f64>, TableDetectionError> {
364 if lines.is_empty() {
365 return Ok(vec![]);
366 }
367
368 let mut positions: Vec<f64> = lines
370 .iter()
371 .map(|line| match orientation {
372 LineOrientation::Horizontal => line.y1, LineOrientation::Vertical => line.x1, _ => 0.0,
375 })
376 .collect();
377
378 if positions.iter().any(|p| !p.is_finite()) {
380 return Err(TableDetectionError::InvalidCoordinate);
381 }
382
383 positions.sort_by(|a, b| a.total_cmp(b));
385
386 let mut clusters: Vec<Vec<f64>> = vec![vec![positions[0]]];
388
389 for &pos in &positions[1..] {
390 let last_cluster = clusters.last_mut().ok_or_else(|| {
391 TableDetectionError::InternalError("cluster list unexpectedly empty".to_string())
392 })?;
393 let cluster_mean = last_cluster.iter().sum::<f64>() / last_cluster.len() as f64;
394
395 if (pos - cluster_mean).abs() <= self.config.alignment_tolerance {
396 last_cluster.push(pos);
398 } else {
399 clusters.push(vec![pos]);
401 }
402 }
403
404 Ok(clusters
406 .iter()
407 .map(|cluster| cluster.iter().sum::<f64>() / cluster.len() as f64)
408 .collect())
409 }
410
411 fn create_cells_from_grid(&self, grid: &GridPattern) -> Vec<TableCell> {
413 let mut cells = Vec::new();
414
415 let num_rows = grid.rows.len().saturating_sub(1);
417 let num_cols = grid.columns.len().saturating_sub(1);
418
419 if num_rows == 0 || num_cols == 0 {
420 return cells;
421 }
422
423 for row_idx in 0..num_rows {
425 let y1 = grid.rows[row_idx];
426 let y2 = grid.rows[row_idx + 1];
427
428 let row_y = y1.min(y2);
430 let row_height = (y2 - y1).abs();
431
432 for col_idx in 0..num_cols {
433 let col_x = grid.columns[col_idx];
434 let col_width = (grid.columns[col_idx + 1] - col_x).abs();
435
436 let bbox = BoundingBox::new(col_x, row_y, col_width, row_height);
437 let mut cell = TableCell::new(row_idx, col_idx, bbox);
438 cell.has_borders = true;
439
440 cells.push(cell);
441 }
442 }
443
444 cells
445 }
446
447 fn assign_text_to_cells(
454 &self,
455 mut cells: Vec<TableCell>,
456 text_fragments: &[TextFragment],
457 ) -> Vec<TableCell> {
458 if text_fragments.is_empty() || cells.is_empty() {
459 return cells;
460 }
461
462 let normalized_fragments = normalize_coordinates_if_needed(&cells, text_fragments);
464
465 for cell in &mut cells {
466 let mut cell_texts = Vec::new();
467
468 for fragment in &normalized_fragments {
469 let center_x = fragment.x + fragment.width / 2.0;
471 let center_y = fragment.y + fragment.height / 2.0;
472
473 if cell.bbox.contains_point(center_x, center_y) {
474 cell_texts.push(fragment.text.clone());
475 }
476 }
477
478 if !cell_texts.is_empty() {
479 cell.text = cell_texts.join(" ");
480 }
481 }
482
483 cells
484 }
485
486 fn calculate_table_bbox(&self, grid: &GridPattern) -> Result<BoundingBox, TableDetectionError> {
488 let min_x = *grid
489 .columns
490 .first()
491 .ok_or_else(|| TableDetectionError::InvalidGrid("no columns".to_string()))?;
492 let max_x = *grid
493 .columns
494 .last()
495 .ok_or_else(|| TableDetectionError::InvalidGrid("no columns".to_string()))?;
496
497 let first_y = *grid
499 .rows
500 .first()
501 .ok_or_else(|| TableDetectionError::InvalidGrid("no rows".to_string()))?;
502 let last_y = *grid
503 .rows
504 .last()
505 .ok_or_else(|| TableDetectionError::InvalidGrid("no rows".to_string()))?;
506 let min_y = first_y.min(last_y);
507 let max_y = first_y.max(last_y);
508
509 Ok(BoundingBox::new(min_x, min_y, max_x - min_x, max_y - min_y))
510 }
511}
512
/// Grid line positions recovered from the page's vector lines.
struct GridPattern {
    /// Positions of the horizontal grid lines (y coordinates).
    rows: Vec<f64>,
    /// Positions of the vertical grid lines (x coordinates).
    columns: Vec<f64>,
}
520
521impl Default for TableDetector {
522 fn default() -> Self {
523 Self::new(TableDetectionConfig::default())
524 }
525}
526
#[cfg(test)]
mod tests {
    use super::*;

    /// Interior and edge points are contained; points past any side are not.
    #[test]
    fn test_bounding_box_contains_point() {
        let bbox = BoundingBox::new(100.0, 100.0, 100.0, 50.0);

        let inside = [(150.0, 125.0), (100.0, 100.0), (200.0, 150.0)];
        let outside = [
            (50.0, 125.0),
            (250.0, 125.0),
            (150.0, 50.0),
            (150.0, 200.0),
        ];

        for &(px, py) in inside.iter() {
            assert!(bbox.contains_point(px, py));
        }
        for &(px, py) in outside.iter() {
            assert!(!bbox.contains_point(px, py));
        }
    }

    /// Area is width times height.
    #[test]
    fn test_bounding_box_area() {
        let area = BoundingBox::new(0.0, 0.0, 100.0, 50.0).area();
        assert!((area - 5000.0).abs() < 0.01);
    }

    /// A new cell keeps its indices and starts empty and borderless.
    #[test]
    fn test_table_cell_new() {
        let cell = TableCell::new(1, 2, BoundingBox::new(0.0, 0.0, 50.0, 25.0));

        assert_eq!(cell.row, 1);
        assert_eq!(cell.column, 2);
        assert!(cell.is_empty());
        assert!(!cell.has_borders);
    }

    /// Setting text stores it and makes the cell non-empty.
    #[test]
    fn test_table_cell_set_text() {
        let mut cell = TableCell::new(0, 0, BoundingBox::new(0.0, 0.0, 50.0, 25.0));

        cell.set_text("Test".to_string());

        assert_eq!(cell.text, "Test");
        assert!(!cell.is_empty());
    }

    /// Cell lookup is row-major and rejects out-of-range indices.
    #[test]
    fn test_detected_table_get_cell() {
        let layout = [
            (0, 0, 0.0, 0.0),
            (0, 1, 100.0, 0.0),
            (1, 0, 0.0, 50.0),
            (1, 1, 100.0, 50.0),
        ];
        let cells: Vec<TableCell> = layout
            .iter()
            .map(|&(row, col, x, y)| TableCell::new(row, col, BoundingBox::new(x, y, 100.0, 50.0)))
            .collect();

        let table = DetectedTable::new(BoundingBox::new(0.0, 0.0, 200.0, 100.0), cells, 2, 2);

        assert_eq!(table.row_count(), 2);
        assert_eq!(table.column_count(), 2);

        let cell = table.get_cell(0, 0).expect("cell (0,0) should exist");
        assert_eq!(cell.row, 0);
        assert_eq!(cell.column, 0);

        assert!(table.get_cell(2, 0).is_none());
        assert!(table.get_cell(0, 2).is_none());
    }

    /// The default configuration carries the documented thresholds.
    #[test]
    fn test_table_detection_config_default() {
        let config = TableDetectionConfig::default();

        assert_eq!(config.min_rows, 2);
        assert_eq!(config.min_columns, 2);
        assert_eq!(config.alignment_tolerance, 2.0);
        assert!(!config.detect_borderless);
    }
}
603
604fn normalize_coordinates_if_needed(
617 cells: &[TableCell],
618 text_fragments: &[TextFragment],
619) -> Vec<TextFragment> {
620 let cell_bbox = calculate_combined_bbox_cells(cells);
622 let text_bbox = calculate_combined_bbox_fragments(text_fragments);
623
624 let x_overlap = text_bbox.0 < cell_bbox.2 && text_bbox.2 > cell_bbox.0;
626 let y_overlap = text_bbox.1 < cell_bbox.3 && text_bbox.3 > cell_bbox.1;
627
628 if x_overlap && y_overlap {
630 return text_fragments.to_vec();
631 }
632
633 let text_width = text_bbox.2 - text_bbox.0;
635 let text_height = text_bbox.3 - text_bbox.1;
636 let cell_width = cell_bbox.2 - cell_bbox.0;
637 let cell_height = cell_bbox.3 - cell_bbox.1;
638
639 let scale_x = if text_width > 0.0 {
640 cell_width / text_width
641 } else {
642 1.0
643 };
644 let scale_y = if text_height > 0.0 {
645 cell_height / text_height
646 } else {
647 1.0
648 };
649
650 let translate_x = cell_bbox.0 - (text_bbox.0 * scale_x);
651 let translate_y = cell_bbox.1 - (text_bbox.1 * scale_y);
652
653 text_fragments
655 .iter()
656 .map(|frag| TextFragment {
657 text: frag.text.clone(),
658 x: frag.x * scale_x + translate_x,
659 y: frag.y * scale_y + translate_y,
660 width: frag.width * scale_x,
661 height: frag.height * scale_y,
662 font_size: frag.font_size,
663 font_name: frag.font_name.clone(),
664 is_bold: frag.is_bold,
665 is_italic: frag.is_italic,
666 color: frag.color,
667 space_decisions: Vec::new(),
668 })
669 .collect()
670}
671
672fn calculate_combined_bbox_cells(cells: &[TableCell]) -> (f64, f64, f64, f64) {
674 let min_x = cells.iter().map(|c| c.bbox.x).fold(f64::INFINITY, f64::min);
675 let max_x = cells
676 .iter()
677 .map(|c| c.bbox.right())
678 .fold(f64::NEG_INFINITY, f64::max);
679 let min_y = cells.iter().map(|c| c.bbox.y).fold(f64::INFINITY, f64::min);
680 let max_y = cells
681 .iter()
682 .map(|c| c.bbox.top())
683 .fold(f64::NEG_INFINITY, f64::max);
684 (min_x, min_y, max_x, max_y)
685}
686
687fn calculate_combined_bbox_fragments(fragments: &[TextFragment]) -> (f64, f64, f64, f64) {
689 let min_x = fragments.iter().map(|f| f.x).fold(f64::INFINITY, f64::min);
690 let max_x = fragments
691 .iter()
692 .map(|f| f.x + f.width)
693 .fold(f64::NEG_INFINITY, f64::max);
694 let min_y = fragments.iter().map(|f| f.y).fold(f64::INFINITY, f64::min);
695 let max_y = fragments
696 .iter()
697 .map(|f| f.y + f.height)
698 .fold(f64::NEG_INFINITY, f64::max);
699 (min_x, min_y, max_x, max_y)
700}