use super::OxideDocument;
use crate::pdf::error::{PdfError, Result};
use crate::types::{BoundingBox, Table};
pub(crate) fn extract_tables_native(doc: &mut OxideDocument) -> Result<Vec<Table>> {
let page_count = doc
.doc
.page_count()
.map_err(|e| PdfError::MetadataExtractionFailed(format!("pdf_oxide: failed to get page count: {e}")))?;
let config = pdf_oxide::structure::spatial_table_detector::TableDetectionConfig::strict();
let mut all_tables = Vec::new();
for page_idx in 0..page_count {
let extracted = match doc.doc.extract_tables_with_config(page_idx, config.clone()) {
Ok(tables) => tables,
Err(e) => {
tracing::debug!(page = page_idx, "pdf_oxide extract_tables failed: {e}");
continue;
}
};
let page_number = page_idx + 1;
for extracted_table in extracted {
if extracted_table.rows.is_empty() || extracted_table.col_count == 0 {
continue;
}
let (cells, markdown) = convert_extracted_table(&extracted_table);
if cells.is_empty() || markdown.trim().is_empty() {
continue;
}
let bounding_box = extracted_table.bbox.map(|rect| BoundingBox {
x0: rect.x as f64,
y0: rect.y as f64,
x1: (rect.x + rect.width) as f64,
y1: (rect.y + rect.height) as f64,
});
all_tables.push(Table {
cells,
markdown,
page_number,
bounding_box,
});
}
}
Ok(all_tables)
}
fn convert_extracted_table(
table: &pdf_oxide::structure::table_extractor::ExtractedTable,
) -> (Vec<Vec<String>>, String) {
let mut cells: Vec<Vec<String>> = Vec::with_capacity(table.rows.len());
let mut markdown = String::new();
let mut found_header = false;
for (row_idx, row) in table.rows.iter().enumerate() {
let row_cells: Vec<String> = row.cells.iter().map(|cell| cell.text.trim().to_string()).collect();
markdown.push('|');
for cell in &row_cells {
markdown.push(' ');
markdown.push_str(cell);
markdown.push_str(" |");
}
markdown.push('\n');
if row.is_header && !found_header {
found_header = true;
markdown.push('|');
for _ in &row_cells {
markdown.push_str(" --- |");
}
markdown.push('\n');
} else if row_idx == 0 && !found_header {
found_header = true;
markdown.push('|');
for _ in &row_cells {
markdown.push_str(" --- |");
}
markdown.push('\n');
}
cells.push(row_cells);
}
(cells, markdown)
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_convert_extracted_table_basic() {
use pdf_oxide::structure::table_extractor::{ExtractedTable, TableCell, TableRow};
let table = ExtractedTable {
rows: vec![
TableRow {
cells: vec![
TableCell {
text: "Name".to_string(),
colspan: 1,
rowspan: 1,
mcids: vec![],
bbox: None,
is_header: true,
},
TableCell {
text: "Age".to_string(),
colspan: 1,
rowspan: 1,
mcids: vec![],
bbox: None,
is_header: true,
},
],
is_header: true,
},
TableRow {
cells: vec![
TableCell {
text: "Alice".to_string(),
colspan: 1,
rowspan: 1,
mcids: vec![],
bbox: None,
is_header: false,
},
TableCell {
text: "30".to_string(),
colspan: 1,
rowspan: 1,
mcids: vec![],
bbox: None,
is_header: false,
},
],
is_header: false,
},
],
has_header: true,
col_count: 2,
bbox: None,
};
let (cells, markdown) = convert_extracted_table(&table);
assert_eq!(cells.len(), 2);
assert_eq!(cells[0], vec!["Name", "Age"]);
assert_eq!(cells[1], vec!["Alice", "30"]);
assert!(markdown.contains("| Name | Age |"));
assert!(markdown.contains("| --- | --- |"));
assert!(markdown.contains("| Alice | 30 |"));
}
#[test]
fn test_convert_extracted_table_no_header() {
use pdf_oxide::structure::table_extractor::{ExtractedTable, TableCell, TableRow};
let table = ExtractedTable {
rows: vec![
TableRow {
cells: vec![TableCell {
text: "A".to_string(),
colspan: 1,
rowspan: 1,
mcids: vec![],
bbox: None,
is_header: false,
}],
is_header: false,
},
TableRow {
cells: vec![TableCell {
text: "B".to_string(),
colspan: 1,
rowspan: 1,
mcids: vec![],
bbox: None,
is_header: false,
}],
is_header: false,
},
],
has_header: false,
col_count: 1,
bbox: None,
};
let (cells, markdown) = convert_extracted_table(&table);
assert_eq!(cells.len(), 2);
assert!(markdown.contains("| --- |"));
}
#[test]
fn test_convert_extracted_table_empty() {
use pdf_oxide::structure::table_extractor::ExtractedTable;
let table = ExtractedTable {
rows: vec![],
has_header: false,
col_count: 0,
bbox: None,
};
let (cells, markdown) = convert_extracted_table(&table);
assert!(cells.is_empty());
assert!(markdown.is_empty());
}
}