mod layout;
mod multi_page;
mod pdf_extractor;
mod storage;
mod types;
pub use layout::{LineSegment, PageLayout, TextBox, cluster_values, extract_pdf_layout};
pub use multi_page::{find_continuation_candidates, merge_multi_page_tables};
pub use pdf_extractor::extract_tables_from_pdf;
pub use storage::{
TABLE_META_KIND, TABLE_ROW_KIND, TABLE_TRACK, export_to_csv, export_to_json, get_table,
list_tables, store_table, store_table_with_embedder,
};
pub use types::{
DetectionMode, ExtractedTable, ExtractionMode, TableCell, TableExtractionOptions,
TableExtractionOptionsBuilder, TableExtractionResult, TableQuality, TableRow, TableSummary,
};
use crate::error::Result;
/// Extracts tables from a document, dispatching on the detected file format.
///
/// The input is treated as a PDF when `filename` ends with `.pdf`
/// (case-insensitively) or when `bytes` begins with the PDF magic marker (see
/// `is_pdf_magic`); in that case the work is delegated to
/// [`extract_tables_from_pdf`] and its result is returned unchanged.
///
/// Every other input currently yields `TableExtractionResult::empty()`.
///
/// # Errors
///
/// Returns whatever error [`extract_tables_from_pdf`] reports. The non-PDF
/// path never fails.
pub fn extract_tables(
    bytes: &[u8],
    filename: &str,
    options: &TableExtractionOptions,
) -> Result<TableExtractionResult> {
    let name = filename.to_lowercase();

    // The extension is checked first so the content sniff only runs when the
    // file name alone does not identify a PDF.
    if name.ends_with(".pdf") || is_pdf_magic(bytes) {
        return extract_tables_from_pdf(bytes, filename, options);
    }

    // Spreadsheet (`.xlsx`/`.xls`), word-processor (`.docx`/`.doc`) and HTML
    // (`.html`/`.htm`) inputs have no extractor wired in here, so they share
    // the same empty result as any unrecognized format.
    Ok(TableExtractionResult::empty())
}
/// Reports whether `bytes` starts with the `%PDF` marker, ignoring leading
/// padding.
///
/// Any run of leading bytes equal to `0xEF`, `0xBB` or `0xBF` (the bytes that
/// make up a UTF-8 byte-order mark) or ASCII whitespace is skipped, in any
/// order and quantity, before the comparison. Empty or all-padding input
/// returns `false`.
fn is_pdf_magic(bytes: &[u8]) -> bool {
    let is_padding = |b: u8| matches!(b, 0xEF | 0xBB | 0xBF) || b.is_ascii_whitespace();

    // Index of the first byte that is not padding; the slice length when the
    // whole input is padding, which leaves an empty remainder to compare.
    let start = bytes
        .iter()
        .position(|&b| !is_padding(b))
        .unwrap_or(bytes.len());

    bytes[start..].starts_with(b"%PDF")
}
#[cfg(test)]
mod tests {
    use super::*;

    /// The magic sniffing accepts a bare marker, a BOM-prefixed marker and a
    /// whitespace-prefixed marker, and rejects ZIP and HTML prefixes.
    #[test]
    fn test_pdf_magic_detection() {
        let accepted: [&[u8]; 3] = [b"%PDF-1.4", b"\xEF\xBB\xBF%PDF-1.7", b" %PDF-1.5"];
        for &sample in &accepted {
            assert!(is_pdf_magic(sample));
        }

        let rejected: [&[u8]; 2] = [b"PK\x03\x04", b"<html>"];
        for &sample in &rejected {
            assert!(!is_pdf_magic(sample));
        }
    }

    /// Every value set through the builder must show up on the built options.
    #[test]
    fn test_extraction_options_builder() {
        let built = TableExtractionOptions::builder()
            .mode(ExtractionMode::LatticeOnly)
            .min_rows(3)
            .min_cols(2)
            .min_quality(TableQuality::High)
            .merge_multi_page(false)
            .max_pages(10)
            .build();

        assert_eq!(built.mode, ExtractionMode::LatticeOnly);
        assert_eq!(built.min_rows, 3);
        assert_eq!(built.min_cols, 2);
        assert_eq!(built.min_quality, TableQuality::High);
        assert!(!built.merge_multi_page);
        assert_eq!(built.max_pages, 10);
    }

    /// Pins the values produced by `TableExtractionOptions::default()`.
    #[test]
    fn test_default_options() {
        let defaults = TableExtractionOptions::default();

        assert_eq!(defaults.mode, ExtractionMode::Conservative);
        assert_eq!(defaults.min_rows, 2);
        assert_eq!(defaults.min_cols, 2);
        assert_eq!(defaults.min_quality, TableQuality::Medium);
        assert!(defaults.merge_multi_page);
        assert_eq!(defaults.max_pages, 0);
    }
}