// omnivore_core/table_extractor.rs

use scraper::{ElementRef, Html, Selector};
use serde::{Deserialize, Serialize};
3
4#[derive(Debug, Clone, Serialize, Deserialize)]
5pub struct TableData {
6    pub headers: Vec<String>,
7    pub rows: Vec<Vec<String>>,
8    pub title: Option<String>,
9    pub caption: Option<String>,
10    pub footnotes: Vec<String>,
11}
12
13impl TableData {
14    pub fn to_csv(&self) -> String {
15        let mut csv = String::new();
16        
17        // Add title as comment if present
18        if let Some(ref title) = self.title {
19            csv.push_str(&format!("# {}\n", title));
20        }
21        
22        // Add headers
23        if !self.headers.is_empty() {
24            csv.push_str(&self.headers.join(","));
25            csv.push('\n');
26        }
27        
28        // Add rows
29        for row in &self.rows {
30            let escaped_row: Vec<String> = row.iter().map(|cell| {
31                if cell.contains(',') || cell.contains('"') || cell.contains('\n') {
32                    format!("\"{}\"", cell.replace('"', "\"\""))
33                } else {
34                    cell.clone()
35                }
36            }).collect();
37            csv.push_str(&escaped_row.join(","));
38            csv.push('\n');
39        }
40        
41        // Add footnotes as comments
42        if !self.footnotes.is_empty() {
43            csv.push_str("\n# Footnotes:\n");
44            for footnote in &self.footnotes {
45                csv.push_str(&format!("# {}\n", footnote));
46            }
47        }
48        
49        csv
50    }
51}
52
/// Extracts tables from HTML into `TableData` values.
#[derive(Debug, Clone)]
pub struct TableExtractor {
    /// Minimum number of data rows a table needs in order to be returned.
    min_rows: usize,
    /// Whether footnotes are collected for each table.
    extract_footnotes: bool,
}
57
58impl TableExtractor {
59    pub fn new() -> Self {
60        Self {
61            min_rows: 1,
62            extract_footnotes: true,
63        }
64    }
65    
66    pub fn extract_tables(&self, html: &str) -> Vec<TableData> {
67        let document = Html::parse_document(html);
68        let mut tables = Vec::new();
69        
70        // Find all table elements
71        let table_selector = Selector::parse("table").unwrap();
72        
73        for (idx, table_element) in document.select(&table_selector).enumerate() {
74            if let Some(table_data) = self.parse_table(table_element, idx) {
75                if table_data.rows.len() >= self.min_rows {
76                    tables.push(table_data);
77                }
78            }
79        }
80        
81        tables
82    }
83    
84    fn parse_table(&self, table: ElementRef, table_idx: usize) -> Option<TableData> {
85        let mut headers = Vec::new();
86        let mut rows = Vec::new();
87        let mut caption = None;
88        let mut title = None;
89        let mut footnotes = Vec::new();
90        
91        // Extract caption
92        if let Ok(caption_selector) = Selector::parse("caption") {
93            if let Some(caption_element) = table.select(&caption_selector).next() {
94                caption = Some(self.clean_text(&caption_element.text().collect::<String>()));
95            }
96        }
97        
98        // Look for title in previous sibling or parent
99        let _ = self.find_table_title(table).or(caption.clone());
100        
101        // Extract headers from thead or first row with th elements
102        if let Ok(thead_selector) = Selector::parse("thead") {
103            if let Some(thead) = table.select(&thead_selector).next() {
104                headers = self.extract_headers_from_element(thead);
105            }
106        }
107        
108        // If no thead, look for th elements in first row
109        if headers.is_empty() {
110            if let Ok(tr_selector) = Selector::parse("tr") {
111                if let Some(first_row) = table.select(&tr_selector).next() {
112                    let th_selector = Selector::parse("th").unwrap();
113                    let th_count = first_row.select(&th_selector).count();
114                    
115                    if th_count > 0 {
116                        headers = self.extract_headers_from_element(first_row);
117                    }
118                }
119            }
120        }
121        
122        // Extract data rows
123        let tbody_selector = Selector::parse("tbody").unwrap_or_else(|_| Selector::parse("*").unwrap());
124        let tbody = table.select(&tbody_selector).next().unwrap_or(table);
125        
126        let tr_selector = Selector::parse("tr").unwrap();
127        for row_element in tbody.select(&tr_selector) {
128            let row = self.extract_row(row_element);
129            
130            // Skip if this looks like a header row we already processed
131            if !headers.is_empty() && row == headers {
132                continue;
133            }
134            
135            // Skip empty rows
136            if !row.is_empty() && row.iter().any(|cell| !cell.trim().is_empty()) {
137                rows.push(row);
138            }
139        }
140        
141        // If no explicit headers found but we have rows, use first row as headers
142        if headers.is_empty() && !rows.is_empty() {
143            // Check if first row looks like headers (no numbers, common header words)
144            if self.looks_like_header(&rows[0]) {
145                headers = rows.remove(0);
146            }
147        }
148        
149        // Extract footnotes
150        if self.extract_footnotes {
151            footnotes = self.extract_table_footnotes(table);
152        }
153        
154        // Generate default title if none found
155        if title.is_none() && (!headers.is_empty() || !rows.is_empty()) {
156            title = Some(format!("Table {}", table_idx + 1));
157        }
158        
159        if !rows.is_empty() || !headers.is_empty() {
160            Some(TableData {
161                headers,
162                rows,
163                title,
164                caption,
165                footnotes,
166            })
167        } else {
168            None
169        }
170    }
171    
172    fn extract_headers_from_element(&self, element: ElementRef) -> Vec<String> {
173        let mut headers = Vec::new();
174        
175        // Try th elements first
176        let th_selector = Selector::parse("th").unwrap();
177        for th in element.select(&th_selector) {
178            headers.push(self.clean_text(&th.text().collect::<String>()));
179        }
180        
181        // If no th elements, try td elements
182        if headers.is_empty() {
183            let td_selector = Selector::parse("td").unwrap();
184            for td in element.select(&td_selector) {
185                headers.push(self.clean_text(&td.text().collect::<String>()));
186            }
187        }
188        
189        headers
190    }
191    
192    fn extract_row(&self, row: ElementRef) -> Vec<String> {
193        let mut cells = Vec::new();
194        
195        // Extract both th and td elements (some tables use th in data rows)
196        let cell_selector = Selector::parse("th, td").unwrap();
197        
198        for cell in row.select(&cell_selector) {
199            let text = self.clean_text(&cell.text().collect::<String>());
200            
201            // Handle colspan by duplicating the value
202            let colspan = cell.value().attr("colspan")
203                .and_then(|v| v.parse::<usize>().ok())
204                .unwrap_or(1);
205            
206            for _ in 0..colspan {
207                cells.push(text.clone());
208            }
209        }
210        
211        cells
212    }
213    
214    fn find_table_title(&self, table: ElementRef) -> Option<String> {
215        // Look for common title patterns near the table
216        
217        // Check for id or class attributes that might indicate the table's purpose
218        if let Some(id) = table.value().attr("id") {
219            if !id.is_empty() {
220                return Some(self.humanize_identifier(id));
221            }
222        }
223        
224        if let Some(class) = table.value().attr("class") {
225            if class.contains("admissions") || class.contains("scores") || class.contains("demographics") {
226                return Some(self.humanize_identifier(class));
227            }
228        }
229        
230        // Look for heading above the table (simplified approach)
231        // In a real implementation, we'd traverse the DOM properly
232        None
233    }
234    
235    fn extract_table_footnotes(&self, table: ElementRef) -> Vec<String> {
236        let mut footnotes = Vec::new();
237        
238        // Look for common footnote patterns
239        let footnote_selectors = vec![
240            ".footnote",
241            ".table-footnote",
242            "tfoot",
243            "tr.footnote",
244            "td[colspan]",
245        ];
246        
247        for selector_str in footnote_selectors {
248            if let Ok(selector) = Selector::parse(selector_str) {
249                for element in table.select(&selector) {
250                    let text = self.clean_text(&element.text().collect::<String>());
251                    
252                    // Check if it looks like a footnote (starts with *, †, ‡, §, ¶, #, or number)
253                    if text.starts_with('*') || text.starts_with('†') || text.starts_with('‡') 
254                        || text.starts_with('§') || text.starts_with('¶') || text.starts_with('#')
255                        || text.chars().next().map_or(false, |c| c.is_ascii_digit()) {
256                        
257                        if !text.is_empty() && text.len() > 5 { // Minimum footnote length
258                            footnotes.push(text);
259                        }
260                    }
261                }
262            }
263        }
264        
265        // Also check for cells that span all columns (often used for footnotes)
266        let tr_selector = Selector::parse("tr").unwrap();
267        for row in table.select(&tr_selector) {
268            let td_selector = Selector::parse("td").unwrap();
269            let cells: Vec<_> = row.select(&td_selector).collect();
270            
271            if cells.len() == 1 {
272                if let Some(cell) = cells.first() {
273                    if let Some(colspan) = cell.value().attr("colspan") {
274                        if colspan.parse::<usize>().unwrap_or(1) > 3 {
275                            let text = self.clean_text(&cell.text().collect::<String>());
276                            if text.len() > 10 && !self.looks_like_header(&[text.clone()]) {
277                                footnotes.push(text);
278                            }
279                        }
280                    }
281                }
282            }
283        }
284        
285        footnotes
286    }
287    
288    fn looks_like_header(&self, row: &[String]) -> bool {
289        // Check if row looks like headers
290        for cell in row {
291            let lower = cell.to_lowercase();
292            
293            // Common header keywords
294            if lower.contains("year") || lower.contains("total") || lower.contains("count")
295                || lower.contains("name") || lower.contains("date") || lower.contains("score")
296                || lower.contains("gpa") || lower.contains("average") || lower.contains("median")
297                || lower.contains("applications") || lower.contains("accepts") || lower.contains("offers") {
298                return true;
299            }
300            
301            // If it's all numbers, probably not a header
302            if cell.chars().all(|c| c.is_ascii_digit() || c == '.' || c == ',' || c == '%') {
303                return false;
304            }
305        }
306        
307        // If most cells are short and capitalized, probably headers
308        let short_caps = row.iter().filter(|cell| {
309            cell.len() < 20 && cell.chars().next().map_or(false, |c| c.is_uppercase())
310        }).count();
311        
312        short_caps > row.len() / 2
313    }
314    
315    fn clean_text(&self, text: &str) -> String {
316        text.trim()
317            .replace('\n', " ")
318            .replace('\t', " ")
319            .split_whitespace()
320            .collect::<Vec<_>>()
321            .join(" ")
322    }
323    
324    fn humanize_identifier(&self, identifier: &str) -> String {
325        identifier
326            .replace('_', " ")
327            .replace('-', " ")
328            .split_whitespace()
329            .map(|word| {
330                let mut chars = word.chars();
331                match chars.next() {
332                    None => String::new(),
333                    Some(first) => first.to_uppercase().collect::<String>() + chars.as_str(),
334                }
335            })
336            .collect::<Vec<_>>()
337            .join(" ")
338    }
339}