omnivore_core/
table_extractor.rs1use scraper::{Html, Selector, ElementRef};
2use serde::{Deserialize, Serialize};
3
4#[derive(Debug, Clone, Serialize, Deserialize)]
5pub struct TableData {
6 pub headers: Vec<String>,
7 pub rows: Vec<Vec<String>>,
8 pub title: Option<String>,
9 pub caption: Option<String>,
10 pub footnotes: Vec<String>,
11}
12
13impl TableData {
14 pub fn to_csv(&self) -> String {
15 let mut csv = String::new();
16
17 if let Some(ref title) = self.title {
19 csv.push_str(&format!("# {}\n", title));
20 }
21
22 if !self.headers.is_empty() {
24 csv.push_str(&self.headers.join(","));
25 csv.push('\n');
26 }
27
28 for row in &self.rows {
30 let escaped_row: Vec<String> = row.iter().map(|cell| {
31 if cell.contains(',') || cell.contains('"') || cell.contains('\n') {
32 format!("\"{}\"", cell.replace('"', "\"\""))
33 } else {
34 cell.clone()
35 }
36 }).collect();
37 csv.push_str(&escaped_row.join(","));
38 csv.push('\n');
39 }
40
41 if !self.footnotes.is_empty() {
43 csv.push_str("\n# Footnotes:\n");
44 for footnote in &self.footnotes {
45 csv.push_str(&format!("# {}\n", footnote));
46 }
47 }
48
49 csv
50 }
51}
52
/// Extracts tables from HTML documents according to a small set of options.
#[derive(Debug, Clone)]
pub struct TableExtractor {
    /// Minimum number of data rows a parsed table must have to be returned.
    min_rows: usize,
    /// Whether footnote texts are collected while parsing a table.
    extract_footnotes: bool,
}
57
58impl TableExtractor {
59 pub fn new() -> Self {
60 Self {
61 min_rows: 1,
62 extract_footnotes: true,
63 }
64 }
65
66 pub fn extract_tables(&self, html: &str) -> Vec<TableData> {
67 let document = Html::parse_document(html);
68 let mut tables = Vec::new();
69
70 let table_selector = Selector::parse("table").unwrap();
72
73 for (idx, table_element) in document.select(&table_selector).enumerate() {
74 if let Some(table_data) = self.parse_table(table_element, idx) {
75 if table_data.rows.len() >= self.min_rows {
76 tables.push(table_data);
77 }
78 }
79 }
80
81 tables
82 }
83
84 fn parse_table(&self, table: ElementRef, table_idx: usize) -> Option<TableData> {
85 let mut headers = Vec::new();
86 let mut rows = Vec::new();
87 let mut caption = None;
88 let mut title = None;
89 let mut footnotes = Vec::new();
90
91 if let Ok(caption_selector) = Selector::parse("caption") {
93 if let Some(caption_element) = table.select(&caption_selector).next() {
94 caption = Some(self.clean_text(&caption_element.text().collect::<String>()));
95 }
96 }
97
98 let _ = self.find_table_title(table).or(caption.clone());
100
101 if let Ok(thead_selector) = Selector::parse("thead") {
103 if let Some(thead) = table.select(&thead_selector).next() {
104 headers = self.extract_headers_from_element(thead);
105 }
106 }
107
108 if headers.is_empty() {
110 if let Ok(tr_selector) = Selector::parse("tr") {
111 if let Some(first_row) = table.select(&tr_selector).next() {
112 let th_selector = Selector::parse("th").unwrap();
113 let th_count = first_row.select(&th_selector).count();
114
115 if th_count > 0 {
116 headers = self.extract_headers_from_element(first_row);
117 }
118 }
119 }
120 }
121
122 let tbody_selector = Selector::parse("tbody").unwrap_or_else(|_| Selector::parse("*").unwrap());
124 let tbody = table.select(&tbody_selector).next().unwrap_or(table);
125
126 let tr_selector = Selector::parse("tr").unwrap();
127 for row_element in tbody.select(&tr_selector) {
128 let row = self.extract_row(row_element);
129
130 if !headers.is_empty() && row == headers {
132 continue;
133 }
134
135 if !row.is_empty() && row.iter().any(|cell| !cell.trim().is_empty()) {
137 rows.push(row);
138 }
139 }
140
141 if headers.is_empty() && !rows.is_empty() {
143 if self.looks_like_header(&rows[0]) {
145 headers = rows.remove(0);
146 }
147 }
148
149 if self.extract_footnotes {
151 footnotes = self.extract_table_footnotes(table);
152 }
153
154 if title.is_none() && (!headers.is_empty() || !rows.is_empty()) {
156 title = Some(format!("Table {}", table_idx + 1));
157 }
158
159 if !rows.is_empty() || !headers.is_empty() {
160 Some(TableData {
161 headers,
162 rows,
163 title,
164 caption,
165 footnotes,
166 })
167 } else {
168 None
169 }
170 }
171
172 fn extract_headers_from_element(&self, element: ElementRef) -> Vec<String> {
173 let mut headers = Vec::new();
174
175 let th_selector = Selector::parse("th").unwrap();
177 for th in element.select(&th_selector) {
178 headers.push(self.clean_text(&th.text().collect::<String>()));
179 }
180
181 if headers.is_empty() {
183 let td_selector = Selector::parse("td").unwrap();
184 for td in element.select(&td_selector) {
185 headers.push(self.clean_text(&td.text().collect::<String>()));
186 }
187 }
188
189 headers
190 }
191
192 fn extract_row(&self, row: ElementRef) -> Vec<String> {
193 let mut cells = Vec::new();
194
195 let cell_selector = Selector::parse("th, td").unwrap();
197
198 for cell in row.select(&cell_selector) {
199 let text = self.clean_text(&cell.text().collect::<String>());
200
201 let colspan = cell.value().attr("colspan")
203 .and_then(|v| v.parse::<usize>().ok())
204 .unwrap_or(1);
205
206 for _ in 0..colspan {
207 cells.push(text.clone());
208 }
209 }
210
211 cells
212 }
213
214 fn find_table_title(&self, table: ElementRef) -> Option<String> {
215 if let Some(id) = table.value().attr("id") {
219 if !id.is_empty() {
220 return Some(self.humanize_identifier(id));
221 }
222 }
223
224 if let Some(class) = table.value().attr("class") {
225 if class.contains("admissions") || class.contains("scores") || class.contains("demographics") {
226 return Some(self.humanize_identifier(class));
227 }
228 }
229
230 None
233 }
234
235 fn extract_table_footnotes(&self, table: ElementRef) -> Vec<String> {
236 let mut footnotes = Vec::new();
237
238 let footnote_selectors = vec![
240 ".footnote",
241 ".table-footnote",
242 "tfoot",
243 "tr.footnote",
244 "td[colspan]",
245 ];
246
247 for selector_str in footnote_selectors {
248 if let Ok(selector) = Selector::parse(selector_str) {
249 for element in table.select(&selector) {
250 let text = self.clean_text(&element.text().collect::<String>());
251
252 if text.starts_with('*') || text.starts_with('†') || text.starts_with('‡')
254 || text.starts_with('§') || text.starts_with('¶') || text.starts_with('#')
255 || text.chars().next().map_or(false, |c| c.is_ascii_digit()) {
256
257 if !text.is_empty() && text.len() > 5 { footnotes.push(text);
259 }
260 }
261 }
262 }
263 }
264
265 let tr_selector = Selector::parse("tr").unwrap();
267 for row in table.select(&tr_selector) {
268 let td_selector = Selector::parse("td").unwrap();
269 let cells: Vec<_> = row.select(&td_selector).collect();
270
271 if cells.len() == 1 {
272 if let Some(cell) = cells.first() {
273 if let Some(colspan) = cell.value().attr("colspan") {
274 if colspan.parse::<usize>().unwrap_or(1) > 3 {
275 let text = self.clean_text(&cell.text().collect::<String>());
276 if text.len() > 10 && !self.looks_like_header(&[text.clone()]) {
277 footnotes.push(text);
278 }
279 }
280 }
281 }
282 }
283 }
284
285 footnotes
286 }
287
288 fn looks_like_header(&self, row: &[String]) -> bool {
289 for cell in row {
291 let lower = cell.to_lowercase();
292
293 if lower.contains("year") || lower.contains("total") || lower.contains("count")
295 || lower.contains("name") || lower.contains("date") || lower.contains("score")
296 || lower.contains("gpa") || lower.contains("average") || lower.contains("median")
297 || lower.contains("applications") || lower.contains("accepts") || lower.contains("offers") {
298 return true;
299 }
300
301 if cell.chars().all(|c| c.is_ascii_digit() || c == '.' || c == ',' || c == '%') {
303 return false;
304 }
305 }
306
307 let short_caps = row.iter().filter(|cell| {
309 cell.len() < 20 && cell.chars().next().map_or(false, |c| c.is_uppercase())
310 }).count();
311
312 short_caps > row.len() / 2
313 }
314
315 fn clean_text(&self, text: &str) -> String {
316 text.trim()
317 .replace('\n', " ")
318 .replace('\t', " ")
319 .split_whitespace()
320 .collect::<Vec<_>>()
321 .join(" ")
322 }
323
324 fn humanize_identifier(&self, identifier: &str) -> String {
325 identifier
326 .replace('_', " ")
327 .replace('-', " ")
328 .split_whitespace()
329 .map(|word| {
330 let mut chars = word.chars();
331 match chars.next() {
332 None => String::new(),
333 Some(first) => first.to_uppercase().collect::<String>() + chars.as_str(),
334 }
335 })
336 .collect::<Vec<_>>()
337 .join(" ")
338 }
339}